#!/usr/bin/env python3
"""
YouTube Watchlist Scraper
==========================
Uses Playwright to scrape YouTube Watch History and Watch Later playlist
from an authenticated browser session (sunvision07@gmail.com).

This is the BROWSER-FIRST strategy - it expects the browser to already be
authenticated. It works with the Genesis Collaborative Browser Handoff protocol:
  1. Human logs into sunvision07@gmail.com in Chrome
  2. This script attaches to that session (or uses persistent profile)
  3. Scrapes Watch History + Watch Later
  4. Persists to PostgreSQL youtube_intel table

Usage:
    # Scrape both Watch History and Watch Later (default)
    python youtube_watchlist_scraper.py

    # Scrape only Watch History (recent 50 videos)
    python youtube_watchlist_scraper.py --source history --limit 50

    # Scrape only Watch Later
    python youtube_watchlist_scraper.py --source watchlater

    # Use a specific browser profile directory (for persistent auth)
    python youtube_watchlist_scraper.py --profile-dir "E:/genesis-system/config/browser_profile_yt"

    # Output to JSON file only (no DB write)
    python youtube_watchlist_scraper.py --dry-run --output-file scraped_videos.json

    # Connect to existing Chrome session via CDP
    python youtube_watchlist_scraper.py --cdp-url http://localhost:9222

Author: Genesis System
Version: 1.0.0
Date: 2026-02-23
"""

import argparse
import json
import logging
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional

# ---------------------------------------------------------------------------
# Genesis path setup
# ---------------------------------------------------------------------------
# Root of the Genesis deployment; genesis-memory is appended to sys.path so
# elestio_config (imported lazily in get_db_connection) can be resolved.
GENESIS_ROOT = Path("E:/genesis-system")
sys.path.insert(0, str(GENESIS_ROOT / "data" / "genesis-memory"))

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("yt_watchlist_scraper")

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
YOUTUBE_HISTORY_URL = "https://www.youtube.com/feed/history"
# "list=WL" is YouTube's fixed playlist ID for the signed-in user's Watch Later.
YOUTUBE_WATCH_LATER_URL = "https://www.youtube.com/playlist?list=WL"
DEFAULT_PROFILE_DIR = str(GENESIS_ROOT / "config" / "browser_profile_yt")
DEFAULT_HISTORY_LIMIT = 100  # default cap on Watch History videos per run
DEFAULT_WL_LIMIT = 200       # default cap on Watch Later videos per run

# ---------------------------------------------------------------------------
# Database layer
# ---------------------------------------------------------------------------

def get_db_connection():
    """Open and return a new connection to the Elestio PostgreSQL instance.

    Imports are deferred so the scraper can run in --dry-run mode on
    machines without psycopg2 or the Genesis config package installed.
    """
    import psycopg2
    from elestio_config import PostgresConfig

    params = PostgresConfig.get_connection_params()
    return psycopg2.connect(**params)


def ensure_youtube_intel_schema(conn) -> None:
    """Create youtube_intel table if it does not exist.

    Idempotent: uses IF NOT EXISTS for both the table and its indexes,
    so it is safe to call on every run. Commits on success.

    Args:
        conn: Open psycopg2 connection (multi-statement execute is allowed).
    """
    # UNIQUE (video_id, source) lets the same video appear once per list
    # (watch_history vs watch_later) and is the ON CONFLICT target for upserts.
    sql = """
        CREATE TABLE IF NOT EXISTS youtube_intel (
            id            SERIAL PRIMARY KEY,
            video_id      VARCHAR(20)  NOT NULL,
            title         TEXT         NOT NULL,
            channel_name  TEXT,
            channel_url   TEXT,
            video_url     TEXT,
            source        VARCHAR(30)  NOT NULL,  -- 'watch_history' | 'watch_later'
            position      INTEGER,                -- order in list (1 = most recent/top)
            thumbnail_url TEXT,
            duration_text TEXT,                   -- raw text like "12:34" or "1:02:45"
            scraped_at    TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
            transcript_ingested BOOLEAN DEFAULT FALSE,
            UNIQUE (video_id, source)
        );

        CREATE INDEX IF NOT EXISTS idx_yt_intel_video_id
            ON youtube_intel (video_id);
        CREATE INDEX IF NOT EXISTS idx_yt_intel_source
            ON youtube_intel (source);
        CREATE INDEX IF NOT EXISTS idx_yt_intel_scraped_at
            ON youtube_intel (scraped_at);
        CREATE INDEX IF NOT EXISTS idx_yt_intel_ingested
            ON youtube_intel (transcript_ingested);
    """
    with conn.cursor() as cur:
        cur.execute(sql)
    conn.commit()
    logger.info("youtube_intel schema ready.")


def upsert_video_records(conn, records: List[Dict[str, Any]]) -> int:
    """Upsert video records into youtube_intel. Returns count of new rows.

    Each record runs inside its own SAVEPOINT so that one failing row only
    rolls back that row. The previous implementation called
    ``conn.rollback()`` on failure, which silently discarded every earlier
    successful upsert in the same batch transaction.

    Args:
        conn:    Open psycopg2 connection.
        records: Dicts whose keys match the INSERT column placeholders.

    Returns:
        Number of rows that were newly inserted (as opposed to updated).
    """
    if not records:
        return 0

    sql = """
        INSERT INTO youtube_intel
            (video_id, title, channel_name, channel_url, video_url,
             source, position, thumbnail_url, duration_text)
        VALUES
            (%(video_id)s, %(title)s, %(channel_name)s, %(channel_url)s,
             %(video_url)s, %(source)s, %(position)s, %(thumbnail_url)s,
             %(duration_text)s)
        ON CONFLICT (video_id, source) DO UPDATE SET
            title         = EXCLUDED.title,
            channel_name  = EXCLUDED.channel_name,
            position      = EXCLUDED.position,
            scraped_at    = NOW()
        RETURNING (xmax = 0) AS is_new
    """
    # (xmax = 0) is a PostgreSQL system-column trick: it is TRUE for rows
    # created by this INSERT and FALSE for rows touched by the DO UPDATE arm.

    new_count = 0
    with conn.cursor() as cur:
        for rec in records:
            try:
                # Scope each row to a savepoint so a bad row doesn't poison
                # the whole batch transaction.
                cur.execute("SAVEPOINT yt_upsert")
                cur.execute(sql, rec)
                row = cur.fetchone()
                if row and row[0]:
                    new_count += 1
                cur.execute("RELEASE SAVEPOINT yt_upsert")
            except Exception as e:
                logger.warning(f"Upsert failed for {rec.get('video_id')}: {e}")
                # Undo only this record; earlier successes stay pending
                # until the final commit below.
                cur.execute("ROLLBACK TO SAVEPOINT yt_upsert")
                continue
    conn.commit()
    logger.info(f"Upserted {len(records)} records ({new_count} new).")
    return new_count


def get_pending_video_ids(conn, source: Optional[str] = None, limit: int = 500) -> List[Dict]:
    """Fetch videos whose transcripts have not been ingested yet.

    Watch Later entries sort ahead of Watch History, then by list position
    ascending with NULLs last — i.e. highest-priority videos come first.

    Args:
        conn:   Open psycopg2 connection.
        source: Optional filter ('watch_history' or 'watch_later').
        limit:  Maximum number of rows to return.

    Returns:
        List of row dicts keyed by column name.
    """
    where_parts = ["transcript_ingested = FALSE"]
    params: List[Any] = []

    if source:
        where_parts.append("source = %s")
        params.append(source)

    sql = f"""
        SELECT video_id, title, channel_name, source, position, video_url
        FROM youtube_intel
        WHERE {" AND ".join(where_parts)}
        ORDER BY
            CASE source WHEN 'watch_later' THEN 0 ELSE 1 END,
            position ASC NULLS LAST
        LIMIT %s
    """
    params.append(limit)

    with conn.cursor() as cur:
        cur.execute(sql, params)
        columns = [desc[0] for desc in cur.description]
        return [dict(zip(columns, values)) for values in cur.fetchall()]


# ---------------------------------------------------------------------------
# URL helpers
# ---------------------------------------------------------------------------

def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character YouTube video ID from any common URL form.

    Handles watch, youtu.be, embed, legacy /v/, shorts, and live URLs.

    Args:
        url: Candidate URL (may be relative, absolute, or empty).

    Returns:
        The 11-char video ID, or None when *url* is empty/unrecognized.
    """
    if not url:
        return None
    # Video IDs are exactly 11 chars drawn from [A-Za-z0-9_-].
    patterns = [
        r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})",
        r"youtube\.com/v/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/shorts/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/live/([a-zA-Z0-9_-]{11})",  # live-stream permalinks
    ]
    for pat in patterns:
        m = re.search(pat, url)
        if m:
            return m.group(1)
    return None


def build_video_url(video_id: str) -> str:
    """Return the canonical watch-page URL for *video_id*."""
    return "https://www.youtube.com/watch?v=" + video_id


def build_thumbnail_url(video_id: str) -> str:
    """Return the standard high-quality thumbnail URL for *video_id*."""
    return "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(video_id)


# ---------------------------------------------------------------------------
# Playwright scraping helpers
# ---------------------------------------------------------------------------

def _scroll_page(page, times: int = 5, delay_ms: int = 1800) -> None:
    """Scroll the page to the bottom *times* times to trigger lazy-loading.

    Args:
        page:     Playwright page.
        times:    Number of scroll passes.
        delay_ms: Pause after each pass so new items can render.
    """
    for attempt in range(1, times + 1):
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(delay_ms)
        logger.debug(f"Scroll {attempt}/{times} complete.")


def _wait_for_youtube_content(page, timeout_ms: int = 15000) -> bool:
    """Wait until YouTube video renderer elements appear."""
    try:
        page.wait_for_selector(
            "ytd-video-renderer, ytd-playlist-video-renderer, ytd-compact-video-renderer",
            timeout=timeout_ms,
        )
        return True
    except Exception:
        logger.warning("Timed out waiting for YouTube video renderers.")
        return False


def _check_logged_in(page) -> bool:
    """Return True if the page is showing a logged-in YouTube state."""
    # If redirected to accounts.google.com, we're logged out
    if "accounts.google.com" in page.url:
        return False
    # Try to find the avatar button (logged-in indicator)
    avatar = page.query_selector("button#avatar-btn, yt-img-shadow#avatar")
    return avatar is not None


# ---------------------------------------------------------------------------
# Watch History scraper
# ---------------------------------------------------------------------------

def scrape_watch_history(page, limit: int = DEFAULT_HISTORY_LIMIT) -> List[Dict[str, Any]]:
    """
    Scrape YouTube Watch History from youtube.com/feed/history.

    Args:
        page:  Playwright page attached to an authenticated session.
        limit: Maximum number of videos to extract.

    Returns:
        List of video dicts (position 1 = most recently watched, per page
        order). Empty list when the session is not logged in.
    """
    logger.info(f"Navigating to Watch History (limit={limit})...")
    page.goto(YOUTUBE_HISTORY_URL, wait_until="domcontentloaded", timeout=30000)
    page.wait_for_timeout(3000)  # let the SPA hydrate before probing the DOM

    if not _check_logged_in(page):
        logger.error(
            "Not logged in. Please authenticate in the browser first.\n"
            "Use --profile-dir with a directory where sunvision07@gmail.com is logged in."
        )
        return []

    # Check for 'history is off' message — we still fall through and scrape
    # whatever is visible; the warning is advisory only.
    history_off = page.query_selector("yt-empty-state-renderer, #empty-state")
    if history_off:
        # hasattr guard: defensive in case the handle lacks inner_text.
        msg = history_off.inner_text() if hasattr(history_off, 'inner_text') else ""
        if "history is off" in msg.lower() or "paused" in msg.lower():
            logger.warning("Watch history appears to be paused. Enable it at myaccount.google.com.")

    # Scroll enough to load 'limit' videos (each scroll loads ~20-30)
    scrolls_needed = max(3, limit // 25 + 2)
    _scroll_page(page, times=scrolls_needed, delay_ms=1500)

    # Wait for renderers (result ignored: an empty page just yields 0 records)
    _wait_for_youtube_content(page)

    # Extract all video-renderer elements
    records: List[Dict[str, Any]] = []
    renderers = page.query_selector_all("ytd-video-renderer")

    logger.info(f"Found {len(renderers)} video renderers on Watch History page.")

    for position, renderer in enumerate(renderers[:limit], start=1):
        try:
            # Video link + ID — primary selector first, looser fallback second
            link_el = renderer.query_selector("a#video-title")
            if not link_el:
                link_el = renderer.query_selector("a.ytd-video-renderer")
            if not link_el:
                continue

            href = link_el.get_attribute("href") or ""
            video_id = extract_video_id(href)
            if not video_id:
                # Try data-video-id attribute
                video_id = renderer.get_attribute("data-video-id")
            if not video_id:
                # Without a video ID the row is useless — skip it.
                continue

            # Prefer the title attribute (full text); inner_text may be truncated
            title = (link_el.get_attribute("title") or link_el.inner_text() or "").strip()

            # Channel name — several selector variants cover YouTube's
            # shifting DOM (channel pages vs @handle links)
            channel_name = ""
            channel_url_raw = ""
            channel_el = renderer.query_selector(
                "ytd-channel-name a, .ytd-channel-name a, a.yt-simple-endpoint[href*='/channel/'], "
                "a.yt-simple-endpoint[href*='/@']"
            )
            if channel_el:
                channel_name = channel_el.inner_text().strip()
                channel_url_raw = channel_el.get_attribute("href") or ""

            # Thumbnail — fall back to the predictable i.ytimg.com URL when
            # the <img> src is missing or a lazy-load data: placeholder
            thumb_el = renderer.query_selector("img.yt-img-shadow, ytd-thumbnail img")
            thumbnail_url = ""
            if thumb_el:
                thumbnail_url = (
                    thumb_el.get_attribute("src")
                    or thumb_el.get_attribute("data-thumb")
                    or build_thumbnail_url(video_id)
                )
            if not thumbnail_url or thumbnail_url.startswith("data:"):
                thumbnail_url = build_thumbnail_url(video_id)

            # Duration badge (raw text like "12:34"; not parsed here)
            duration_text = ""
            duration_el = renderer.query_selector(
                "span.ytd-thumbnail-overlay-time-status-renderer, "
                "ytd-thumbnail-overlay-time-status-renderer span"
            )
            if duration_el:
                duration_text = duration_el.inner_text().strip()

            records.append({
                "video_id": video_id,
                "title": title,
                "channel_name": channel_name,
                "channel_url": channel_url_raw,
                "video_url": build_video_url(video_id),
                "source": "watch_history",
                "position": position,
                "thumbnail_url": thumbnail_url,
                "duration_text": duration_text,
            })

        except Exception as e:
            # Best-effort parse: one bad renderer must not abort the scrape.
            logger.debug(f"Error parsing renderer at position {position}: {e}")
            continue

    logger.info(f"Scraped {len(records)} videos from Watch History.")
    return records


# ---------------------------------------------------------------------------
# Watch Later scraper
# ---------------------------------------------------------------------------

def scrape_watch_later(page, limit: int = DEFAULT_WL_LIMIT) -> List[Dict[str, Any]]:
    """
    Scrape YouTube Watch Later playlist from youtube.com/playlist?list=WL.

    Args:
        page:  Playwright page attached to an authenticated session.
        limit: Maximum number of playlist items to extract.

    Returns:
        List of video dicts, position 1 = top of queue (highest priority).
        Empty list when logged out or the playlist is empty.
    """
    logger.info(f"Navigating to Watch Later playlist (limit={limit})...")
    page.goto(YOUTUBE_WATCH_LATER_URL, wait_until="domcontentloaded", timeout=30000)
    page.wait_for_timeout(3000)  # let the SPA hydrate before probing the DOM

    if not _check_logged_in(page):
        logger.error("Not logged in. Cannot access Watch Later playlist.")
        return []

    # Watch Later may show an empty state or a private playlist message
    empty_el = page.query_selector("yt-empty-state-renderer")
    if empty_el:
        logger.info("Watch Later playlist appears to be empty.")
        return []

    # Scroll to load all items (playlist pages load ~20 items per pass)
    scrolls_needed = max(3, limit // 20 + 2)
    _scroll_page(page, times=scrolls_needed, delay_ms=1500)

    # Wait for playlist renderers; absence is treated as an empty playlist
    try:
        page.wait_for_selector("ytd-playlist-video-renderer", timeout=10000)
    except Exception:
        logger.warning("No playlist video renderers found. Watch Later may be empty.")
        return []

    renderers = page.query_selector_all("ytd-playlist-video-renderer")
    logger.info(f"Found {len(renderers)} video renderers in Watch Later playlist.")

    records: List[Dict[str, Any]] = []

    for position, renderer in enumerate(renderers[:limit], start=1):
        try:
            # Title link — carries both the href and the full title attribute
            title_el = renderer.query_selector("a#video-title")
            if not title_el:
                continue

            href = title_el.get_attribute("href") or ""
            video_id = extract_video_id(href)
            if not video_id:
                # Strip playlist params from URL and retry the extraction
                clean_href = href.split("&")[0] if "&" in href else href
                video_id = extract_video_id(clean_href)
            if not video_id:
                continue

            # Prefer the title attribute; inner_text may be truncated
            title = (title_el.get_attribute("title") or title_el.inner_text() or "").strip()

            # Channel name — multiple selector variants for YouTube's DOM
            channel_name = ""
            channel_url_raw = ""
            channel_el = renderer.query_selector(
                "ytd-channel-name a, .ytd-channel-name a, "
                "a.yt-simple-endpoint[href*='/channel/'], a.yt-simple-endpoint[href*='/@']"
            )
            if channel_el:
                channel_name = channel_el.inner_text().strip()
                channel_url_raw = channel_el.get_attribute("href") or ""

            # Thumbnail — default to the predictable i.ytimg.com URL and only
            # override with a real (non data: placeholder) src
            thumb_el = renderer.query_selector("img.yt-img-shadow, ytd-thumbnail img")
            thumbnail_url = build_thumbnail_url(video_id)
            if thumb_el:
                src = thumb_el.get_attribute("src") or ""
                if src and not src.startswith("data:"):
                    thumbnail_url = src

            # Duration badge (raw text like "12:34"; not parsed here)
            duration_text = ""
            dur_el = renderer.query_selector(
                "span.ytd-thumbnail-overlay-time-status-renderer, "
                "ytd-thumbnail-overlay-time-status-renderer span"
            )
            if dur_el:
                duration_text = dur_el.inner_text().strip()

            records.append({
                "video_id": video_id,
                "title": title,
                "channel_name": channel_name,
                "channel_url": channel_url_raw,
                "video_url": build_video_url(video_id),
                "source": "watch_later",
                "position": position,
                "thumbnail_url": thumbnail_url,
                "duration_text": duration_text,
            })

        except Exception as e:
            # Best-effort parse: skip malformed entries, keep scraping.
            logger.debug(f"Error parsing WL renderer at position {position}: {e}")
            continue

    logger.info(f"Scraped {len(records)} videos from Watch Later.")
    return records


# ---------------------------------------------------------------------------
# Main runner
# ---------------------------------------------------------------------------

def run_scraper(
    sources: List[str],
    history_limit: int = DEFAULT_HISTORY_LIMIT,
    wl_limit: int = DEFAULT_WL_LIMIT,
    profile_dir: str = DEFAULT_PROFILE_DIR,
    cdp_url: Optional[str] = None,
    dry_run: bool = False,
    output_file: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Run the watchlist scraper and persist results.

    Browser acquisition order: CDP attach (if cdp_url given) with fallback
    to a persistent Chromium profile launch. JSON output (if requested) is
    written before the DB step, so a DB failure never loses scraped data.

    Args:
        sources:       List of 'history' and/or 'watchlater'
        history_limit: Max videos to scrape from Watch History
        wl_limit:      Max videos to scrape from Watch Later
        profile_dir:   Chromium persistent profile with Google login
        cdp_url:       CDP URL for connecting to existing Chrome (e.g. http://localhost:9222)
        dry_run:       If True, print results without storing to DB
        output_file:   Optional path to save JSON output

    Returns:
        Summary dict (totals, per-source counts, video IDs, timestamp).
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        logger.error(
            "Playwright is required.\n"
            "Install with: pip install playwright && playwright install chromium"
        )
        sys.exit(1)

    # Ensure profile directory exists
    Path(profile_dir).mkdir(parents=True, exist_ok=True)

    all_records: List[Dict[str, Any]] = []

    with sync_playwright() as p:
        if cdp_url:
            # Connect to existing Chrome browser (CDP)
            logger.info(f"Connecting to existing Chrome via CDP: {cdp_url}")
            try:
                browser = p.chromium.connect_over_cdp(cdp_url)
                # Reuse the first existing context/page where possible.
                context = browser.contexts[0] if browser.contexts else browser.new_context()
                page = context.pages[0] if context.pages else context.new_page()
            except Exception as e:
                logger.error(f"CDP connection failed: {e}")
                logger.info("Falling back to persistent profile launch...")
                # Clearing cdp_url both triggers the launch below and makes
                # the finally-block close the context we create there.
                cdp_url = None

        if not cdp_url:
            # Launch with persistent profile (carries Google session cookies)
            logger.info(f"Launching Chromium with profile: {profile_dir}")
            context = p.chromium.launch_persistent_context(
                user_data_dir=profile_dir,
                headless=False,  # Must be False to keep Google session valid
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",
                    "--no-sandbox",
                ],
                viewport={"width": 1280, "height": 800},
            )
            page = context.pages[0] if context.pages else context.new_page()

        try:
            # Check authentication state
            logger.info("Checking authentication state...")
            page.goto("https://www.youtube.com", wait_until="domcontentloaded", timeout=20000)
            page.wait_for_timeout(2000)

            if not _check_logged_in(page):
                logger.warning(
                    "Browser is not logged in to Google.\n"
                    "Please log in manually and re-run, or use --profile-dir with "
                    "a directory where you are already logged in as sunvision07@gmail.com."
                )
                # Don't exit — let the caller handle this

            # Scrape requested sources
            if "history" in sources:
                history_records = scrape_watch_history(page, limit=history_limit)
                all_records.extend(history_records)
                logger.info(f"Watch History: {len(history_records)} videos scraped.")

            if "watchlater" in sources:
                wl_records = scrape_watch_later(page, limit=wl_limit)
                all_records.extend(wl_records)
                logger.info(f"Watch Later: {len(wl_records)} videos scraped.")

        finally:
            # Close context (but not cdp-connected browser)
            # NOTE(review): the "closed" message below is logged even on the
            # CDP path where nothing was actually closed.
            if not cdp_url:
                context.close()
            logger.info("Browser context closed.")

    # Report
    total = len(all_records)
    logger.info(f"Total videos scraped: {total}")

    # Optionally save JSON (done before DB write so data survives DB errors)
    if output_file:
        out_path = Path(output_file)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(all_records, f, indent=2, ensure_ascii=False, default=str)
        logger.info(f"Saved {total} records to {out_path}")

    # Persist to PostgreSQL (best-effort: scrape results are still returned
    # and printed even when the DB write fails)
    new_count = 0
    if not dry_run and all_records:
        try:
            conn = get_db_connection()
            ensure_youtube_intel_schema(conn)
            new_count = upsert_video_records(conn, all_records)
            conn.close()
        except Exception as e:
            logger.error(f"Database write failed: {e}")

    # Build summary
    by_source: Dict[str, int] = {}
    for rec in all_records:
        by_source[rec["source"]] = by_source.get(rec["source"], 0) + 1

    summary = {
        "total_scraped": total,
        "new_to_db": new_count,
        "by_source": by_source,
        "video_ids": [r["video_id"] for r in all_records],
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "dry_run": dry_run,
    }

    # Summary goes to stdout so callers can pipe/parse it separately from logs.
    print(json.dumps(summary, indent=2))
    return summary


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    """Parse CLI arguments and invoke the scraper."""
    ap = argparse.ArgumentParser(
        description=(
            "Scrape YouTube Watch History and Watch Later using Playwright.\n"
            "Requires browser already authenticated as sunvision07@gmail.com."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument(
        "--source",
        choices=["history", "watchlater", "both"],
        default="both",
        help="Which list(s) to scrape (default: both)",
    )
    ap.add_argument(
        "--history-limit",
        type=int,
        default=DEFAULT_HISTORY_LIMIT,
        help=f"Max Watch History videos to scrape (default: {DEFAULT_HISTORY_LIMIT})",
    )
    ap.add_argument(
        "--wl-limit",
        type=int,
        default=DEFAULT_WL_LIMIT,
        help=f"Max Watch Later videos to scrape (default: {DEFAULT_WL_LIMIT})",
    )
    ap.add_argument(
        "--profile-dir",
        default=DEFAULT_PROFILE_DIR,
        help=f"Chromium persistent profile directory (default: {DEFAULT_PROFILE_DIR})",
    )
    ap.add_argument(
        "--cdp-url",
        help="Connect to existing Chrome via CDP (e.g. http://localhost:9222)",
    )
    ap.add_argument(
        "--dry-run",
        action="store_true",
        help="Scrape but do not write to database",
    )
    ap.add_argument(
        "--output-file",
        help="Save scraped records to this JSON file",
    )

    opts = ap.parse_args()

    # "both" expands to the two concrete source names.
    sources = ["history", "watchlater"] if opts.source == "both" else [opts.source]

    run_scraper(
        sources=sources,
        history_limit=opts.history_limit,
        wl_limit=opts.wl_limit,
        profile_dir=opts.profile_dir,
        cdp_url=opts.cdp_url,
        dry_run=opts.dry_run,
        output_file=opts.output_file,
    )


if __name__ == "__main__":
    # Script entry point: parse CLI args and run the scraper.
    main()
