#!/usr/bin/env python3
"""
YouTube Watch History Fetcher
=============================
Fetches Kinan's daily YouTube watch history and stores it in PostgreSQL.

Strategy:
    The YouTube Data API v3 deprecated the `watchHistory` playlist (HL) in August 2016.
    Direct programmatic access to watch history is no longer possible via the standard API.

    This module implements a multi-strategy approach:
    1. PRIMARY: YouTube Data API v3 `activities.list` (mine=True) - gets recent activity
       (uploads, likes, comments, subscriptions, etc. -- partial watch signals)
    2. SECONDARY: Google Takeout JSON parser - parses the `watch-history.json` file
       from a Google Takeout export (can be scheduled every 2 months from takeout.google.com)
    3. TERTIARY: Browser automation via Playwright to scrape myactivity.google.com
       (most complete but requires browser session)

    For daily automated use, Strategy 1 + 2 combined gives the best results.
    Strategy 3 is available as a fallback for maximum completeness.

Usage:
    # Fetch via YouTube API (activities -- partial watch signals)
    python youtube_watch_history.py --mode api

    # Parse Google Takeout export
    python youtube_watch_history.py --mode takeout --takeout-file /path/to/watch-history.json

    # Browser automation (requires playwright install)
    python youtube_watch_history.py --mode browser

    # All strategies combined
    python youtube_watch_history.py --mode all --takeout-file /path/to/watch-history.json

Author: Genesis System
Version: 1.0.0
"""

import argparse
import json
import os
import re
import sys
import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

# Add Genesis paths
# NOTE(review): presumably this makes `elestio_config` (imported lazily in
# get_db_connection) resolvable -- confirm the module lives in this directory.
sys.path.insert(0, "/mnt/e/genesis-system/data/genesis-memory")

# Module-wide logging: timestamped, INFO-level messages on the root handler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
# Single named logger shared by every function in this module.
logger = logging.getLogger("yt_watch_history")

# ---------------------------------------------------------------------------
# Credentials loader
# ---------------------------------------------------------------------------

def load_credentials() -> Dict[str, str]:
    """Load Google/YouTube credentials from the environment or Genesis secrets.

    Environment variables take precedence. Any of the three secret values
    still empty afterwards is filled from
    /mnt/e/genesis-system/config/secrets.env (KEY=VALUE lines, '#' comment
    lines skipped, surrounding single/double quotes stripped from values).

    Returns:
        Dict with keys: google_client_id, google_client_secret,
        youtube_api_key, google_oauth_token_path, google_credentials_path.
    """
    creds = {
        "google_client_id": os.environ.get("GOOGLE_CLIENT_ID", ""),
        "google_client_secret": os.environ.get("GOOGLE_CLIENT_SECRET", ""),
        "youtube_api_key": os.environ.get("YOUTUBE_API_KEY", ""),
        "google_oauth_token_path": os.environ.get(
            "GOOGLE_OAUTH_TOKEN_PATH",
            "/mnt/e/genesis-system/config/google_oauth_token.json"
        ),
        "google_credentials_path": os.environ.get(
            "GOOGLE_CREDENTIALS_PATH",
            "/mnt/e/genesis-system/config/google_client_secret.json"
        ),
    }

    # Only these keys may be filled from secrets.env, and only when the
    # environment did not already provide a value.
    fillable = ("google_client_id", "google_client_secret", "youtube_api_key")
    secrets_path = Path("/mnt/e/genesis-system/config/secrets.env")
    if secrets_path.exists():
        for raw_line in secrets_path.read_text().splitlines():
            entry = raw_line.strip()
            if entry.startswith("#") or "=" not in entry:
                continue
            name, _, value = entry.partition("=")
            name = name.strip().lower()
            value = value.strip().strip('"').strip("'")
            if name in fillable and not creds[name]:
                creds[name] = value
    return creds


# ---------------------------------------------------------------------------
# Database layer
# ---------------------------------------------------------------------------

def get_db_connection():
    """Return a live PostgreSQL connection built from the Elestio config.

    Raises:
        ImportError: psycopg2 or elestio_config cannot be imported
            (logged with an install hint, then re-raised).
        Exception: any connection failure from psycopg2 (logged, re-raised).
    """
    try:
        import psycopg2
        from elestio_config import PostgresConfig
    except ImportError as e:
        logger.error(f"Missing dependency: {e}. Install with: pip install psycopg2-binary")
        raise

    try:
        return psycopg2.connect(**PostgresConfig.get_connection_params())
    except Exception as e:
        logger.error(f"Database connection failed: {e}")
        raise


def ensure_schema(conn):
    """Create the watch history table and its indexes if they don't exist.

    Issues idempotent DDL (CREATE ... IF NOT EXISTS) on the supplied
    connection and commits it.
    """
    cur = conn.cursor()
    try:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS yt_watch_history (
                id SERIAL PRIMARY KEY,
                video_id VARCHAR(20) NOT NULL,
                title TEXT,
                channel_name TEXT,
                channel_id VARCHAR(50),
                watched_at TIMESTAMP WITH TIME ZONE,
                duration_seconds INTEGER,
                thumbnail_url TEXT,
                source VARCHAR(20) DEFAULT 'api',
                created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
                UNIQUE(video_id, watched_at)
            );

            CREATE INDEX IF NOT EXISTS idx_yt_watch_date ON yt_watch_history(watched_at);
            CREATE INDEX IF NOT EXISTS idx_yt_video_id ON yt_watch_history(video_id);
            CREATE INDEX IF NOT EXISTS idx_yt_source ON yt_watch_history(source);
        """)
        conn.commit()
    finally:
        # Mirror the context-manager behavior: the cursor is closed even
        # when the DDL or the commit raises.
        cur.close()
    logger.info("Database schema verified/created.")


def insert_watch_records(conn, records: List[Dict[str, Any]]) -> int:
    """Insert watch history records, skipping duplicates. Returns count inserted.

    Duplicates (same video_id + watched_at) are skipped via ON CONFLICT.

    Bug fixed: the previous implementation called conn.rollback() when a
    single row failed, which silently discarded every row already inserted
    in the open transaction while still counting them as inserted. Each row
    is now wrapped in a SAVEPOINT so a bad record rolls back only itself.

    Args:
        conn: An open psycopg2 connection.
        records: Record dicts as produced by the fetch_via_* strategies.

    Returns:
        Number of rows actually inserted (conflicts and failures excluded).
    """
    if not records:
        return 0

    inserted = 0
    with conn.cursor() as cur:
        for rec in records:
            # Per-record savepoint: a failure aborts only this row, not the
            # whole batch accumulated so far in the transaction.
            cur.execute("SAVEPOINT yt_insert")
            try:
                cur.execute("""
                    INSERT INTO yt_watch_history
                        (video_id, title, channel_name, channel_id,
                         watched_at, duration_seconds, thumbnail_url, source)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (video_id, watched_at) DO NOTHING
                """, (
                    rec.get("video_id", ""),
                    rec.get("title", ""),
                    rec.get("channel_name", ""),
                    rec.get("channel_id", ""),
                    rec.get("watched_at"),
                    rec.get("duration_seconds"),
                    rec.get("thumbnail_url", ""),
                    rec.get("source", "api"),
                ))
                # rowcount == 0 means ON CONFLICT skipped a duplicate.
                if cur.rowcount > 0:
                    inserted += 1
                cur.execute("RELEASE SAVEPOINT yt_insert")
            except Exception as e:
                logger.warning(f"Failed to insert record for {rec.get('video_id')}: {e}")
                # Undo only the failed row; prior rows stay in the transaction.
                cur.execute("ROLLBACK TO SAVEPOINT yt_insert")
        conn.commit()

    logger.info(f"Inserted {inserted}/{len(records)} records (duplicates skipped).")
    return inserted


def get_todays_video_ids(conn, date: Optional[datetime] = None) -> List[str]:
    """Get all unique video IDs watched on a given date (UTC "today" by default).

    Bug fixed: the previous query used
    ``SELECT DISTINCT video_id ... ORDER BY watched_at``, which PostgreSQL
    rejects ("for SELECT DISTINCT, ORDER BY expressions must appear in
    select list"), so the function always errored. The query now groups by
    video_id and orders by each video's first watch time, preserving the
    intended chronological ordering.

    Args:
        conn: An open psycopg2 connection.
        date: Any datetime inside the desired day; defaults to now (UTC).

    Returns:
        Unique video IDs watched within [midnight, next midnight) of `date`,
        ordered by first watch time.
    """
    if date is None:
        date = datetime.now(timezone.utc)

    # Day window: [midnight, next midnight) in the timezone of `date`.
    start = date.replace(hour=0, minute=0, second=0, microsecond=0)
    end = start + timedelta(days=1)

    with conn.cursor() as cur:
        cur.execute("""
            SELECT video_id FROM yt_watch_history
            WHERE watched_at >= %s AND watched_at < %s
            GROUP BY video_id
            ORDER BY MIN(watched_at)
        """, (start, end))
        return [row[0] for row in cur.fetchall()]


# ---------------------------------------------------------------------------
# Strategy 1: YouTube Data API v3 activities
# ---------------------------------------------------------------------------

def fetch_via_api(creds: Dict[str, str], days_back: int = 1) -> List[Dict[str, Any]]:
    """
    Fetch recent YouTube activity via the Data API v3.

    NOTE: The activities.list endpoint returns actions the user performed
    (likes, uploads, comments, playlists, subscriptions) but NOT passive
    watch history. It gives partial watch signals.

    For full watch history, use the Takeout or Browser strategies.

    Args:
        creds: Credential dict from load_credentials(); only the
            google_oauth_token_path and google_credentials_path entries are
            read here.
        days_back: Look-back window in days used for the publishedAfter
            filter.

    Returns:
        List of record dicts with source="api_activity"; empty list when the
        Google client libraries or the OAuth client secrets are missing.
    """
    try:
        from google.oauth2.credentials import Credentials
        from google_auth_oauthlib.flow import InstalledAppFlow
        from google.auth.transport.requests import Request
        from googleapiclient.discovery import build
    except ImportError:
        logger.error(
            "Google API libraries required. Install with:\n"
            "  pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib"
        )
        return []

    # Read-only scope: sufficient for activities.list and videos.list.
    SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]
    token_path = Path(creds["google_oauth_token_path"])
    client_secrets_path = Path(creds["google_credentials_path"])

    credentials = None

    # Load cached token
    if token_path.exists():
        credentials = Credentials.from_authorized_user_file(str(token_path), SCOPES)

    # Refresh or re-authenticate
    if not credentials or not credentials.valid:
        if credentials and credentials.expired and credentials.refresh_token:
            try:
                credentials.refresh(Request())
            except Exception:
                # Refresh failed (e.g. revoked refresh token) -- fall through
                # to the full interactive flow below.
                credentials = None

        if not credentials:
            if not client_secrets_path.exists():
                logger.error(
                    f"OAuth client secrets file not found at: {client_secrets_path}\n"
                    "To set up:\n"
                    "1. Go to https://console.cloud.google.com/apis/credentials\n"
                    "2. Create an OAuth 2.0 Client ID (Desktop Application)\n"
                    "3. Download the JSON and save it to the path above\n"
                    "4. Enable the YouTube Data API v3 in the API Library"
                )
                return []

            # Interactive consent flow: opens a browser and a loopback
            # server on a random free port (port=0). Requires a human.
            flow = InstalledAppFlow.from_client_secrets_file(
                str(client_secrets_path), SCOPES
            )
            credentials = flow.run_local_server(port=0)

        # Save token for next run
        token_path.parent.mkdir(parents=True, exist_ok=True)
        with open(token_path, "w") as f:
            f.write(credentials.to_json())
        logger.info(f"OAuth token saved to {token_path}")

    # Build YouTube API client
    youtube = build("youtube", "v3", credentials=credentials)

    # Fetch activities
    # RFC 3339 timestamp (isoformat of an aware datetime) for publishedAfter.
    published_after = (
        datetime.now(timezone.utc) - timedelta(days=days_back)
    ).isoformat()

    records: List[Dict[str, Any]] = []
    next_page: Optional[str] = None

    # Page through all activities in the window (50 per page, API maximum).
    while True:
        request = youtube.activities().list(
            part="snippet,contentDetails",
            mine=True,
            maxResults=50,
            publishedAfter=published_after,
            pageToken=next_page,
        )
        response = request.execute()

        for item in response.get("items", []):
            snippet = item.get("snippet", {})
            content_details = item.get("contentDetails", {})

            # Extract video ID from various activity types
            # (each activity type nests the videoId differently).
            video_id = None
            activity_type = snippet.get("type", "")

            if activity_type == "upload":
                video_id = content_details.get("upload", {}).get("videoId")
            elif activity_type == "like":
                video_id = content_details.get("like", {}).get("resourceId", {}).get("videoId")
            elif activity_type == "favorite":
                video_id = content_details.get("favorite", {}).get("resourceId", {}).get("videoId")
            elif activity_type == "playlistItem":
                video_id = content_details.get("playlistItem", {}).get("resourceId", {}).get("videoId")
            elif activity_type == "recommendation":
                video_id = content_details.get("recommendation", {}).get("resourceId", {}).get("videoId")

            # Activity types without a video reference (e.g. subscriptions)
            # are skipped.
            if not video_id:
                continue

            # Get video details (duration, etc.) via videos.list
            # Prefer the largest available thumbnail.
            thumbnails = snippet.get("thumbnails", {})
            thumb_url = (
                thumbnails.get("high", {}).get("url")
                or thumbnails.get("medium", {}).get("url")
                or thumbnails.get("default", {}).get("url", "")
            )

            records.append({
                "video_id": video_id,
                "title": snippet.get("title", ""),
                "channel_name": snippet.get("channelTitle", ""),
                "channel_id": snippet.get("channelId", ""),
                # NOTE(review): publishedAt is the activity's time, used here
                # as a proxy for the watch time -- it is not a true watch
                # timestamp.
                "watched_at": snippet.get("publishedAt"),
                "duration_seconds": None,  # Enriched later
                "thumbnail_url": thumb_url,
                "source": "api_activity",
            })

        next_page = response.get("nextPageToken")
        if not next_page:
            break

    # Enrich with video durations
    if records:
        records = _enrich_video_details(youtube, records)

    logger.info(f"API strategy: fetched {len(records)} activity records.")
    return records


def _enrich_video_details(youtube, records: List[Dict]) -> List[Dict]:
    """Batch-fetch video details (duration, actual title) for a list of records.

    Mutates the records in place: duration_seconds is always overwritten,
    while title and channel_name are only filled when missing. Returns the
    same list. Failed API batches are logged and skipped.
    """
    unique_ids = list({rec["video_id"] for rec in records})

    details = {}
    # videos.list accepts at most 50 comma-separated IDs per request.
    for offset in range(0, len(unique_ids), 50):
        chunk = unique_ids[offset:offset + 50]
        try:
            resp = youtube.videos().list(
                part="contentDetails,snippet",
                id=",".join(chunk)
            ).execute()

            for item in resp.get("items", []):
                snippet = item.get("snippet", {})
                raw_duration = item.get("contentDetails", {}).get("duration", "")
                details[item["id"]] = {
                    "duration_seconds": _parse_iso8601_duration(raw_duration),
                    "title": snippet.get("title", ""),
                    "channel_name": snippet.get("channelTitle", ""),
                }
        except Exception as e:
            logger.warning(f"Failed to enrich video batch: {e}")

    for rec in records:
        info = details.get(rec["video_id"])
        if info is None:
            continue
        if not rec.get("title"):
            rec["title"] = info["title"]
        if not rec.get("channel_name"):
            rec["channel_name"] = info["channel_name"]
        rec["duration_seconds"] = info["duration_seconds"]

    return records


def _parse_iso8601_duration(duration: str) -> Optional[int]:
    """Parse ISO 8601 duration (PT1H2M3S) to seconds."""
    if not duration:
        return None
    match = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", duration)
    if not match:
        return None
    hours = int(match.group(1) or 0)
    minutes = int(match.group(2) or 0)
    seconds = int(match.group(3) or 0)
    return hours * 3600 + minutes * 60 + seconds


# ---------------------------------------------------------------------------
# Strategy 2: Google Takeout JSON parser
# ---------------------------------------------------------------------------

def fetch_via_takeout(
    takeout_path: str,
    days_back: int = 1,
    all_history: bool = False
) -> List[Dict[str, Any]]:
    """
    Parse YouTube watch history from a Google Takeout JSON export.

    Args:
        takeout_path: Path to the exported watch-history.json file.
        days_back: Keep only entries watched within the last N days
            (ignored when all_history is True).
        all_history: When True, import every entry regardless of age.

    Returns:
        List of record dicts with source="takeout"; empty list when the file
        is missing, not valid JSON, or not a JSON array.

    Robustness fixes: entries that are not JSON objects are skipped, and
    timestamps that parse without timezone info are coerced to UTC so the
    cutoff comparison cannot raise TypeError (naive vs aware datetimes).

    To get this file:
    1. Go to https://takeout.google.com
    2. Deselect all, then select only "YouTube and YouTube Music"
    3. Under "YouTube and YouTube Music", click "All YouTube data included"
    4. Select only "history" -> "watch-history.json"
    5. Choose JSON format (not HTML)
    6. Export and download
    7. Extract watch-history.json from the archive

    The file can also be set up for scheduled exports (every 2 months).
    """
    path = Path(takeout_path)
    if not path.exists():
        logger.error(f"Takeout file not found: {takeout_path}")
        return []

    with open(path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            logger.error(f"Invalid JSON in takeout file: {takeout_path}")
            return []

    if not isinstance(data, list):
        logger.error("Unexpected takeout format -- expected a JSON array.")
        return []

    # Age cutoff (aware, UTC); None means keep everything.
    cutoff = None
    if not all_history:
        cutoff = datetime.now(timezone.utc) - timedelta(days=days_back)

    records = []
    for entry in data:
        # Skip malformed entries (e.g. stray strings) instead of crashing.
        if not isinstance(entry, dict):
            continue

        # Parse timestamp; Takeout emits RFC 3339 with a trailing "Z".
        time_str = entry.get("time", "")
        try:
            watched_at = datetime.fromisoformat(time_str.replace("Z", "+00:00"))
        except (ValueError, AttributeError):
            continue

        # Coerce any naive timestamp to UTC so the aware-vs-naive comparison
        # below cannot raise TypeError.
        if watched_at.tzinfo is None:
            watched_at = watched_at.replace(tzinfo=timezone.utc)

        if cutoff and watched_at < cutoff:
            continue

        # Extract video ID from titleUrl
        title_url = entry.get("titleUrl", "")
        video_id = _extract_video_id(title_url)
        if not video_id:
            continue

        # Channel info (first subtitle entry carries the channel, when present)
        subtitles = entry.get("subtitles", [])
        channel_name = ""
        channel_url = ""
        if subtitles:
            channel_name = subtitles[0].get("name", "")
            channel_url = subtitles[0].get("url", "")

        channel_id = ""
        if channel_url:
            # Extract channel ID from URL like https://www.youtube.com/channel/UCxxxx
            ch_match = re.search(r"channel/([a-zA-Z0-9_-]+)", channel_url)
            if ch_match:
                channel_id = ch_match.group(1)

        title = entry.get("title", "")
        # Remove "Watched " prefix that Takeout adds
        if title.startswith("Watched "):
            title = title[8:]

        records.append({
            "video_id": video_id,
            "title": title,
            "channel_name": channel_name,
            "channel_id": channel_id,
            "watched_at": watched_at.isoformat(),
            "duration_seconds": None,  # Not in Takeout data
            "thumbnail_url": f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg",
            "source": "takeout",
        })

    logger.info(f"Takeout strategy: parsed {len(records)} records from {takeout_path}")
    return records


def _extract_video_id(url: str) -> Optional[str]:
    """Extract YouTube video ID from various URL formats."""
    if not url:
        return None
    patterns = [
        r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})",
        r"youtube\.com/v/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/shorts/([a-zA-Z0-9_-]{11})",
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None


# ---------------------------------------------------------------------------
# Strategy 3: Browser automation (Playwright)
# ---------------------------------------------------------------------------

def fetch_via_browser(days_back: int = 1) -> List[Dict[str, Any]]:
    """
    Scrape YouTube watch history via browser automation.

    Requires: pip install playwright && playwright install chromium

    This opens a browser, navigates to myactivity.google.com/product/youtube,
    and extracts the watch history entries.

    NOTE: First run requires manual Google login. After that, the browser
    profile is cached for automatic re-authentication.

    Args:
        days_back: Entries with a parseable timestamp older than this many
            days are dropped; entries with no timestamp are kept and stamped
            with the current time.

    Returns:
        List of record dicts with source="browser"; empty list when
        Playwright is not installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        logger.error(
            "Playwright required for browser strategy.\n"
            "Install with: pip install playwright && playwright install chromium"
        )
        return []

    # Persistent profile so the Google session survives across runs.
    profile_dir = "/mnt/e/genesis-system/config/browser_profile"
    Path(profile_dir).mkdir(parents=True, exist_ok=True)

    records = []
    cutoff = datetime.now(timezone.utc) - timedelta(days=days_back)

    with sync_playwright() as p:
        browser = p.chromium.launch_persistent_context(
            user_data_dir=profile_dir,
            headless=False,  # Must be False for first-time login
            args=["--disable-blink-features=AutomationControlled"],
        )
        page = browser.pages[0] if browser.pages else browser.new_page()

        # Navigate to YouTube history
        logger.info("Navigating to YouTube history page...")
        page.goto("https://myactivity.google.com/product/youtube?hl=en", timeout=60000)

        # Wait for content to load
        page.wait_for_timeout(5000)

        # Check if we need to login
        if "accounts.google.com" in page.url:
            logger.warning(
                "Google login required. Please log in manually in the browser window.\n"
                "The browser profile will be saved for future automated runs."
            )
            # Wait for user to complete login (up to 5 minutes)
            page.wait_for_url("**/myactivity.google.com/**", timeout=300000)
            page.wait_for_timeout(3000)

        # Scroll to load more entries
        logger.info("Loading watch history entries...")
        for _ in range(10):  # Scroll 10 times to load more
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            page.wait_for_timeout(1500)

        # Extract entries
        # MyActivity uses a specific DOM structure with activity cards
        # NOTE(review): `entries` is only used for the log line below; the
        # actual extraction goes through the watch-link selector further down.
        # Confirm these selectors still match MyActivity's current DOM.
        entries = page.query_selector_all('[data-date]')
        if not entries:
            # Fallback: try to find activity items by common patterns
            entries = page.query_selector_all('.fp-display-text')

        logger.info(f"Found {len(entries)} raw entries on page.")

        # Parse entries (structure varies, this is best-effort)
        all_links = page.query_selector_all('a[href*="youtube.com/watch"]')
        for link in all_links:
            href = link.get_attribute("href") or ""
            video_id = _extract_video_id(href)
            if not video_id:
                continue

            title = link.inner_text().strip()

            # Try to find associated timestamp
            # NOTE(review): evaluate_handle returns a JSHandle; calling
            # get_attribute on it may fail -- any failure is swallowed below,
            # leaving watched_str as None.
            parent = link.evaluate_handle("el => el.closest('[data-date]')")
            watched_str = None
            try:
                watched_str = parent.get_attribute("data-date")
            except Exception:
                pass

            watched_at = None
            if watched_str:
                try:
                    watched_at = datetime.fromisoformat(watched_str.replace("Z", "+00:00"))
                except (ValueError, AttributeError):
                    pass

            # Only entries with a known timestamp can be age-filtered;
            # undated entries fall through and are kept.
            if watched_at and watched_at < cutoff:
                continue

            records.append({
                "video_id": video_id,
                "title": title,
                "channel_name": "",
                "channel_id": "",
                # Undated entries are stamped "now" as a best-effort value.
                "watched_at": watched_at.isoformat() if watched_at else datetime.now(timezone.utc).isoformat(),
                "duration_seconds": None,
                "thumbnail_url": f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg",
                "source": "browser",
            })

        browser.close()

    logger.info(f"Browser strategy: scraped {len(records)} watch entries.")
    return records


# ---------------------------------------------------------------------------
# Main orchestrator
# ---------------------------------------------------------------------------

def run(
    mode: str = "takeout",
    takeout_file: Optional[str] = None,
    days_back: int = 1,
    all_history: bool = False,
    dry_run: bool = False,
) -> Tuple[int, List[str]]:
    """
    Run the watch history fetcher.

    Collects records from every strategy selected by `mode`, deduplicates
    them, and (unless dry_run) stores them in PostgreSQL.

    Returns:
        Tuple of (records_inserted, video_ids)
    """
    creds = load_credentials()
    collected: List[Dict[str, Any]] = []

    if mode in ("api", "all"):
        collected.extend(fetch_via_api(creds, days_back=days_back))

    if mode in ("takeout", "all"):
        resolved = takeout_file
        if not resolved:
            # No explicit file: probe the default drop locations.
            for candidate in (
                "/mnt/e/genesis-system/data/youtube_takeout/watch-history.json",
                "/mnt/e/genesis-system/config/watch-history.json",
            ):
                if Path(candidate).exists():
                    resolved = candidate
                    break

        if resolved:
            collected.extend(
                fetch_via_takeout(resolved, days_back=days_back, all_history=all_history)
            )
        else:
            logger.warning("No takeout file specified or found at default locations.")

    if mode in ("browser", "all"):
        collected.extend(fetch_via_browser(days_back=days_back))

    # Deduplicate on (video_id, watched_at), keeping the first occurrence.
    seen_keys = set()
    unique_records = []
    for rec in collected:
        key = (rec["video_id"], rec.get("watched_at", ""))
        if key in seen_keys:
            continue
        seen_keys.add(key)
        unique_records.append(rec)

    logger.info(f"Total unique records: {len(unique_records)}")

    if dry_run:
        for rec in unique_records:
            logger.info(f"  [{rec['source']}] {rec['video_id']} - {rec['title']}")
        return 0, [r["video_id"] for r in unique_records]

    # Store in PostgreSQL
    conn = get_db_connection()
    try:
        ensure_schema(conn)
        inserted = insert_watch_records(conn, unique_records)

        # Hand today's IDs to the downstream transcript pipeline.
        video_ids = get_todays_video_ids(conn)
        logger.info(f"Today's video IDs for transcript extraction: {len(video_ids)}")

        return inserted, video_ids
    finally:
        conn.close()


def main():
    """CLI entry point: parse arguments, run the fetcher, print a JSON summary."""
    parser = argparse.ArgumentParser(
        description="Fetch YouTube watch history for Genesis memory pipeline"
    )
    parser.add_argument(
        "--mode",
        choices=["api", "takeout", "browser", "all"],
        default="takeout",
        help="Fetching strategy: api (YT activities), takeout (Google Takeout JSON), "
             "browser (Playwright), all (combined)",
    )
    parser.add_argument(
        "--takeout-file",
        help="Path to Google Takeout watch-history.json file",
    )
    parser.add_argument(
        "--days-back",
        type=int,
        default=1,
        help="Number of days of history to fetch (default: 1)",
    )
    parser.add_argument(
        "--all-history",
        action="store_true",
        help="Import ALL history from takeout file (ignores --days-back for takeout)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print records without storing to database",
    )
    args = parser.parse_args()

    inserted, video_ids = run(
        mode=args.mode,
        takeout_file=args.takeout_file,
        days_back=args.days_back,
        all_history=args.all_history,
        dry_run=args.dry_run,
    )

    # JSON summary on stdout so downstream pipeline stages can chain on it.
    summary = {
        "inserted": inserted,
        "video_ids": video_ids,
        "count": len(video_ids),
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()
