#!/usr/bin/env python3
"""
YouTube Watch History Fetcher for Genesis

Fetches YouTube watch history using the YouTube Data API v3,
deduplicates against already-processed videos in PostgreSQL,
and outputs new videos for transcript extraction.

Note: YouTube API v3 doesn't directly expose watch history.
This implementation uses multiple strategies:
1. Liked videos (shows engagement)
2. Subscriptions + recent uploads (likely watched)
3. Search history via Google Takeout (if available)

For true watch history, users can export via Google Takeout
and this script can ingest that data.

Usage:
    python history_fetcher.py --days 7           # Last 7 days
    python history_fetcher.py --takeout FILE     # Import from Takeout
    python history_fetcher.py --subscriptions    # Recent from subscriptions

Author: Genesis System
Version: 1.0.0
"""

import argparse
import html
import json
import logging
import os
import re
import sys

from dataclasses import dataclass, asdict
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional, List, Dict, Any, Set

# Add genesis-system to path so `core.youtube.*` imports resolve.
# NOTE(review): hard-coded absolute path — assumes the /mnt/e mount layout; confirm.
sys.path.insert(0, '/mnt/e/genesis-system')

# Module-level logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths
GENESIS_ROOT = Path("/mnt/e/genesis-system")  # must match the sys.path entry above
CONFIG_DIR = GENESIS_ROOT / "config"
DATA_DIR = GENESIS_ROOT / "data" / "youtube"
# Side effect at import time: ensure the output directory exists.
DATA_DIR.mkdir(parents=True, exist_ok=True)


@dataclass
class VideoInfo:
    """Information about a YouTube video discovered by one of the fetch strategies.

    Fields mirror the YouTube Data API snippet where available; Takeout-sourced
    entries may carry empty channel_id/published_at.
    """
    video_id: str
    title: str
    channel: str
    channel_id: str
    published_at: str
    duration: Optional[str] = None       # ISO-8601 duration string, e.g. "PT5M33S"
    description: Optional[str] = None    # truncated to 500 chars by the fetchers
    thumbnail_url: Optional[str] = None
    view_count: Optional[int] = None
    discovered_at: Optional[str] = None  # filled in __post_init__ when not supplied
    source: str = "api"  # api, takeout, subscription (fetchers also use "liked")

    def __post_init__(self):
        # Stamp discovery time as naive-UTC ISO 8601 with a trailing "Z",
        # matching the legacy utcnow().isoformat() + "Z" format while using
        # the non-deprecated timezone-aware API under the hood.
        if self.discovered_at is None:
            self.discovered_at = (
                datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
            )


class PostgreSQLDeduplicator:
    """Check PostgreSQL for already-processed videos.

    Connection failures are deliberately non-fatal: without a connection,
    deduplication is disabled and every video is treated as new.
    """

    def __init__(self):
        self.conn = None  # psycopg2 connection, or None when dedup is disabled
        self._connect()

    def _connect(self):
        """Connect to Elestio PostgreSQL.

        Credentials come from GENESIS_PG_* environment variables when set,
        falling back to the legacy hard-coded values for compatibility.
        SECURITY: remove the hard-coded fallback password once all
        deployments define GENESIS_PG_PASSWORD.
        """
        try:
            import psycopg2

            # Use correct Elestio domain (a.elestio.app, not vm.elestio.app)
            self.conn = psycopg2.connect(
                host=os.environ.get("GENESIS_PG_HOST", "postgresql-genesis-u50607.a.elestio.app"),
                port=int(os.environ.get("GENESIS_PG_PORT", "25432")),
                user=os.environ.get("GENESIS_PG_USER", "postgres"),
                password=os.environ.get("GENESIS_PG_PASSWORD", "etY0eog17tD-dDuj--IRH"),
                database=os.environ.get("GENESIS_PG_DATABASE", "postgres"),
                connect_timeout=10
            )
            logger.info("Connected to PostgreSQL for deduplication")
        except ImportError:
            logger.warning("psycopg2 not installed, deduplication disabled")
            self.conn = None
        except Exception as e:
            logger.warning(f"PostgreSQL connection failed: {e}")
            self.conn = None

    def get_processed_video_ids(self) -> Set[str]:
        """Return the set of already-processed video IDs.

        Returns an empty set when there is no connection or the query fails,
        so callers simply skip deduplication in that case.
        """
        if not self.conn:
            return set()

        try:
            with self.conn.cursor() as cur:
                # Check both youtube_videos and processed_history tables
                cur.execute("""
                    SELECT video_id FROM youtube_videos
                    UNION
                    SELECT video_id FROM processed_history WHERE success = true
                """)
                return {row[0] for row in cur.fetchall()}
        except Exception as e:
            logger.warning(f"Failed to get processed videos: {e}")
            return set()

    def close(self):
        """Close the database connection (safe to call when never connected)."""
        if self.conn:
            self.conn.close()


class YouTubeHistoryFetcher:
    """Fetches YouTube watch history and related videos.

    The Data API v3 does not expose true watch history, so this combines
    proxies: liked videos, recent uploads from subscribed channels, and
    Google Takeout watch-history exports.
    """

    def __init__(self):
        self.youtube = None      # googleapiclient resource; None when init failed
        self.credentials = None  # OAuth credentials; None when unavailable
        self._init_api()
        self.deduplicator = PostgreSQLDeduplicator()

    def _init_api(self):
        """Initialize the YouTube API client from stored OAuth credentials.

        Failures are logged and swallowed: self.youtube stays None and the
        fetch methods degrade to returning empty lists instead of raising.
        """
        try:
            from googleapiclient.discovery import build
            from core.youtube.youtube_oauth import YouTubeOAuth

            oauth = YouTubeOAuth()
            self.credentials = oauth.get_credentials()

            if self.credentials:
                self.youtube = build('youtube', 'v3', credentials=self.credentials)
                logger.info("YouTube API initialized successfully")
            else:
                logger.warning("No OAuth credentials available")
                logger.warning("Run: python core/youtube/youtube_oauth.py --setup")
        except ImportError as e:
            logger.error(f"Required packages not installed: {e}")
            logger.error("Run: pip install google-api-python-client google-auth-oauthlib")
        except Exception as e:
            logger.error(f"Failed to initialize YouTube API: {e}")

    def fetch_liked_videos(self, max_results: int = 50) -> List[VideoInfo]:
        """Fetch the user's liked videos (proxy for engagement).

        Args:
            max_results: Maximum number of videos to return.

        Returns:
            List of VideoInfo with source='liked'; empty on any error.
        """
        if not self.youtube:
            logger.error("YouTube API not initialized")
            return []

        videos = []
        try:
            request = self.youtube.videos().list(
                part='snippet,contentDetails,statistics',
                myRating='like',
                maxResults=min(max_results, 50)  # API caps page size at 50
            )

            while request and len(videos) < max_results:
                response = request.execute()

                for item in response.get('items', []):
                    snippet = item['snippet']
                    content = item.get('contentDetails', {})
                    stats = item.get('statistics', {})

                    videos.append(VideoInfo(
                        video_id=item['id'],
                        title=snippet['title'],
                        channel=snippet['channelTitle'],
                        channel_id=snippet['channelId'],
                        published_at=snippet['publishedAt'],
                        duration=content.get('duration'),
                        description=snippet.get('description', '')[:500],
                        thumbnail_url=snippet.get('thumbnails', {}).get('medium', {}).get('url'),
                        view_count=int(stats.get('viewCount', 0)) if stats.get('viewCount') else None,
                        source='liked'
                    ))

                # Follow pagination; list_next returns None on the last page.
                request = self.youtube.videos().list_next(request, response)

            logger.info(f"Fetched {len(videos)} liked videos")
            return videos

        except Exception as e:
            logger.error(f"Failed to fetch liked videos: {e}")
            return []

    def fetch_subscription_uploads(
        self,
        max_channels: int = 20,
        days_back: int = 7,
        videos_per_channel: int = 5
    ) -> List[VideoInfo]:
        """Fetch recent uploads from subscribed channels.

        Args:
            max_channels: Maximum number of subscriptions to scan.
            days_back: Only include videos published within this many days.
            videos_per_channel: Maximum uploads inspected per channel.

        Returns:
            List of VideoInfo with source='subscription'; empty on any error.
        """
        if not self.youtube:
            logger.error("YouTube API not initialized")
            return []

        videos = []
        # Naive-UTC cutoff; published timestamps below are normalized to naive
        # UTC too, so the comparison is consistent.
        cutoff_date = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=days_back)

        try:
            # Get subscriptions
            subs_request = self.youtube.subscriptions().list(
                part='snippet',
                mine=True,
                maxResults=min(max_channels, 50)
            )
            subs_response = subs_request.execute()

            channel_ids = [
                item['snippet']['resourceId']['channelId']
                for item in subs_response.get('items', [])
            ]

            logger.info(f"Found {len(channel_ids)} subscriptions")

            # Get recent uploads from each channel; one channel failing must
            # not abort the whole scan.
            for channel_id in channel_ids[:max_channels]:
                try:
                    # Get channel's uploads playlist
                    channel_request = self.youtube.channels().list(
                        part='contentDetails',
                        id=channel_id
                    )
                    channel_response = channel_request.execute()

                    if not channel_response.get('items'):
                        continue

                    uploads_playlist = channel_response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

                    # Get recent videos from uploads playlist
                    playlist_request = self.youtube.playlistItems().list(
                        part='snippet,contentDetails',
                        playlistId=uploads_playlist,
                        maxResults=videos_per_channel
                    )
                    playlist_response = playlist_request.execute()

                    for item in playlist_response.get('items', []):
                        snippet = item['snippet']
                        # API timestamps end in "Z"; convert to naive UTC.
                        published = datetime.fromisoformat(
                            snippet['publishedAt'].replace('Z', '+00:00')
                        ).replace(tzinfo=None)

                        if published >= cutoff_date:
                            videos.append(VideoInfo(
                                video_id=item['contentDetails']['videoId'],
                                title=snippet['title'],
                                channel=snippet['channelTitle'],
                                channel_id=snippet['channelId'],
                                published_at=snippet['publishedAt'],
                                description=snippet.get('description', '')[:500],
                                thumbnail_url=snippet.get('thumbnails', {}).get('medium', {}).get('url'),
                                source='subscription'
                            ))

                except Exception as e:
                    logger.warning(f"Failed to get uploads for channel {channel_id}: {e}")
                    continue

            logger.info(f"Fetched {len(videos)} videos from subscriptions")
            return videos

        except Exception as e:
            logger.error(f"Failed to fetch subscription uploads: {e}")
            return []

    def parse_takeout_history(self, takeout_file: Path) -> List[VideoInfo]:
        """
        Parse YouTube watch history from Google Takeout export.

        Takeout exports watch history as HTML or JSON.
        Expected location: Takeout/YouTube and YouTube Music/history/watch-history.html

        Args:
            takeout_file: Path to watch-history.json or watch-history.html.

        Returns:
            List of VideoInfo with source='takeout'; empty when the file is
            missing or in an unrecognized format.
        """
        videos = []

        if not takeout_file.exists():
            logger.error(f"Takeout file not found: {takeout_file}")
            return []

        content = takeout_file.read_text(encoding='utf-8')

        # Try JSON format first
        if takeout_file.suffix == '.json':
            try:
                data = json.loads(content)
                for item in data:
                    if 'titleUrl' in item:
                        # Extract video ID from URL
                        url = item['titleUrl']
                        video_id_match = re.search(r'watch\?v=([a-zA-Z0-9_-]{11})', url)
                        if video_id_match:
                            # 'subtitles' carries the channel name but may be
                            # absent OR an empty list — index defensively
                            # (the old [{}][0] default crashed on []).
                            subtitles = item.get('subtitles') or [{}]
                            videos.append(VideoInfo(
                                video_id=video_id_match.group(1),
                                title=item.get('title', 'Unknown'),
                                channel=subtitles[0].get('name', 'Unknown'),
                                channel_id='',
                                published_at=item.get('time', ''),
                                source='takeout'
                            ))
                logger.info(f"Parsed {len(videos)} videos from Takeout JSON")
                return videos
            except json.JSONDecodeError:
                # Fall through to the HTML branch / unknown-format warning.
                pass

        # Try HTML format
        if takeout_file.suffix in ['.html', '.htm']:
            # Parse watch history HTML
            # Format: <a href="https://www.youtube.com/watch?v=VIDEO_ID">Title</a>
            pattern = r'href="https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})"[^>]*>([^<]+)</a>'
            matches = re.findall(pattern, content)

            for video_id, title in matches:
                videos.append(VideoInfo(
                    video_id=video_id,
                    # Anchor text is HTML-escaped in the export (&amp; etc.).
                    title=html.unescape(title).strip(),
                    channel='Unknown',
                    channel_id='',
                    published_at='',
                    source='takeout'
                ))

            logger.info(f"Parsed {len(videos)} videos from Takeout HTML")
            return videos

        logger.warning(f"Unknown Takeout format: {takeout_file.suffix}")
        return []

    def fetch_all(
        self,
        days_back: int = 7,
        include_liked: bool = True,
        include_subscriptions: bool = True,
        takeout_file: Optional[Path] = None,
        deduplicate: bool = True
    ) -> List[VideoInfo]:
        """
        Fetch videos from all available sources.

        Args:
            days_back: How many days back to look
            include_liked: Include liked videos
            include_subscriptions: Include subscription uploads
            takeout_file: Path to Google Takeout export
            deduplicate: Remove already-processed videos

        Returns:
            List of VideoInfo objects for new videos
        """
        # Keyed by video_id; first source to report a video wins
        # (liked > subscription > takeout).
        all_videos: Dict[str, VideoInfo] = {}

        # Fetch from each source
        if include_liked and self.youtube:
            for video in self.fetch_liked_videos(max_results=100):
                all_videos[video.video_id] = video

        if include_subscriptions and self.youtube:
            for video in self.fetch_subscription_uploads(days_back=days_back):
                if video.video_id not in all_videos:
                    all_videos[video.video_id] = video

        if takeout_file:
            for video in self.parse_takeout_history(takeout_file):
                if video.video_id not in all_videos:
                    all_videos[video.video_id] = video

        videos = list(all_videos.values())
        logger.info(f"Total unique videos found: {len(videos)}")

        # Deduplicate against PostgreSQL
        if deduplicate:
            processed_ids = self.deduplicator.get_processed_video_ids()
            original_count = len(videos)
            videos = [v for v in videos if v.video_id not in processed_ids]
            skipped = original_count - len(videos)
            if skipped > 0:
                logger.info(f"Skipped {skipped} already-processed videos")

        logger.info(f"New videos to process: {len(videos)}")
        return videos

    def save_results(self, videos: List[VideoInfo], output_path: Optional[Path] = None) -> Path:
        """Save the video list to a JSON file.

        Args:
            videos: Videos to serialize.
            output_path: Target file; defaults to a timestamped file in DATA_DIR.

        Returns:
            The path the file was written to.
        """
        if output_path is None:
            timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
            output_path = DATA_DIR / f"history_fetch_{timestamp}.json"

        output_path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            # Naive-UTC ISO string with trailing "Z", matching VideoInfo.discovered_at.
            "fetched_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
            "count": len(videos),
            "videos": [asdict(v) for v in videos]
        }

        # Explicit encoding so output is platform-independent.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        logger.info(f"Saved {len(videos)} videos to: {output_path}")
        return output_path

    def close(self):
        """Clean up resources (closes the deduplicator's DB connection)."""
        self.deduplicator.close()


def main():
    """CLI entry point: gather videos from the selected sources, then emit JSON."""
    arg_parser = argparse.ArgumentParser(
        description="Fetch YouTube watch history for Genesis knowledge pipeline"
    )

    arg_parser.add_argument('--days', '-d', type=int, default=7,
                            help='Days back to look for videos (default: 7)')
    arg_parser.add_argument('--takeout', '-t', type=Path,
                            help='Path to Google Takeout watch history file')
    arg_parser.add_argument('--subscriptions', '-s', action='store_true',
                            help='Include recent uploads from subscriptions')
    arg_parser.add_argument('--liked', '-l', action='store_true',
                            help='Include liked videos')
    arg_parser.add_argument('--all', '-a', action='store_true',
                            help='Include all sources (liked + subscriptions)')
    arg_parser.add_argument('--no-dedupe', action='store_true',
                            help='Skip deduplication against processed videos')
    arg_parser.add_argument('--output', '-o', type=Path,
                            help='Output JSON file path')
    arg_parser.add_argument('--json', action='store_true',
                            help='Output JSON to stdout')

    opts = arg_parser.parse_args()

    # Default to all sources if none specified
    want_liked = opts.liked or opts.all or not (opts.subscriptions or opts.takeout)
    want_subscriptions = opts.subscriptions or opts.all

    fetcher = YouTubeHistoryFetcher()

    try:
        new_videos = fetcher.fetch_all(
            days_back=opts.days,
            include_liked=want_liked,
            include_subscriptions=want_subscriptions,
            takeout_file=opts.takeout,
            deduplicate=not opts.no_dedupe
        )

        if opts.json:
            print(json.dumps([asdict(v) for v in new_videos], indent=2))
        else:
            saved_to = fetcher.save_results(new_videos, opts.output)

            rule = '=' * 60
            print(f"\n{rule}")
            print(f"YouTube History Fetch Complete")
            print(f"{rule}")
            print(f"  Videos found: {len(new_videos)}")
            print(f"  Output file:  {saved_to}")
            print(f"{rule}")

            if new_videos:
                print(f"\nSample videos:")
                for v in new_videos[:5]:
                    print(f"  - [{v.video_id}] {v.title[:50]}...")

    finally:
        # Always release the DB connection, even if fetching fails.
        fetcher.close()


# Allow use both as a script and as an importable module.
if __name__ == "__main__":
    main()
