#!/usr/bin/env python3
"""
Genesis YouTube Transcript Extractor
=====================================
Multi-method YouTube transcript extraction with fallback chain.

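Methods:
1. youtube-transcript-api (primary, fast) - implemented in this module
2. Browser automation via Playwright (fallback) - not implemented in this module
3. yt-dlp + Whisper (audio transcription fallback) - not implemented in this module

When method 1 fails, get_transcript() reports the error together with
'fallback_required': True to signal that one of the fallbacks is needed.
yt-dlp is used here only to look up video metadata.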
"""

import re
import json
import subprocess
from typing import Optional, Dict, List, Any
from dataclasses import dataclass
from datetime import datetime


@dataclass
class TranscriptSegment:
    """A single transcript segment with timing.

    Instances are built in get_transcript_api() from the items returned by
    the transcript API; the timing values are stored exactly as received.
    """
    # Caption text of this segment.
    text: str
    # Offset at which the segment begins (presumably seconds -- the CLI prints
    # the derived total with an "s" suffix; confirm against the API).
    start: float
    # How long the segment is shown, in the same unit as `start`.
    duration: float


@dataclass
class TranscriptResult:
    """Complete transcript extraction result.

    Returned by the extraction functions for both success and failure. On
    failure `error` holds the exception message and the content fields are
    empty (`text=""`, `segments=[]`, `language=""`, `duration=0`).
    """
    # YouTube video ID the extraction was attempted for.
    video_id: str
    # Video title; None until filled in from separately fetched metadata.
    title: Optional[str]
    # All segment texts joined with single spaces.
    text: str
    # Individual timed segments, in the order the source returned them.
    segments: List[TranscriptSegment]
    # Language reported for the transcript; "" on failure.
    language: str
    # Overall transcript duration derived from the segment timings.
    duration: float
    # Extraction method that produced this result.
    method: str  # 'api', 'browser', 'whisper'
    # Local timestamp of the extraction, from datetime.now().isoformat().
    extracted_at: str
    # Exception message when extraction failed; None on success.
    error: Optional[str] = None


def extract_video_id(url: str) -> Optional[str]:
    """
    Extract the 11-character video ID from a YouTube URL or a bare ID.

    Supports:
    - youtube.com/watch?v=ID (``v`` may appear anywhere in the query string)
    - youtu.be/ID
    - youtube.com/embed/ID (also on youtube-nocookie.com)
    - youtube.com/v/ID
    - youtube.com/shorts/ID
    - youtube.com/live/ID
    - a bare 11-character video ID

    Args:
        url: YouTube URL or video ID. Surrounding whitespace is ignored.

    Returns:
        The video ID, or None when ``url`` is not a string or no supported
        format matches.
    """
    if not isinstance(url, str):
        return None

    # Pasted URLs and IDs often carry stray spaces or a trailing newline.
    candidate = url.strip()

    patterns = (
        # `(?:[^#\s]*&)?` lets other query parameters precede `v=`, e.g.
        # watch?feature=share&v=ID, without reading past a fragment or space.
        r'(?:youtube(?:-nocookie)?\.com/'
        r'(?:watch\?(?:[^#\s]*&)?v=|embed/|v/|shorts/|live/)'
        r'|youtu\.be/)([a-zA-Z0-9_-]{11})',
        r'^([a-zA-Z0-9_-]{11})$',  # Direct video ID
    )

    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)
    return None


def get_transcript_api(video_id: str, languages: Optional[List[str]] = None) -> TranscriptResult:
    """
    Primary method: Use youtube-transcript-api.
    Fast, reliable, no browser needed.

    Args:
        video_id: YouTube video ID.
        languages: Preferred language codes in priority order. None or an
            empty list means ``['en']``.

    Returns:
        A TranscriptResult with ``method="api"``. On success it carries the
        segments, the space-joined text, the transcript's language code and
        the end time of the latest-ending segment as ``duration``. Any
        failure (library not installed, no transcript, network error) is
        caught and returned with ``error`` set and empty content fields;
        this function does not raise.
    """
    # None replaces the mutable list default; English stays the default.
    preferred = list(languages) if languages else ['en']

    try:
        from youtube_transcript_api import YouTubeTranscriptApi

        api = YouTubeTranscriptApi()

        try:
            # Try with preferred languages first
            transcript_data = api.fetch(video_id, languages=preferred)
        except Exception:
            # fetch() without `languages` only looks for English, so it cannot
            # serve as an "any language" fallback. Enumerate the transcripts
            # that actually exist and take the first one instead.
            fallback = next(iter(api.list(video_id)), None)
            if fallback is None:
                raise  # nothing available: report the original failure
            transcript_data = fallback.fetch()

        segments = []
        for item in transcript_data:
            if hasattr(item, 'text'):
                # Current API: snippet objects with attributes
                segments.append(TranscriptSegment(
                    text=item.text,
                    start=item.start,
                    duration=item.duration
                ))
            else:
                # Fallback for dict-like items
                segments.append(TranscriptSegment(
                    text=item.get('text', ''),
                    start=item.get('start', 0),
                    duration=item.get('duration', 0)
                ))

        full_text = ' '.join(s.text for s in segments)

        # Caption segments can overlap or leave gaps, so summing their
        # durations misstates the length; use the latest end time instead.
        total_duration = max((s.start + s.duration for s in segments), default=0)

        # Prefer the language code so the value has the same form as the
        # `languages` argument and the 'en' default; fall back to the display
        # name, then to 'en' when the result carries neither.
        used_language = (
            getattr(transcript_data, 'language_code', None)
            or getattr(transcript_data, 'language', None)
            or 'en'
        )

        return TranscriptResult(
            video_id=video_id,
            title=None,  # API doesn't provide title
            text=full_text,
            segments=segments,
            language=used_language,
            duration=total_duration,
            method="api",
            extracted_at=datetime.now().isoformat()
        )

    except Exception as e:
        # Boundary of the "api" method: callers inspect `error` to decide
        # whether another extraction method is needed.
        return TranscriptResult(
            video_id=video_id,
            title=None,
            text="",
            segments=[],
            language="",
            duration=0,
            method="api",
            extracted_at=datetime.now().isoformat(),
            error=str(e)
        )


def get_video_info(video_id: str) -> Dict[str, Any]:
    """Get video metadata using yt-dlp.

    Runs ``yt-dlp --dump-json --no-download`` against the video's watch URL
    with a 30 second timeout and parses its stdout as JSON.

    Args:
        video_id: YouTube video ID.

    Returns:
        The parsed metadata dictionary, or ``{}`` when yt-dlp is missing,
        times out, exits non-zero, or prints something that is not a JSON
        object. Metadata is best-effort, so this function does not raise for
        those failures.
    """
    command = [
        'yt-dlp', '--dump-json', '--no-download',
        f'https://youtube.com/watch?v={video_id}',
    ]

    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: yt-dlp not installed or not executable.
        # SubprocessError: the 30 second timeout expired.
        # ValueError: invalid argument (e.g. NUL byte) or undecodable output.
        return {}

    if completed.returncode != 0:
        return {}

    try:
        info = json.loads(completed.stdout)
    except ValueError:  # json.JSONDecodeError is a ValueError
        return {}

    # Callers use .get() on the result, so only hand back a real mapping.
    return info if isinstance(info, dict) else {}


def get_transcript(
    url_or_id: str,
    languages: Optional[List[str]] = None,
    include_metadata: bool = True
) -> Dict[str, Any]:
    """
    Main entry point for transcript extraction.

    Args:
        url_or_id: YouTube URL or video ID
        languages: Preferred languages in order; None means ``['en']``
        include_metadata: Whether to fetch video metadata

    Returns:
        Dictionary with transcript data and metadata. ``'success'`` is always
        present:

        - ``False`` with only ``'error'`` when no video ID could be extracted.
        - ``False`` with ``'video_id'``, ``'error'``, ``'method'``,
          ``'fallback_required': True`` and ``'metadata'`` when extraction
          failed.
        - ``True`` with the transcript text, segments, language, duration,
          method, timestamp, metadata and word/segment counts otherwise.
    """
    # Extract video ID
    video_id = extract_video_id(url_or_id)
    if not video_id:
        return {
            'success': False,
            'error': f'Could not extract video ID from: {url_or_id}'
        }

    # None replaces the mutable list default; always pass a concrete list on.
    preferred = ['en'] if languages is None else languages

    # Try API method first
    result = get_transcript_api(video_id, preferred)

    # Get video metadata if requested
    metadata = {}
    if include_metadata:
        info = get_video_info(video_id)
        metadata = {
            'title': info.get('title'),
            'channel': info.get('channel'),
            'upload_date': info.get('upload_date'),
            'view_count': info.get('view_count'),
            'like_count': info.get('like_count'),
            # `or ''` also covers a description that is present but None,
            # which a .get() default alone would let through to the slice.
            'description': (info.get('description') or '')[:500]  # First 500 chars
        }
        if metadata['title']:
            result.title = metadata['title']

    if result.error:
        return {
            'success': False,
            'video_id': video_id,
            'error': result.error,
            'method': result.method,
            'fallback_required': True,
            'metadata': metadata
        }

    return {
        'success': True,
        'video_id': video_id,
        'title': result.title,
        'text': result.text,
        'segments': [
            {'text': s.text, 'start': s.start, 'duration': s.duration}
            for s in result.segments
        ],
        'language': result.language,
        'duration': result.duration,
        'method': result.method,
        'extracted_at': result.extracted_at,
        'metadata': metadata,
        'word_count': len(result.text.split()),
        'segment_count': len(result.segments)
    }


def summarize_transcript(transcript_text: str, max_length: int = 500) -> str:
    """
    Create a basic summary of the transcript.
    For LLM-powered summaries, use the /youtube:summary command.

    Args:
        transcript_text: Full transcript text.
        max_length: Maximum number of words (not characters) to keep.
            Values below zero are treated as zero.

    Returns:
        ``transcript_text`` unchanged when it has at most ``max_length``
        words. Otherwise the first ``max_length // 2`` words and the
        remaining budget of words from the end, separated by a ``[...]``
        marker; whitespace inside the kept parts is collapsed to single
        spaces.
    """
    words = transcript_text.split()
    budget = max(max_length, 0)
    if len(words) <= budget:
        return transcript_text

    # Simple extractive summary: first and last portions.
    head_count = budget // 2
    tail_count = budget - head_count  # the tail gets the odd word, if any

    first_part = ' '.join(words[:head_count])
    # Slice from an explicit start index: a negative-index slice with a
    # count of zero (words[-0:]) would return the entire list.
    last_part = ' '.join(words[len(words) - tail_count:])

    return f"{first_part}\n\n[...]\n\n{last_part}"


def search_transcript(transcript_text: str, query: str) -> List[Dict[str, Any]]:
    """
    Search for occurrences of a query in the transcript.

    The transcript is split into sentences on ``'. '`` (after newlines are
    replaced by spaces) and each sentence is checked for a case-insensitive
    substring match.

    Args:
        transcript_text: Full transcript text.
        query: Text to look for. An empty or whitespace-only query matches
            nothing.

    Returns:
        One dictionary per matching sentence with keys ``'index'`` (position
        in the sentence list), ``'sentence'`` (the stripped sentence) and
        ``'context'`` (the previous, matching and next sentences, as far as
        they exist).
    """
    # An empty string is a substring of everything; without this guard a
    # blank query would return every sentence as a "match".
    if not query.strip():
        return []

    query_lower = query.lower()
    sentences = transcript_text.replace('\n', ' ').split('. ')

    return [
        {
            'index': i,
            'sentence': sentence.strip(),
            'context': sentences[max(0, i - 1):i + 2],
        }
        for i, sentence in enumerate(sentences)
        if query_lower in sentence.lower()
    ]


# CLI interface
if __name__ == "__main__":
    # Usage: python youtube_extractor.py <youtube_url_or_id>
    # Prints a short report and up to the first 2000 characters of the
    # transcript; exits with status 1 on bad usage or extraction failure.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python youtube_extractor.py <youtube_url_or_id>")
        sys.exit(1)

    url = sys.argv[1]
    result = get_transcript(url)

    if result['success']:
        # A successful result always has a 'title' key, but its value is None
        # when no metadata was found, so a .get() default would never apply.
        print(f"Video: {result.get('title') or result['video_id']}")
        print(f"Language: {result['language']}")
        print(f"Duration: {result['duration']:.0f}s")
        print(f"Words: {result['word_count']}")
        print(f"Segments: {result['segment_count']}")
        print(f"\n{'='*60}\n")

        text = result['text']
        preview = text[:2000] + "..." if len(text) > 2000 else text
        print(preview)
    else:
        print(f"Error: {result['error']}")
        sys.exit(1)
