#!/usr/bin/env python3
"""
YOUTUBE TRANSCRIPT RWL - Unfailing Extraction Mastery
=====================================================
Ralph Wiggum Loop for YouTube transcripts with 6 fallback approaches.

Approaches (in order):
1. youtube-transcript-api (free, needs captions)
2. yt-dlp subtitle extraction (free, needs captions)
3. Supadata API (paid, AI transcription - works without captions)
4. Whisper local transcription (free, needs audio download)
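5. Browser automation with Glasp (free, manual fallback) - not automated in
   this module
6. Manual queue for human processing - videos that fail approaches 1-4 are
   listed under "failed_videos" in _extraction_summary.json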

Usage:
    rwl = YouTubeTranscriptRWL(supadata_api_key="optional")
    results = rwl.extract_all(video_ids)
"""

import html
import json
import os
import re
import shutil
import subprocess
import tempfile
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional

# Optional third-party dependencies. Each import is attempted once at module
# load and its outcome recorded in a boolean flag, so a missing package does
# not prevent the module from being imported.
try:
    from youtube_transcript_api import YouTubeTranscriptApi
    YT_API_AVAILABLE = True
except ImportError:
    # Without the package, YouTubeTranscriptRWL sets its yt_api attribute to None.
    YT_API_AVAILABLE = False

try:
    import requests
    REQUESTS_AVAILABLE = True
except ImportError:
    # The Supadata approach checks this flag before issuing any HTTP request.
    REQUESTS_AVAILABLE = False


def _iso_now() -> str:
    """Return the current local time formatted as an ISO-8601 string."""
    return datetime.now().isoformat()


@dataclass
class TranscriptResult:
    """Outcome of one transcript extraction attempt for a single video."""

    # Identification and overall status (required).
    video_id: str
    success: bool
    # Transcript payload; left at the defaults when extraction failed.
    text: str = ""
    word_count: int = 0
    # Name of the approach that produced (or terminated) this result.
    approach_used: str = ""
    # Human-readable failure description; empty on success.
    error: str = ""
    # Creation time of this record, filled in automatically.
    timestamp: str = field(default_factory=_iso_now)


class YouTubeTranscriptRWL:
    """
    Ralph Wiggum Loop for YouTube transcript extraction.

    For every video the automated approaches are tried in order
    (youtube-transcript-api, yt-dlp subtitles, Supadata API, local Whisper)
    and the first usable transcript wins. Videos for which every approach
    fails are reported as failed so they can be processed manually.
    """

    # A transcript must be longer than this many characters (after stripping)
    # to count as a successful extraction.
    _MIN_TRANSCRIPT_CHARS = 50

    def __init__(self,
                 supadata_api_key: Optional[str] = None,
                 output_dir: Optional[Path] = None):
        """
        Args:
            supadata_api_key: Supadata key; falls back to the
                SUPADATA_API_KEY environment variable when omitted.
            output_dir: Directory for transcript/summary JSON files. Created
                if missing. A plain string path is accepted as well.
        """
        self.supadata_api_key = supadata_api_key or os.getenv("SUPADATA_API_KEY")
        # Path() makes a str argument work too; Path objects pass through.
        self.output_dir = Path(output_dir or "/mnt/e/genesis-system/data/transcripts/extracted")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.results: List["TranscriptResult"] = []
        self.stats = {
            "total": 0,
            "success": 0,
            "failed": 0,
            "by_approach": {}
        }

        # Initialize API if available
        self.yt_api = YouTubeTranscriptApi() if YT_API_AVAILABLE else None

    def extract_all(self, video_ids: List[str]) -> Dict[str, Any]:
        """
        Extract transcripts for all videos using RWL pattern.

        Successful transcripts are written to ``output_dir`` as
        ``<video_id>.json`` and a summary is written to
        ``_extraction_summary.json``. Results and statistics accumulate
        across calls on the same instance.

        Returns:
            The summary dictionary produced by ``_generate_summary``.
        """
        print("=" * 60)
        print("YOUTUBE TRANSCRIPT RWL - Unfailing Extraction")
        print("=" * 60)
        print(f"Processing {len(video_ids)} videos...")
        print()

        # Accumulate rather than overwrite: "success", "failed" and
        # self.results all accumulate, so "total" must as well or the
        # summary becomes inconsistent on a second call.
        self.stats["total"] += len(video_ids)

        for position, vid in enumerate(video_ids, start=1):
            # flush so the progress prefix is visible while extraction runs
            print(f"[{position}/{len(video_ids)}] {vid}...", end=" ", flush=True)
            result = self._extract_single(vid)
            self.results.append(result)

            if result.success:
                self.stats["success"] += 1
                approach = result.approach_used
                self.stats["by_approach"][approach] = self.stats["by_approach"].get(approach, 0) + 1
                print(f"✓ {result.word_count} words via {approach}")

                # Save to file
                self._save_transcript(result)
            else:
                self.stats["failed"] += 1
                print(f"✗ {result.error[:40]}")

        # Generate summary
        summary = self._generate_summary()
        self._save_summary(summary)

        print()
        print("=" * 60)
        print(f"Complete: {self.stats['success']}/{self.stats['total']} successful")
        print(f"Approaches used: {self.stats['by_approach']}")
        print("=" * 60)

        return summary

    def _extract_single(self, video_id: str) -> "TranscriptResult":
        """
        Extract transcript for single video, trying all approaches.

        Returns a successful result from the first approach that yields more
        than ``_MIN_TRANSCRIPT_CHARS`` characters; otherwise a failed result
        whose ``error`` lists why each approach was rejected.
        """
        approaches = [
            ("youtube-transcript-api", self._approach_yt_api),
            ("yt-dlp", self._approach_ytdlp),
            ("supadata", self._approach_supadata),
            ("whisper-local", self._approach_whisper),
        ]

        failures: List[str] = []
        for approach_name, approach_fn in approaches:
            try:
                text = approach_fn(video_id)
            except Exception as exc:
                # Deliberately broad: any failure just means "try the next
                # approach", but keep the reason for the final report.
                failures.append(f"{approach_name}: {exc}")
                continue

            if text and len(text.strip()) > self._MIN_TRANSCRIPT_CHARS:
                return TranscriptResult(
                    video_id=video_id,
                    success=True,
                    text=text,
                    word_count=len(text.split()),
                    approach_used=approach_name
                )
            failures.append(f"{approach_name}: transcript empty or too short")

        # All approaches failed - queue for manual
        error = "All automated approaches failed - queued for manual extraction"
        if failures:
            error += " (" + "; ".join(failures) + ")"
        return TranscriptResult(
            video_id=video_id,
            success=False,
            error=error,
            approach_used="manual_queue"
        )

    @staticmethod
    def _failure_reason(proc: Any) -> str:
        """Return the last stderr line of a finished process, or its exit code."""
        stderr_lines = (proc.stderr or "").strip().splitlines()
        return stderr_lines[-1] if stderr_lines else f"exit code {proc.returncode}"

    def _approach_yt_api(self, video_id: str) -> Optional[str]:
        """Approach 1: youtube-transcript-api (free).

        Raises:
            RuntimeError: if the library is not installed.
        """
        if not self.yt_api:
            raise RuntimeError("youtube-transcript-api not installed")

        transcript = self.yt_api.fetch(video_id)
        return " ".join(seg.text for seg in transcript)

    def _approach_ytdlp(self, video_id: str) -> Optional[str]:
        """Approach 2: yt-dlp subtitle extraction.

        Requests English subtitles in VTT format (uploaded captions as well
        as auto-generated ones) without downloading the video.

        Raises:
            RuntimeError: if yt-dlp produced no subtitle file.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)

            cmd = [
                "yt-dlp",
                "--skip-download",
                "--write-sub",
                "--write-auto-sub",
                "--sub-lang", "en",
                "--sub-format", "vtt",
                "-o", str(workdir / video_id),
                f"https://youtube.com/watch?v={video_id}"
            ]

            proc = subprocess.run(cmd, capture_output=True, text=True,
                                  errors="replace", timeout=60)

            # Prefer the expected name, but accept any VTT file yt-dlp wrote
            # (the language suffix is not always exactly "en").
            vtt_file = workdir / f"{video_id}.en.vtt"
            if not vtt_file.exists():
                candidates = sorted(workdir.glob("*.vtt"))
                if not candidates:
                    raise RuntimeError(
                        f"No subtitle file generated ({self._failure_reason(proc)})"
                    )
                vtt_file = candidates[0]

            return self._parse_vtt(vtt_file.read_text(encoding="utf-8", errors="replace"))

    @staticmethod
    def _supadata_text(payload: Any) -> str:
        """Pull the transcript text out of a decoded Supadata JSON response.

        Accepts a dict whose "content" or "transcript" value is either a list
        of ``{"text": ...}`` segments or a plain string, or a bare list of
        segments.

        Raises:
            ValueError: if the payload has none of those shapes, so that an
                error payload is never mistaken for transcript text.
        """
        def join_segments(segments: list) -> str:
            return " ".join(seg.get("text", "") for seg in segments if isinstance(seg, dict))

        if isinstance(payload, list):
            return join_segments(payload)
        if isinstance(payload, dict):
            for key in ("content", "transcript"):
                value = payload.get(key)
                if isinstance(value, list):
                    return join_segments(value)
                if isinstance(value, str):
                    return value
        raise ValueError("Unrecognized Supadata response format")

    def _approach_supadata(self, video_id: str) -> Optional[str]:
        """Approach 3: Supadata API (free 100/mo, $9/mo for 1000).

        Raises:
            RuntimeError: if no API key is configured or ``requests`` is
                not installed.
            ValueError: if the response does not contain a transcript.
        """
        if not self.supadata_api_key:
            raise RuntimeError("Supadata API key not configured")

        if not REQUESTS_AVAILABLE:
            raise RuntimeError("requests library not available")

        # Supadata API endpoint
        url = "https://api.supadata.ai/v1/youtube/transcript"
        headers = {
            "x-api-key": self.supadata_api_key,
        }
        params = {"videoId": video_id}

        response = requests.get(url, headers=headers, params=params, timeout=120)
        response.raise_for_status()

        return self._supadata_text(response.json())

    def _approach_whisper(self, video_id: str) -> Optional[str]:
        """Approach 4: Download audio and transcribe with Whisper.

        Raises:
            RuntimeError: if the ``whisper`` executable is missing, the audio
                download fails, or Whisper writes no transcript file.
        """
        # shutil.which is portable; spawning `which` is not.
        if shutil.which("whisper") is None:
            raise RuntimeError("Whisper not installed")

        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            audio_path = workdir / f"{video_id}.mp3"

            # Download audio. The %(ext)s template lets yt-dlp name the
            # intermediate download itself; after extraction the file ends
            # up as <video_id>.mp3.
            download_cmd = [
                "yt-dlp",
                "-x", "--audio-format", "mp3",
                "-o", str(workdir / f"{video_id}.%(ext)s"),
                f"https://youtube.com/watch?v={video_id}"
            ]
            download = subprocess.run(download_cmd, capture_output=True, text=True,
                                      errors="replace", timeout=300)

            if not audio_path.exists():
                raise RuntimeError(f"Audio download failed ({self._failure_reason(download)})")

            # Transcribe with Whisper; it writes <audio stem>.txt to output_dir
            whisper_cmd = [
                "whisper", str(audio_path),
                "--model", "base",
                "--output_dir", str(workdir),
                "--output_format", "txt"
            ]
            transcribe = subprocess.run(whisper_cmd, capture_output=True, text=True,
                                        errors="replace", timeout=600)

            txt_file = workdir / f"{video_id}.txt"
            if txt_file.exists():
                return txt_file.read_text(encoding="utf-8", errors="replace")

            raise RuntimeError(
                f"Whisper transcription failed ({self._failure_reason(transcribe)})"
            )

    def _parse_vtt(self, vtt_content: str) -> str:
        """Parse VTT subtitle file to plain text.

        Drops the WEBVTT header block (including metadata lines such as
        "Kind:" and "Language:"), cue timings and numeric cue identifiers,
        strips inline tags, decodes HTML entities, and collapses consecutive
        repeated lines (auto-generated captions repeat each line across
        neighbouring cues). Lines that recur later in the video are kept.
        """
        text_lines: List[str] = []
        in_header = False

        for raw_line in vtt_content.lstrip("\ufeff").splitlines():
            line = raw_line.strip()
            # A blank line or a cue timing line ends the header block.
            if not line or "-->" in line:
                in_header = False
                continue
            if line.startswith("WEBVTT"):
                in_header = True
                continue
            # Skip header metadata and numeric cue identifiers
            if in_header or line.isdigit():
                continue
            # Remove inline tags, then decode entities such as &amp;
            clean = html.unescape(re.sub(r'<[^>]+>', '', line)).strip()
            if clean and (not text_lines or text_lines[-1] != clean):
                text_lines.append(clean)

        return ' '.join(text_lines)

    def _save_transcript(self, result: "TranscriptResult") -> None:
        """Save successful transcript to ``<output_dir>/<video_id>.json``."""
        output_file = self.output_dir / f"{result.video_id}.json"
        # Explicit UTF-8 so non-ASCII transcripts are stored readably and
        # independently of the platform's default encoding.
        with open(output_file, 'w', encoding="utf-8") as f:
            json.dump({
                "video_id": result.video_id,
                "url": f"https://youtube.com/watch?v={result.video_id}",
                "text": result.text,
                "word_count": result.word_count,
                "approach": result.approach_used,
                "extracted_at": result.timestamp
            }, f, indent=2, ensure_ascii=False)

    def _generate_summary(self) -> Dict[str, Any]:
        """Generate extraction summary from the accumulated stats and results."""
        return {
            "extraction_date": datetime.now().isoformat(),
            "total_videos": self.stats["total"],
            "successful": self.stats["success"],
            "failed": self.stats["failed"],
            # max(..., 1) avoids division by zero when nothing was processed
            "success_rate": self.stats["success"] / max(self.stats["total"], 1) * 100,
            "approaches_used": self.stats["by_approach"],
            "total_words": sum(r.word_count for r in self.results if r.success),
            "failed_videos": [r.video_id for r in self.results if not r.success]
        }

    def _save_summary(self, summary: Dict) -> None:
        """Save extraction summary to ``<output_dir>/_extraction_summary.json``."""
        summary_file = self.output_dir / "_extraction_summary.json"
        with open(summary_file, 'w', encoding="utf-8") as f:
            json.dump(summary, f, indent=2)


def main():
    """Extract transcripts for the hard-coded Jan 13th batch and return the summary."""
    # Jan 13th video IDs
    batch = [
        "fmNYIc3tSG8", "rLu_3hpG0b8", "U2iFkardx9g",
        "9I6zz42Rt5E", "XP6TJRbOdGo", "RB7R8vIuPAQ",
        "yDKpF6CjBUo", "8JW1vEdLswk", "quVoGwHZdYI",
        "VEiumna7MTM", "mzYg2IHBUWI", "yVMWTaYSq-4",
        "M4W5rxVwnUw", "4AfrNd7Gq1E", "o0ASly3vg7k",
        "1choivCbZDg", "Wk1ZGlxShUk", "xIvxaXs8X9s",
        "OntPk_eRljo", "Y62dSZcLm94", "MsQACpcuTkU",
        "G-5bInklwRQ", "yMJcHcCbgi4", "5BfcR6erPyo",
    ]

    # Without a Supadata key the paid AI fallback is skipped; tell the user.
    api_key = os.getenv("SUPADATA_API_KEY")
    if not api_key:
        notice = (
            "Note: SUPADATA_API_KEY not set. AI transcription fallback unavailable.",
            "Get free API key at: https://supadata.ai",
            "",
        )
        for line in notice:
            print(line)

    destination = Path("/mnt/e/genesis-system/data/transcripts/extracted_jan13")
    extractor = YouTubeTranscriptRWL(supadata_api_key=api_key, output_dir=destination)
    return extractor.extract_all(batch)


# Run the batch extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
