#!/usr/bin/env python3
"""
Multi-Level Transcript Extraction Orchestrator

Orchestrates transcript extraction across multiple methods with automatic
fallback, prioritizing free methods before paid APIs.

Priority Stack (in the order the orchestrator actually tries them):
0. Cache (FREE) - Previously extracted transcripts, consulted first when enabled
1. youtube-transcript-api (FREE) - Native YouTube captions
2. yt-dlp (FREE) - Download and parse subtitles
3. Supadata.ai (PAID) - AI-powered transcription with quota tracking

Usage:
    orchestrator = TranscriptOrchestrator()
    result = await orchestrator.get_transcript("VIDEO_ID")

Author: Genesis System
Version: 1.0.0
"""

import asyncio
import json
import logging
import os
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple

# Add genesis-system to path so project-local packages resolve
# (presumably where ``core.youtube.supadata_tracker`` lives - it is imported
# lazily by SupadataExtractor; confirm against the repository layout).
sys.path.insert(0, '/mnt/e/genesis-system')

# NOTE: executed at import time; basicConfig only installs a handler when the
# root logger has none yet, so an embedding application's config wins.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths
# GENESIS_ROOT is a hard-coded checkout location. CACHE_DIR stores one
# "<video_id>.json" file per cached transcript (plus the "ytdlp_subs"
# sub-directory used by YTDLPExtractor) and is created at import time.
GENESIS_ROOT = Path("/mnt/e/genesis-system")
CACHE_DIR = GENESIS_ROOT / "data" / "youtube" / "transcript_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)


class ExtractionMethod(Enum):
    """Transcript extraction methods in priority order.

    The string values are used as keys of ``TranscriptOrchestrator.stats``
    and are written into cache files as the ``method`` field.
    """
    YOUTUBE_TRANSCRIPT_API = "youtube_transcript_api"  # Level 1 - FREE
    YTDLP = "yt_dlp"                                   # Level 2 - FREE
    SUPADATA = "supadata"                              # Level 3 - PAID
    # Listed last here, but TranscriptOrchestrator.get_transcript consults the
    # cache *before* any other method when caching is enabled.
    CACHE = "cache"                                     # Level 4 - FREE (cached)


class ExtractionStatus(Enum):
    """Extraction result status."""
    # A transcript was produced.
    SUCCESS = "success"
    # No captions/subtitles found; also used for a cache miss and as the final
    # status when every extraction method has failed.
    NO_TRANSCRIPT = "no_transcript"
    # The Supadata quota tracker reported that no quota is left.
    QUOTA_EXHAUSTED = "quota_exhausted"
    # Supadata answered with HTTP 429.
    RATE_LIMITED = "rate_limited"
    # The youtube-transcript-api error indicated the video is unavailable.
    VIDEO_UNAVAILABLE = "video_unavailable"
    # Any other failure (missing dependency, HTTP error, parse error, ...).
    ERROR = "error"


@dataclass
class TranscriptSegment:
    """A segment of transcript with timing.

    Instances are serialized with ``dataclasses.asdict`` into the cache and
    rebuilt from those dicts, so the field names are part of the cache format.
    """
    # Caption text of this segment.
    text: str
    # Offset from the start of the video, in seconds.
    start: float
    # Length of the segment, in seconds.
    duration: float
    # Language code attached by the extractor that produced the segment.
    language: str = "en"


@dataclass
class TranscriptResult:
    """Result from transcript extraction.

    Returned by every extractor and by the orchestrator, for both successful
    and failed attempts; ``status`` tells which one it is.
    """
    # YouTube video ID the result refers to.
    video_id: str
    # Outcome of the attempt.
    status: ExtractionStatus
    # Extractor that produced the result; None when no extractor was reached.
    method: Optional[ExtractionMethod] = None
    # Optional pre-joined transcript text; takes precedence in ``full_text``.
    text: Optional[str] = None
    # Timed transcript segments (empty on failure).
    segments: List[TranscriptSegment] = field(default_factory=list)
    # Language code of the transcript.
    language: str = "en"
    # Whether the captions were machine generated (when the extractor knows).
    auto_generated: bool = False
    # Wall-clock time spent by the extractor, in milliseconds.
    extraction_time_ms: int = 0
    # Human-readable failure description; None on success.
    error: Optional[str] = None
    # UTC creation time as a naive ISO-8601 string with a trailing "Z".
    # datetime.utcnow() is deprecated since Python 3.12, so an aware "now" is
    # taken and its tzinfo dropped to keep exactly the same string format.
    extracted_at: str = field(
        default_factory=lambda: (
            datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
        )
    )
    # Free-form extractor-specific details.
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def full_text(self) -> str:
        """Return ``text`` when set, otherwise the segment texts joined by spaces."""
        if self.text:
            return self.text
        return " ".join(seg.text for seg in self.segments)


class YouTubeTranscriptAPIExtractor:
    """Level 1: youtube-transcript-api (FREE).

    Wraps the synchronous ``youtube_transcript_api`` package so that it can be
    awaited.  ``available`` is computed once, at construction time, by trying
    to import the package.
    """

    def __init__(self):
        self.available = self._check_available()

    def _check_available(self) -> bool:
        """Return True when ``youtube_transcript_api`` can be imported."""
        try:
            from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
        except ImportError:
            logger.warning("youtube-transcript-api not installed")
            return False
        return True

    @staticmethod
    def _fetch_sync(video_id: str, languages: List[str]):
        """Blocking fetch; returns ``(raw_segments, language_code, is_generated)``.

        ``raw_segments`` is a list of dicts with ``text``/``start``/``duration``.
        ``language_code`` and ``is_generated`` are None when the installed
        library version does not report them.
        """
        from youtube_transcript_api import YouTubeTranscriptApi

        # Classic static API: kept as the first choice so behaviour on the
        # library versions this module was written for is unchanged.
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            raw = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
            return raw, None, None

        # Newer releases replaced the static helper with an instance API.
        fetched = YouTubeTranscriptApi().fetch(video_id, languages=languages)
        return (
            fetched.to_raw_data(),
            getattr(fetched, "language_code", None),
            getattr(fetched, "is_generated", None),
        )

    @staticmethod
    def _classify_error(exc: Exception) -> ExtractionStatus:
        """Map a library exception to an ExtractionStatus.

        Exception types are checked first; the message-substring heuristic is
        only a fallback for errors whose class is not recognised.
        """
        try:
            from youtube_transcript_api._errors import (
                TranscriptsDisabled,
                NoTranscriptFound,
                VideoUnavailable
            )
        except ImportError:
            pass
        else:
            if isinstance(exc, (TranscriptsDisabled, NoTranscriptFound)):
                return ExtractionStatus.NO_TRANSCRIPT
            if isinstance(exc, VideoUnavailable):
                return ExtractionStatus.VIDEO_UNAVAILABLE

        error_str = str(exc).lower()
        if "disabled" in error_str or "no transcript" in error_str:
            return ExtractionStatus.NO_TRANSCRIPT
        if "unavailable" in error_str:
            return ExtractionStatus.VIDEO_UNAVAILABLE
        return ExtractionStatus.ERROR

    async def extract(
        self,
        video_id: str,
        languages: Optional[List[str]] = None
    ) -> TranscriptResult:
        """Extract transcript using youtube-transcript-api.

        Args:
            video_id: YouTube video ID.
            languages: Preferred language codes, best first (default ``["en"]``).

        Returns:
            A TranscriptResult; never raises for library errors - they are
            reported through ``status`` and ``error``.
        """
        if not self.available:
            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.ERROR,
                error="youtube-transcript-api not installed"
            )

        languages = languages or ["en"]
        start_time = time.time()

        def elapsed_ms() -> int:
            return int((time.time() - start_time) * 1000)

        try:
            # The library is synchronous: run it in the default thread pool.
            # get_running_loop() is the supported call inside a coroutine.
            loop = asyncio.get_running_loop()
            transcript_data, language_code, is_generated = await loop.run_in_executor(
                None,
                lambda: self._fetch_sync(video_id, languages)
            )

            # Fall back to the first requested language when the library does
            # not tell which transcript it actually returned.
            language = language_code or languages[0]

            segments = [
                TranscriptSegment(
                    text=seg["text"],
                    start=seg["start"],
                    duration=seg["duration"],
                    language=language
                )
                for seg in transcript_data
            ]

            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.SUCCESS,
                method=ExtractionMethod.YOUTUBE_TRANSCRIPT_API,
                segments=segments,
                language=language,
                auto_generated=bool(is_generated),
                extraction_time_ms=elapsed_ms()
            )

        except Exception as e:
            return TranscriptResult(
                video_id=video_id,
                status=self._classify_error(e),
                method=ExtractionMethod.YOUTUBE_TRANSCRIPT_API,
                error=str(e),
                extraction_time_ms=elapsed_ms()
            )


class YTDLPExtractor:
    """Level 2: yt-dlp subtitle download (FREE).

    Asks yt-dlp to write manual and automatic subtitles (no media download)
    into ``download_dir`` and parses the resulting file into segments.
    """

    # Subtitle file extensions probed after a download, in preference order.
    _SUBTITLE_EXTENSIONS = (".vtt", ".srt", ".json3")

    def __init__(self, download_dir: Optional[Path] = None):
        self.download_dir = download_dir or CACHE_DIR / "ytdlp_subs"
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self.available = self._check_available()

    def _check_available(self) -> bool:
        """Return True when ``yt_dlp`` can be imported."""
        try:
            import yt_dlp  # noqa: F401
        except ImportError:
            logger.warning("yt-dlp not installed")
            return False
        return True

    def _find_subtitle_file(
        self,
        video_id: str,
        languages: List[str]
    ) -> Optional[Tuple[Path, str]]:
        """Return ``(path, language)`` of the first existing subtitle file.

        Requested languages are tried in order, then ``"en"`` as a last resort.
        """
        for lang in list(languages) + ["en"]:
            for ext in self._SUBTITLE_EXTENSIONS:
                candidate = self.download_dir / f"{video_id}.{lang}{ext}"
                if candidate.exists():
                    return candidate, lang
        return None

    async def extract(
        self,
        video_id: str,
        languages: Optional[List[str]] = None
    ) -> TranscriptResult:
        """Extract transcript using yt-dlp.

        Args:
            video_id: YouTube video ID.
            languages: Preferred subtitle language codes (default ``["en"]``).

        Returns:
            A TranscriptResult. ``NO_TRANSCRIPT`` is returned both when no
            subtitle file was written and when the file yields no text, so an
            empty transcript is never reported (and cached) as a success.
        """
        if not self.available:
            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.ERROR,
                error="yt-dlp not installed"
            )

        languages = languages or ["en"]
        start_time = time.time()

        def elapsed_ms() -> int:
            return int((time.time() - start_time) * 1000)

        try:
            import yt_dlp

            url = f"https://www.youtube.com/watch?v={video_id}"
            output_template = str(self.download_dir / f"{video_id}.%(ext)s")

            ydl_opts = {
                "writesubtitles": True,
                "writeautomaticsub": True,
                "subtitleslangs": languages,
                "skip_download": True,
                "outtmpl": output_template,
                "quiet": True,
                "no_warnings": True
            }

            def download() -> None:
                # Whole yt-dlp session runs in the worker thread.
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    ydl.download([url])

            # get_running_loop() is the supported call inside a coroutine.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, download)

            found = self._find_subtitle_file(video_id, languages)
            if found is None:
                return TranscriptResult(
                    video_id=video_id,
                    status=ExtractionStatus.NO_TRANSCRIPT,
                    method=ExtractionMethod.YTDLP,
                    error="No subtitle file found",
                    extraction_time_ms=elapsed_ms()
                )

            # Label the result with the language of the file that was actually
            # found, which may be the "en" fallback rather than languages[0].
            sub_file, sub_language = found
            segments = self._parse_subtitle_file(sub_file, sub_language)

            if not segments:
                return TranscriptResult(
                    video_id=video_id,
                    status=ExtractionStatus.NO_TRANSCRIPT,
                    method=ExtractionMethod.YTDLP,
                    error=f"Subtitle file {sub_file.name} contained no usable text",
                    extraction_time_ms=elapsed_ms()
                )

            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.SUCCESS,
                method=ExtractionMethod.YTDLP,
                segments=segments,
                language=sub_language,
                extraction_time_ms=elapsed_ms()
            )

        except Exception as e:
            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.ERROR,
                method=ExtractionMethod.YTDLP,
                error=str(e),
                extraction_time_ms=elapsed_ms()
            )

    def _parse_subtitle_file(
        self,
        file_path: Path,
        language: str
    ) -> List[TranscriptSegment]:
        """Parse a VTT, SRT or json3 subtitle file into segments.

        Args:
            file_path: Subtitle file; the format is chosen by its suffix.
            language: Language code stored on every produced segment.

        Returns:
            The parsed segments; an empty list for an unknown suffix or a file
            without caption text.
        """
        import re

        segments: List[TranscriptSegment] = []
        # utf-8-sig drops a leading BOM, which would otherwise hide the WEBVTT
        # header; newlines are normalised so CRLF files parse like LF ones.
        content = file_path.read_text(encoding='utf-8-sig')
        content = content.replace('\r\n', '\n').replace('\r', '\n')
        suffix = file_path.suffix.lower()

        if suffix == ".vtt":
            # WebVTT allows the hours field to be omitted (MM:SS.mmm).
            timing = re.compile(r'((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})\s*-->')
            current_start = 0.0
            previous_text = None

            for raw_line in content.split('\n'):
                line = raw_line.strip()
                if not line or line == 'WEBVTT' or line.startswith('WEBVTT '):
                    continue
                if line.startswith(('Kind:', 'Language:')):
                    continue

                if '-->' in line:
                    ts_match = timing.match(line)
                    if ts_match:
                        current_start = self._parse_timestamp(ts_match.group(1))
                    continue

                # Numeric cue identifiers carry no text.
                if line.isdigit():
                    continue

                # Clean inline tags such as <c> and word-level timestamps.
                clean_text = re.sub(r'<[^>]+>', '', line).strip()

                # Skip consecutive repeats: rolling captions show the same
                # line in successive cues.
                if clean_text and clean_text != previous_text and len(clean_text) > 1:
                    segments.append(TranscriptSegment(
                        text=clean_text,
                        start=current_start,
                        # Fixed placeholder: cue end times are not used here.
                        duration=3.0,
                        language=language
                    ))
                    previous_text = clean_text

        elif suffix == ".srt":
            pattern = r'\d+\n(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?=\n\n|\Z)'

            for start_str, end_str, text in re.findall(pattern, content, re.DOTALL):
                start = self._parse_timestamp(start_str)
                end = self._parse_timestamp(end_str)

                if text.strip():
                    segments.append(TranscriptSegment(
                        text=text.strip(),
                        start=start,
                        duration=end - start,
                        language=language
                    ))

        elif suffix == ".json3":
            # NOTE(review): field names follow YouTube's json3 caption layout
            # ("events" -> tStartMs / dDurationMs / segs[].utf8); confirm
            # against a real download if this branch becomes important.
            payload = json.loads(content)
            events = payload.get("events", []) if isinstance(payload, dict) else []
            for event in events:
                pieces = event.get("segs") or []
                text = "".join(piece.get("utf8", "") for piece in pieces).strip()
                if not text:
                    continue
                segments.append(TranscriptSegment(
                    text=text,
                    start=(event.get("tStartMs") or 0) / 1000,
                    duration=(event.get("dDurationMs") or 0) / 1000,
                    language=language
                ))

        return segments

    def _parse_timestamp(self, ts: str) -> float:
        """Parse ``[HH:]MM:SS.mmm`` (or a bare seconds value) to seconds.

        A comma decimal separator, as used by SRT, is accepted as well.
        """
        parts = ts.strip().replace(',', '.').split(':')
        if len(parts) == 3:
            h, m, s = parts
            return float(h) * 3600 + float(m) * 60 + float(s)
        elif len(parts) == 2:
            m, s = parts
            return float(m) * 60 + float(s)
        return float(parts[0])


class SupadataExtractor:
    """Level 3: Supadata.ai API (PAID with quota tracking).

    ``available`` is True when an API key was found.  When the optional
    project quota tracker can be imported, quota is checked before each
    request and every request is recorded exactly once.
    """

    _ENDPOINT = "https://api.supadata.ai/v1/transcript"
    _KEY_NAME = "SUPADATA_API_KEY"

    def __init__(self):
        self.api_key = self._load_api_key()
        self.available = bool(self.api_key)
        self.quota_tracker = None

        if self.available:
            try:
                from core.youtube.supadata_tracker import SupadataQuotaTracker
                self.quota_tracker = SupadataQuotaTracker()
            except Exception as e:
                logger.warning(f"Quota tracker unavailable: {e}")

    def _load_api_key(self) -> Optional[str]:
        """Load the Supadata API key.

        The environment variable wins; otherwise the first
        ``SUPADATA_API_KEY=`` line of ``config/secrets.env`` is used.
        Surrounding whitespace, an ``export`` prefix and matching quotes
        around the value are tolerated.

        Returns:
            The key, or None when it is not configured.
        """
        api_key = os.environ.get(self._KEY_NAME)
        if api_key:
            return api_key

        secrets_path = GENESIS_ROOT / "config" / "secrets.env"
        if not secrets_path.exists():
            return None

        prefix = f"{self._KEY_NAME}="
        try:
            with open(secrets_path, encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if line.startswith("export "):
                        line = line[len("export "):].lstrip()
                    if line.startswith(prefix):
                        value = line[len(prefix):].strip()
                        # Drop one pair of matching surrounding quotes.
                        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                            value = value[1:-1]
                        return value or None
        except OSError as e:
            # An unreadable secrets file means "not configured", not a crash.
            logger.warning(f"Could not read {secrets_path}: {e}")

        return None

    @staticmethod
    def _parse_payload(data: Any) -> Tuple[List[TranscriptSegment], str]:
        """Convert a decoded Supadata response into ``(segments, language)``.

        ``content`` may be a list of timed chunks (offsets in milliseconds)
        or a plain string; a top-level ``text`` field is the last fallback.

        Raises:
            ValueError: when the response is not a JSON object.
        """
        if not isinstance(data, dict):
            raise ValueError("Unexpected Supadata response format")

        language = data.get("lang") or "en"
        content = data.get("content")

        if isinstance(content, list):
            segments = [
                TranscriptSegment(
                    text=seg.get("text") or "",
                    start=(seg.get("offset") or 0) / 1000,  # Convert ms to seconds
                    duration=(seg.get("duration") or 0) / 1000,
                    language=language
                )
                for seg in content
                if isinstance(seg, dict)
            ]
        else:
            # Text-only response: one untimed segment.
            text = content if isinstance(content, str) else data.get("text")
            segments = [
                TranscriptSegment(
                    text=text or "",
                    start=0,
                    duration=0,
                    language=language
                )
            ]

        return segments, language

    async def extract(
        self,
        video_id: str,
        languages: Optional[List[str]] = None
    ) -> TranscriptResult:
        """Extract transcript using Supadata.ai.

        Args:
            video_id: YouTube video ID.
            languages: Preferred language codes; only the first is sent
                (default ``["en"]``).

        Returns:
            A TranscriptResult. HTTP 429 maps to ``RATE_LIMITED``, any other
            non-200 answer to ``ERROR``, an exhausted quota to
            ``QUOTA_EXHAUSTED`` (no request is made), and a response without
            any text to ``NO_TRANSCRIPT``.
        """
        if not self.available:
            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.ERROR,
                error="Supadata API key not configured"
            )

        # Check quota before spending a paid request.
        if self.quota_tracker and not self.quota_tracker.is_quota_available():
            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.QUOTA_EXHAUSTED,
                method=ExtractionMethod.SUPADATA,
                error="Supadata quota exhausted for this month"
            )

        languages = languages or ["en"]
        start_time = time.time()
        # Guards against recording one request twice (once after the HTTP
        # call and again from the exception handler if parsing fails).
        usage_recorded = False

        def elapsed_ms() -> int:
            return int((time.time() - start_time) * 1000)

        try:
            import requests

            headers = {
                "x-api-key": self.api_key,
                "Content-Type": "application/json"
            }

            params = {
                "url": f"https://www.youtube.com/watch?v={video_id}",
                "mode": "auto",
                "lang": languages[0]
            }

            # requests is blocking: run it in the thread pool so that other
            # coroutines (e.g. a concurrent batch) keep making progress.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                lambda: requests.get(
                    self._ENDPOINT,
                    headers=headers,
                    params=params,
                    timeout=60
                )
            )

            # Record usage
            if self.quota_tracker:
                usage_recorded = True
                self.quota_tracker.record_usage(
                    video_id,
                    status="success" if response.status_code == 200 else "failed"
                )

            if response.status_code == 429:
                return TranscriptResult(
                    video_id=video_id,
                    status=ExtractionStatus.RATE_LIMITED,
                    method=ExtractionMethod.SUPADATA,
                    error="Supadata rate limited",
                    extraction_time_ms=elapsed_ms()
                )

            if response.status_code != 200:
                return TranscriptResult(
                    video_id=video_id,
                    status=ExtractionStatus.ERROR,
                    method=ExtractionMethod.SUPADATA,
                    error=f"Supadata API error: {response.status_code}",
                    extraction_time_ms=elapsed_ms()
                )

            segments, language = self._parse_payload(response.json())

            # An all-empty transcript must not be reported (and cached) as a
            # success.
            if not any(seg.text.strip() for seg in segments):
                return TranscriptResult(
                    video_id=video_id,
                    status=ExtractionStatus.NO_TRANSCRIPT,
                    method=ExtractionMethod.SUPADATA,
                    error="Supadata returned an empty transcript",
                    extraction_time_ms=elapsed_ms()
                )

            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.SUCCESS,
                method=ExtractionMethod.SUPADATA,
                segments=segments,
                language=language,
                extraction_time_ms=elapsed_ms(),
                metadata={"supadata_response": True}
            )

        except Exception as e:
            if self.quota_tracker and not usage_recorded:
                self.quota_tracker.record_usage(video_id, status="error")

            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.ERROR,
                method=ExtractionMethod.SUPADATA,
                error=str(e),
                extraction_time_ms=elapsed_ms()
            )


class CacheExtractor:
    """Level 4: Cached transcripts (FREE).

    Each transcript is stored as ``<cache_dir>/<video_id>.json``.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        self.cache_dir = cache_dir or CACHE_DIR
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _cache_path(self, video_id: str) -> Optional[Path]:
        """Return the cache file for *video_id*.

        Returns None when the id is empty or is not a plain file name
        (contains a path separator or is ``.``/``..``), so a crafted id can
        never address a file outside ``cache_dir``.
        """
        if not video_id or video_id in (".", "..") or Path(video_id).name != video_id:
            return None
        return self.cache_dir / f"{video_id}.json"

    async def extract(
        self,
        video_id: str,
        languages: Optional[List[str]] = None
    ) -> TranscriptResult:
        """Get transcript from cache.

        Args:
            video_id: YouTube video ID.
            languages: Accepted for interface parity with the other
                extractors; the lookup is by video ID only, so a cached
                transcript is returned whatever its language.

        Returns:
            ``SUCCESS`` with the cached segments, ``NO_TRANSCRIPT`` on a miss,
            or ``ERROR`` for an invalid id or an unreadable cache file.
        """
        cache_file = self._cache_path(video_id)
        if cache_file is None:
            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.ERROR,
                method=ExtractionMethod.CACHE,
                error="Invalid video id for cache lookup"
            )

        if not cache_file.exists():
            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.NO_TRANSCRIPT,
                method=ExtractionMethod.CACHE,
                error="Not in cache"
            )

        try:
            with open(cache_file, encoding='utf-8') as f:
                data = json.load(f)

            segments = [
                TranscriptSegment(**seg)
                for seg in data.get("segments", [])
            ]

            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.SUCCESS,
                method=ExtractionMethod.CACHE,
                segments=segments,
                language=data.get("language", "en"),
                metadata={
                    "cached": True,
                    "cached_at": data.get("cached_at"),
                    # Extractor that originally produced the transcript.
                    "source_method": data.get("method")
                }
            )

        except Exception as e:
            return TranscriptResult(
                video_id=video_id,
                status=ExtractionStatus.ERROR,
                method=ExtractionMethod.CACHE,
                error=str(e)
            )

    def save_to_cache(self, result: TranscriptResult) -> bool:
        """Save a successful transcript to the cache.

        The file is written to a temporary sibling and moved into place with
        ``os.replace`` so that readers never see a half-written JSON file.

        Returns:
            True when the transcript was stored; False for a non-successful
            result, an invalid video id, or a failed write (which is logged).
        """
        if result.status != ExtractionStatus.SUCCESS:
            return False

        cache_file = self._cache_path(result.video_id)
        if cache_file is None:
            logger.error(f"Cache save failed: invalid video id {result.video_id!r}")
            return False

        tmp_file = cache_file.with_name(f"{cache_file.name}.{os.getpid()}.tmp")
        try:
            data = {
                "video_id": result.video_id,
                "language": result.language,
                "method": result.method.value if result.method else None,
                "segments": [asdict(seg) for seg in result.segments],
                # Same naive-UTC "...Z" format as before, without the
                # deprecated datetime.utcnow().
                "cached_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
            }
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            os.replace(tmp_file, cache_file)
            return True
        except Exception as e:
            logger.error(f"Cache save failed: {e}")
            # Best-effort removal of a leftover temporary file.
            try:
                tmp_file.unlink()
            except OSError:
                pass
            return False


class TranscriptOrchestrator:
    """
    Multi-level transcript extraction orchestrator.

    Tries sources in this order and returns the first success:
    0. Cache (previously extracted), when ``use_cache`` is set
    1. youtube-transcript-api (FREE)
    2. yt-dlp (FREE)
    3. Supadata.ai (PAID), when ``use_supadata`` is set

    Extractors whose dependency or API key is missing are skipped.
    """

    def __init__(
        self,
        use_cache: bool = True,
        save_to_cache: bool = True,
        use_supadata: bool = True
    ):
        self.use_cache = use_cache
        self.save_to_cache = save_to_cache
        self.use_supadata = use_supadata

        # Initialize extractors
        self.cache = CacheExtractor()
        self.youtube_api = YouTubeTranscriptAPIExtractor()
        self.ytdlp = YTDLPExtractor()
        self.supadata = SupadataExtractor() if use_supadata else None

        # Per-method counters, keyed by ExtractionMethod value.
        self.stats = {method.value: {"attempts": 0, "successes": 0} for method in ExtractionMethod}

    async def get_transcript(
        self,
        video_id: str,
        languages: Optional[List[str]] = None
    ) -> TranscriptResult:
        """
        Get transcript using multi-level fallback.

        Args:
            video_id: YouTube video ID
            languages: Preferred languages (default: ["en"])

        Returns:
            TranscriptResult with transcript or error info. When every
            method fails the status is ``NO_TRANSCRIPT`` and
            ``metadata["failures"]`` maps each attempted method to its
            ``status`` and ``error``.
        """
        languages = languages or ["en"]
        start_time = time.time()

        # Level 0: Check cache first. Every lookup counts as an attempt so
        # that the cache hit rate can be derived from the stats.
        if self.use_cache:
            cache_stats = self.stats[ExtractionMethod.CACHE.value]
            cache_stats["attempts"] += 1
            result = await self.cache.extract(video_id, languages)
            if result.status == ExtractionStatus.SUCCESS:
                cache_stats["successes"] += 1
                logger.info(f"[{video_id}] Cache hit")
                return result
            if result.status == ExtractionStatus.ERROR:
                logger.debug(f"[{video_id}] Cache read failed: {result.error}")

        # Levels 1-3 share one code path: (method, log label, extractor).
        candidates = [
            (ExtractionMethod.YOUTUBE_TRANSCRIPT_API, "youtube-transcript-api", self.youtube_api),
            (ExtractionMethod.YTDLP, "yt-dlp", self.ytdlp),
        ]
        if self.use_supadata and self.supadata:
            candidates.append((ExtractionMethod.SUPADATA, "Supadata", self.supadata))

        failures: Dict[str, Dict[str, Optional[str]]] = {}

        for method, label, extractor in candidates:
            if not extractor.available:
                continue

            method_stats = self.stats[method.value]
            method_stats["attempts"] += 1
            result = await extractor.extract(video_id, languages)

            if result.status == ExtractionStatus.SUCCESS:
                method_stats["successes"] += 1
                logger.info(f"[{video_id}] {label} success")
                if self.save_to_cache:
                    self.cache.save_to_cache(result)
                return result

            failures[method.value] = {
                "status": result.status.value,
                "error": result.error
            }
            if result.status == ExtractionStatus.QUOTA_EXHAUSTED:
                logger.warning(f"[{video_id}] {label} quota exhausted")
            else:
                logger.debug(f"[{video_id}] {label} failed: {result.error}")

        # All methods failed
        return TranscriptResult(
            video_id=video_id,
            status=ExtractionStatus.NO_TRANSCRIPT,
            error="All extraction methods failed",
            extraction_time_ms=int((time.time() - start_time) * 1000),
            metadata={"failures": failures}
        )

    async def get_batch_transcripts(
        self,
        video_ids: List[str],
        concurrency: int = 3,
        **kwargs
    ) -> Dict[str, TranscriptResult]:
        """Get transcripts for multiple videos.

        Videos are processed in consecutive batches of ``concurrency``
        concurrent requests, with a 0.5 s pause between batches.

        Args:
            video_ids: YouTube video IDs.
            concurrency: Batch size; must be at least 1.
            **kwargs: Forwarded to ``get_transcript`` (e.g. ``languages``).

        Returns:
            Mapping of video ID to its TranscriptResult.

        Raises:
            ValueError: if ``concurrency`` is smaller than 1.
        """
        if concurrency < 1:
            raise ValueError("concurrency must be at least 1")

        results: Dict[str, TranscriptResult] = {}

        for offset in range(0, len(video_ids), concurrency):
            # Rate limiting between batches (not after the last one).
            if offset:
                await asyncio.sleep(0.5)

            batch = video_ids[offset:offset + concurrency]
            batch_results = await asyncio.gather(
                *(self.get_transcript(vid, **kwargs) for vid in batch)
            )
            results.update(zip(batch, batch_results))

        return results

    def get_stats(self) -> Dict[str, Any]:
        """Get extraction statistics.

        Returns:
            Per-method counters, their totals, whether Supadata is available,
            and the quota tracker's ``check_quota()`` result (None when there
            is no tracker).
        """
        tracker = self.supadata.quota_tracker if self.supadata else None
        return {
            "by_method": self.stats,
            "totals": {
                "attempts": sum(s["attempts"] for s in self.stats.values()),
                "successes": sum(s["successes"] for s in self.stats.values())
            },
            "supadata_available": self.supadata.available if self.supadata else False,
            "supadata_quota": tracker.check_quota() if tracker else None
        }


async def main() -> int:
    """CLI for transcript orchestrator.

    Extracts the transcript of one video, prints a short report and, with
    ``--stats``, the orchestrator statistics.

    Returns:
        Process exit code: 0 when a transcript was extracted, 1 otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Multi-level Transcript Extractor")
    parser.add_argument("video_id", help="YouTube video ID")
    parser.add_argument("--lang", default="en", help="Language code")
    parser.add_argument("--no-supadata", action="store_true", help="Disable Supadata fallback")
    parser.add_argument("--no-cache", action="store_true", help="Disable cache")
    parser.add_argument("--stats", action="store_true", help="Show stats after extraction")

    args = parser.parse_args()

    orchestrator = TranscriptOrchestrator(
        use_cache=not args.no_cache,
        use_supadata=not args.no_supadata
    )

    result = await orchestrator.get_transcript(args.video_id, languages=[args.lang])
    succeeded = result.status == ExtractionStatus.SUCCESS

    print(f"\n{'='*60}")
    print(f"Transcript Extraction: {args.video_id}")
    print(f"{'='*60}")
    print(f"Status:  {result.status.value}")
    print(f"Method:  {result.method.value if result.method else 'N/A'}")
    print(f"Time:    {result.extraction_time_ms}ms")

    if succeeded:
        preview = result.full_text
        # Only mark the preview as truncated when it really is.
        if len(preview) > 300:
            preview = preview[:300] + "..."
        print(f"Segments: {len(result.segments)}")
        print("\nPreview:")
        print(f"  {preview}")
    elif result.error:
        print(f"Error:   {result.error}")

    if args.stats:
        print(f"\n{'='*60}")
        print("Statistics")
        print(f"{'='*60}")
        stats = orchestrator.get_stats()
        print(json.dumps(stats, indent=2))

    return 0 if succeeded else 1


if __name__ == "__main__":
    # Propagate failure to the shell so scripts can react to it.
    sys.exit(asyncio.run(main()))
