#!/usr/bin/env python3
"""
GENESIS YOUTUBE TRANSCRIPT SYSTEM
===================================
Advanced YouTube MCP server access with failsafe multi-leveled deeply integrated
transcript capture for emerging tech and revenue strategy insights research.

Multi-Level Architecture:
    Level 1: MCP YouTube Server - Direct API integration
    Level 2: youtube-transcript-api - Python library
    Level 3: yt-dlp subtitles - Download and parse
    Level 4: Browser extraction - Playwright/Selenium
    Level 5: Whisper transcription - Audio extraction + STT
    Level 6: Manual captions cache - Previously extracted

Features:
    - Automatic level escalation on failure
    - Multi-language transcript support
    - Timestamp preservation
    - Batch processing for playlists/channels
    - Research indexing and search
    - Pattern detection across videos
    - Knowledge extraction and summarization

Usage:
    yt = YouTubeTranscripts()
    result = await yt.get_transcript("VIDEO_ID")
    insights = yt.extract_insights(result.transcript)
"""

import json
import asyncio
import hashlib
import re
import os
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Generator
from enum import Enum
from urllib.parse import urlparse, parse_qs
import logging


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class TranscriptLevel(Enum):
    """Transcript extraction levels (priority order).

    Lower values are tried first; the orchestrator escalates to the
    next level when a backend fails or is unavailable.
    """
    MCP_SERVER = 1       # MCP YouTube server (HTTP API)
    TRANSCRIPT_API = 2   # youtube-transcript-api Python library
    YTDLP_SUBS = 3       # yt-dlp subtitle download + parse
    BROWSER = 4          # Browser extraction (no backend in this module yet)
    WHISPER = 5          # Audio transcription (no backend in this module yet)
    CACHE = 6            # Previously cached transcripts (last resort)


class TranscriptStatus(Enum):
    """Transcript extraction status for a single attempt."""
    SUCCESS = "success"                      # transcript obtained
    NO_TRANSCRIPT = "no_transcript"          # video has no (matching) transcript
    VIDEO_UNAVAILABLE = "video_unavailable"  # video deleted/blocked
    PRIVATE = "private"                      # private video
    AGE_RESTRICTED = "age_restricted"        # age gate blocks extraction
    RATE_LIMITED = "rate_limited"            # provider throttled us
    ERROR = "error"                          # any other failure


@dataclass
class TranscriptSegment:
    """A segment of transcript with timing."""
    text: str
    start: float  # Start time in seconds
    duration: float  # Duration in seconds
    language: str = "en"  # language code, e.g. "en"
    auto_generated: bool = False  # True for auto-generated captions


@dataclass
class VideoTranscript:
    """Complete transcript for a single video, plus extraction metadata."""
    video_id: str
    title: Optional[str] = None
    channel: Optional[str] = None
    duration_seconds: int = 0
    segments: List[TranscriptSegment] = field(default_factory=list)
    language: str = "en"
    auto_generated: bool = False
    level_used: TranscriptLevel = TranscriptLevel.CACHE
    extracted_at: str = field(default_factory=lambda: datetime.now().isoformat())
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def full_text(self) -> str:
        """All segment texts joined into one space-separated string."""
        pieces = [segment.text for segment in self.segments]
        return " ".join(pieces)

    def get_text_at_time(self, seconds: float) -> Optional[str]:
        """Return the text of the segment covering *seconds*, or None.

        A segment covers the half-open interval [start, start + duration).
        """
        return next(
            (segment.text for segment in self.segments
             if segment.start <= seconds < segment.start + segment.duration),
            None,
        )


@dataclass
class TranscriptResult:
    """Result from transcript extraction attempt.

    ``transcript`` is populated only when ``status`` is SUCCESS;
    otherwise ``error`` describes what went wrong.
    """
    video_id: str
    status: TranscriptStatus
    transcript: Optional[VideoTranscript] = None
    error: Optional[str] = None
    duration_ms: int = 0  # wall-clock time of the extraction attempt


@dataclass
class VideoInsights:
    """Extracted insights from video.

    Produced by YouTubeTranscripts.extract_insights using simple
    keyword/regex heuristics over the transcript text.
    """
    video_id: str
    topics: List[str]            # matched technology/business keywords
    key_phrases: List[str]       # frequent bigrams from the text
    summary: str                 # extractive summary (leading sentences)
    action_items: List[str]      # imperative phrases ("you should ...")
    mentioned_tools: List[str]   # known tool/product names found in text
    mentioned_people: List[str]  # capitalized First Last name pairs
    timestamp_highlights: List[Tuple[float, str]]  # (seconds, description)


class TranscriptBackend(ABC):
    """Abstract base class for transcript backends.

    Each concrete backend implements one extraction level; the
    orchestrator tries backends in ascending level() order.
    """

    @abstractmethod
    def name(self) -> str:
        """Human-readable backend name (used in logs and CLI output)."""
        pass

    @abstractmethod
    def level(self) -> TranscriptLevel:
        """Backend priority level (lower value = tried earlier)."""
        pass

    @abstractmethod
    async def is_available(self) -> bool:
        """Check if backend is available (dependency installed / service reachable)."""
        pass

    @abstractmethod
    async def get_transcript(self, video_id: str, languages: List[str]) -> TranscriptResult:
        """Attempt to fetch a transcript for *video_id* in one of *languages*."""
        pass


class MCPServerBackend(TranscriptBackend):
    """MCP YouTube Server backend (Level 1).

    Talks to a local MCP server over HTTP: GET /health for liveness,
    POST /transcript to fetch segments for a video.
    """

    def __init__(self, server_url: str = "http://localhost:3000"):
        self.server_url = server_url
        self._available = None  # lazily probed; None = not yet checked

    def name(self) -> str:
        return "MCP YouTube Server"

    def level(self) -> TranscriptLevel:
        return TranscriptLevel.MCP_SERVER

    async def is_available(self) -> bool:
        """Probe the /health endpoint once and memoize the answer."""
        if self._available is not None:
            return self._available
        try:
            import httpx
            async with httpx.AsyncClient(timeout=5.0) as client:
                health = await client.get(f"{self.server_url}/health")
                self._available = health.status_code == 200
        except Exception:
            self._available = False
        return self._available

    async def get_transcript(self, video_id: str, languages: List[str]) -> TranscriptResult:
        """POST the video id to the server and convert its JSON reply."""
        if not await self.is_available():
            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.ERROR,
                error="MCP server not available"
            )

        began = time.time()

        def elapsed_ms() -> int:
            # Milliseconds since this attempt started.
            return int((time.time() - began) * 1000)

        try:
            import httpx
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.post(
                    f"{self.server_url}/transcript",
                    json={
                        "video_id": video_id,
                        "languages": languages
                    }
                )

                if response.status_code != 200:
                    return TranscriptResult(
                        video_id=video_id,
                        status=TranscriptStatus.ERROR,
                        error=f"MCP server error: {response.status_code}",
                        duration_ms=elapsed_ms()
                    )

                payload = response.json()
                lang = payload.get("language", "en")
                parsed = []
                for raw in payload.get("segments", []):
                    parsed.append(TranscriptSegment(
                        text=raw["text"],
                        start=raw["start"],
                        duration=raw.get("duration", 0),
                        language=lang
                    ))

                return TranscriptResult(
                    video_id=video_id,
                    status=TranscriptStatus.SUCCESS,
                    transcript=VideoTranscript(
                        video_id=video_id,
                        title=payload.get("title"),
                        segments=parsed,
                        language=lang,
                        level_used=self.level()
                    ),
                    duration_ms=elapsed_ms()
                )

        except Exception as exc:
            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.ERROR,
                error=str(exc),
                duration_ms=elapsed_ms()
            )


class TranscriptAPIBackend(TranscriptBackend):
    """youtube-transcript-api backend (Level 2).

    Wraps the synchronous youtube-transcript-api library, running it in
    the default executor so async callers are not blocked.
    """

    def __init__(self):
        self._available = None  # lazily probed; None = not yet checked

    def name(self) -> str:
        return "YouTube Transcript API"

    def level(self) -> TranscriptLevel:
        return TranscriptLevel.TRANSCRIPT_API

    async def is_available(self) -> bool:
        """True when the youtube-transcript-api package is importable (memoized)."""
        if self._available is None:
            try:
                from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
                self._available = True
            except ImportError:
                self._available = False
        return self._available

    @staticmethod
    def _classify_error(exc: Exception) -> TranscriptStatus:
        """Map a library exception to a TranscriptStatus.

        Prefers isinstance checks against the library's own exception
        types; falls back to keyword matching on the message when the
        error module is missing or the exception is of another type.
        """
        try:
            from youtube_transcript_api._errors import (
                TranscriptsDisabled,
                NoTranscriptFound,
                VideoUnavailable
            )
            if isinstance(exc, (TranscriptsDisabled, NoTranscriptFound)):
                return TranscriptStatus.NO_TRANSCRIPT
            if isinstance(exc, VideoUnavailable):
                return TranscriptStatus.VIDEO_UNAVAILABLE
        except ImportError:
            pass  # error module unavailable in this library version

        message = str(exc).lower()
        if "transcript" in message and "disabled" in message:
            return TranscriptStatus.NO_TRANSCRIPT
        if "unavailable" in message:
            return TranscriptStatus.VIDEO_UNAVAILABLE
        if "private" in message:
            return TranscriptStatus.PRIVATE
        return TranscriptStatus.ERROR

    async def get_transcript(self, video_id: str, languages: List[str]) -> TranscriptResult:
        """Fetch a transcript via youtube-transcript-api.

        Args:
            video_id: 11-character YouTube video id.
            languages: Preferred language codes, in priority order.

        Returns:
            TranscriptResult with SUCCESS and a VideoTranscript, or an
            error status produced by _classify_error.
        """
        if not await self.is_available():
            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.ERROR,
                error="youtube-transcript-api not installed"
            )

        start = time.time()
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            # The library is blocking; run it in a worker thread.
            # get_running_loop() is the non-deprecated way to reach the
            # loop from inside a coroutine (get_event_loop is deprecated
            # here since Python 3.10).
            loop = asyncio.get_running_loop()
            transcript_data = await loop.run_in_executor(
                None,
                lambda: YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
            )

            lang = languages[0] if languages else "en"
            segments = [
                TranscriptSegment(
                    text=seg["text"],
                    start=seg["start"],
                    duration=seg["duration"],
                    language=lang
                )
                for seg in transcript_data
            ]

            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.SUCCESS,
                transcript=VideoTranscript(
                    video_id=video_id,
                    segments=segments,
                    language=lang,
                    level_used=self.level()
                ),
                duration_ms=int((time.time() - start) * 1000)
            )

        except Exception as e:
            return TranscriptResult(
                video_id=video_id,
                status=self._classify_error(e),
                error=str(e),
                duration_ms=int((time.time() - start) * 1000)
            )


class YTDLPBackend(TranscriptBackend):
    """yt-dlp subtitle download backend (Level 3).

    Downloads only the subtitle track (no media) via yt-dlp, then parses
    the resulting VTT/SRT file into TranscriptSegments.
    """

    # Matches a VTT cue timing line and captures both start and end, e.g.
    # "00:00:01.189 --> 00:00:01.199 align:start position:0%"
    _VTT_CUE_RE = re.compile(
        r'(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})'
    )

    def __init__(self, download_dir: Optional[Path] = None):
        self.download_dir = download_dir or Path("/mnt/e/genesis-system/data/youtube_subs")
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self._available = None  # lazily probed; None = not yet checked

    def name(self) -> str:
        return "yt-dlp Subtitles"

    def level(self) -> TranscriptLevel:
        return TranscriptLevel.YTDLP_SUBS

    async def is_available(self) -> bool:
        """True when yt_dlp is importable or the yt-dlp CLI runs (memoized)."""
        if self._available is None:
            try:
                import yt_dlp  # noqa: F401
                self._available = True
            except ImportError:
                # Try command line
                try:
                    proc = await asyncio.create_subprocess_exec(
                        "yt-dlp", "--version",
                        stdout=asyncio.subprocess.PIPE,
                        stderr=asyncio.subprocess.PIPE
                    )
                    await proc.communicate()
                    self._available = proc.returncode == 0
                except Exception:
                    self._available = False
        return self._available

    async def get_transcript(self, video_id: str, languages: List[str]) -> TranscriptResult:
        """Download subtitles with yt-dlp and parse them into segments.

        NOTE(review): is_available() also accepts a CLI-only install, but
        this method requires the importable yt_dlp module; with CLI-only
        installs it returns an ERROR result and the caller escalates.
        """
        if not await self.is_available():
            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.ERROR,
                error="yt-dlp not available"
            )

        start = time.time()
        try:
            import yt_dlp

            url = f"https://www.youtube.com/watch?v={video_id}"
            output_template = str(self.download_dir / f"{video_id}.%(ext)s")

            ydl_opts = {
                "writesubtitles": True,       # uploader-provided subtitles
                "writeautomaticsub": True,    # auto-generated captions too
                "subtitleslangs": languages,
                "skip_download": True,        # subtitles only, no media
                "outtmpl": output_template,
                "quiet": True,
                "no_warnings": True
            }

            # yt-dlp is blocking; run it in a worker thread.
            # get_running_loop() replaces the deprecated get_event_loop().
            loop = asyncio.get_running_loop()
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                await loop.run_in_executor(None, lambda: ydl.download([url]))

            # Locate the downloaded subtitle file; yt-dlp names it
            # "<video_id>.<lang>.<ext>". Always also try "en".
            sub_file = None
            for lang in languages + ["en"]:
                for ext in [".vtt", ".srt", ".json3"]:
                    candidate = self.download_dir / f"{video_id}.{lang}{ext}"
                    if candidate.exists():
                        sub_file = candidate
                        break
                if sub_file:
                    break

            if not sub_file:
                return TranscriptResult(
                    video_id=video_id,
                    status=TranscriptStatus.NO_TRANSCRIPT,
                    error="No subtitle file found",
                    duration_ms=int((time.time() - start) * 1000)
                )

            # Parse subtitle file (.json3 is found but not parsed yet,
            # yielding an empty segment list).
            segments = self._parse_subtitle_file(sub_file)

            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.SUCCESS,
                transcript=VideoTranscript(
                    video_id=video_id,
                    segments=segments,
                    language=languages[0] if languages else "en",
                    level_used=self.level()
                ),
                duration_ms=int((time.time() - start) * 1000)
            )

        except Exception as e:
            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.ERROR,
                error=str(e),
                duration_ms=int((time.time() - start) * 1000)
            )

    def _parse_subtitle_file(self, file_path: Path) -> List[TranscriptSegment]:
        """Parse a VTT or SRT subtitle file into TranscriptSegments.

        YouTube VTT is messy: karaoke-style duplicate lines, inline
        TTML/HTML tags, and cue settings after the timing. Tags are
        stripped and consecutive duplicate lines are dropped. Files with
        other extensions yield an empty list.
        """
        segments: List[TranscriptSegment] = []

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse VTT format - YouTube uses complex VTT with TTML tags
        if file_path.suffix == ".vtt":
            current_text = None
            current_start = 0.0
            current_end = None  # cue end time, when the timing line provides one

            for line in content.split('\n'):
                line_stripped = line.strip()

                # Skip headers and empty lines
                if not line_stripped or line_stripped == 'WEBVTT':
                    continue
                if line_stripped.startswith(('Kind:', 'Language:', 'NOTE')):
                    continue

                # Cue timing line: remember start AND end for this cue
                if '-->' in line_stripped:
                    ts_match = self._VTT_CUE_RE.match(line_stripped)
                    if ts_match:
                        current_start = self._parse_timestamp(ts_match.group(1))
                        current_end = self._parse_timestamp(ts_match.group(2))
                    continue

                # Skip numeric-only lines (cue identifiers)
                if line_stripped.isdigit():
                    continue

                # Clean TTML/HTML tags from text
                clean_text = re.sub(r'<[^>]+>', '', line_stripped).strip()

                # Skip duplicates (YouTube VTT repeats lines for the
                # karaoke effect) and single-character noise
                if clean_text and clean_text != current_text and len(clean_text) > 1:
                    # Use the real cue duration instead of a hardcoded
                    # value; fall back to 3.0s for malformed cues.
                    if current_end is not None and current_end > current_start:
                        duration = current_end - current_start
                    else:
                        duration = 3.0
                    segments.append(TranscriptSegment(
                        text=clean_text,
                        start=current_start,
                        duration=duration
                    ))
                    current_text = clean_text

        # Parse SRT format: "N\nHH:MM:SS,mmm --> HH:MM:SS,mmm\ntext..."
        elif file_path.suffix == ".srt":
            pattern = r'\d+\n(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?=\n\n|\Z)'
            matches = re.findall(pattern, content, re.DOTALL)

            for start_str, end_str, text in matches:
                start = self._parse_timestamp(start_str.replace(',', '.'))
                end = self._parse_timestamp(end_str.replace(',', '.'))

                if text.strip():
                    segments.append(TranscriptSegment(
                        text=text.strip(),
                        start=start,
                        duration=end - start
                    ))

        return segments

    def _parse_timestamp(self, ts: str) -> float:
        """Convert "HH:MM:SS.mmm", "MM:SS.mmm" or bare seconds to float seconds."""
        parts = ts.split(':')
        if len(parts) == 3:
            h, m, s = parts
            return float(h) * 3600 + float(m) * 60 + float(s)
        if len(parts) == 2:
            m, s = parts
            return float(m) * 60 + float(s)
        return float(parts[0])


class CacheBackend(TranscriptBackend):
    """Cached transcripts backend (Level 6).

    Stores one JSON file per video under cache_dir. Serves both as the
    last-resort backend and as the persistence layer other backends'
    successes are saved into.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        self.cache_dir = cache_dir or Path("/mnt/e/genesis-system/data/transcript_cache")
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def name(self) -> str:
        return "Transcript Cache"

    def level(self) -> TranscriptLevel:
        return TranscriptLevel.CACHE

    async def is_available(self) -> bool:
        # The local filesystem cache is always usable.
        return True

    def _cache_path(self, video_id: str) -> Path:
        """Path of the JSON cache entry for *video_id*."""
        return self.cache_dir / f"{video_id}.json"

    async def get_transcript(self, video_id: str, languages: List[str]) -> TranscriptResult:
        """Load a previously cached transcript; *languages* is ignored."""
        entry = self._cache_path(video_id)

        if not entry.exists():
            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.NO_TRANSCRIPT,
                error="Not in cache"
            )

        try:
            with open(entry, 'r') as f:
                data = json.load(f)

            parsed = [TranscriptSegment(**raw) for raw in data.get("segments", [])]

            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.SUCCESS,
                transcript=VideoTranscript(
                    video_id=video_id,
                    title=data.get("title"),
                    channel=data.get("channel"),
                    segments=parsed,
                    language=data.get("language", "en"),
                    level_used=self.level(),
                    metadata={"cached": True, "cached_at": data.get("cached_at")}
                )
            )

        except Exception as exc:
            return TranscriptResult(
                video_id=video_id,
                status=TranscriptStatus.ERROR,
                error=str(exc)
            )

    def save_to_cache(self, transcript: VideoTranscript) -> bool:
        """Persist *transcript* as JSON; logs and returns False on failure."""
        try:
            entry = self._cache_path(transcript.video_id)
            serialized_segments = [
                {
                    "text": segment.text,
                    "start": segment.start,
                    "duration": segment.duration,
                    "language": segment.language
                }
                for segment in transcript.segments
            ]
            payload = {
                "video_id": transcript.video_id,
                "title": transcript.title,
                "channel": transcript.channel,
                "language": transcript.language,
                "segments": serialized_segments,
                "cached_at": datetime.now().isoformat()
            }
            with open(entry, 'w') as f:
                json.dump(payload, f, indent=2)
            return True
        except Exception as exc:
            logger.error(f"Cache save failed: {exc}")
            return False


class YouTubeTranscripts:
    """
    Multi-level failsafe YouTube transcript extraction system.

    Provides automatic fallback across multiple extraction methods,
    with caching, indexing, and insight extraction.

    Backends are tried in ascending TranscriptLevel order; the first
    SUCCESS wins, is saved to the JSON cache, and is recorded in the
    on-disk transcript index. Insight extraction is synchronous and
    heuristic (keyword/regex based).
    """

    def __init__(
        self,
        storage_dir: Optional[Path] = None,
        mcp_server_url: str = "http://localhost:3000"
    ):
        self.storage_dir = storage_dir or Path("/mnt/e/genesis-system/data/youtube")
        self.storage_dir.mkdir(parents=True, exist_ok=True)

        # Initialize cache backend first (for caching results)
        self._cache = CacheBackend(self.storage_dir / "cache")

        # Initialize backends in priority order
        self._backends: List[TranscriptBackend] = [
            MCPServerBackend(mcp_server_url),
            TranscriptAPIBackend(),
            YTDLPBackend(self.storage_dir / "downloads"),
            self._cache  # Cache as last resort
        ]

        # Research index: video_id -> lightweight metadata (see _update_index)
        self._index: Dict[str, Dict] = {}
        self._load_index()

        # Statistics (exposed via get_stats)
        self._stats = {
            "total_extractions": 0,
            "successful_extractions": 0,
            "by_level": {level.name: 0 for level in TranscriptLevel},
            "cached_hits": 0,
            "errors": 0
        }

    def _load_index(self):
        """Load transcript index from disk (best effort; missing file is fine)."""
        index_file = self.storage_dir / "transcript_index.json"
        if index_file.exists():
            try:
                with open(index_file, 'r') as f:
                    self._index = json.load(f)
            except Exception as e:
                logger.warning(f"Failed to load index: {e}")

    def _save_index(self):
        """Save transcript index to disk."""
        index_file = self.storage_dir / "transcript_index.json"
        with open(index_file, 'w') as f:
            json.dump(self._index, f, indent=2)

    @staticmethod
    def extract_video_id(url_or_id: str) -> Optional[str]:
        """Extract video ID from URL or return as-is.

        Accepts a bare 11-character id, youtube.com/watch?v=... URLs,
        and youtu.be/... short links; returns None otherwise.

        NOTE(review): /shorts/, /embed/ and /live/ URL forms are not
        handled and return None — confirm whether callers need them.
        """
        # Already an ID
        if re.match(r'^[a-zA-Z0-9_-]{11}$', url_or_id):
            return url_or_id

        # Parse URL
        parsed = urlparse(url_or_id)

        # youtube.com/watch?v=ID
        if 'youtube.com' in parsed.netloc:
            query = parse_qs(parsed.query)
            if 'v' in query:
                return query['v'][0]

        # youtu.be/ID
        if 'youtu.be' in parsed.netloc:
            return parsed.path.lstrip('/')

        return None

    async def get_available_backends(self) -> List[str]:
        """Get list of available backends as "name (Level N)" strings."""
        available = []
        for backend in self._backends:
            if await backend.is_available():
                available.append(f"{backend.name()} (Level {backend.level().value})")
        return available

    async def get_transcript(
        self,
        url_or_id: str,
        languages: Optional[List[str]] = None,
        use_cache: bool = True,
        save_to_cache: bool = True
    ) -> TranscriptResult:
        """
        Get transcript with multi-level fallback.

        Tries the cache first (when use_cache), then each non-cache
        backend in priority order, stopping early on statuses that a
        retry cannot fix (unavailable/private/no transcript).

        Args:
            url_or_id: YouTube URL or video ID
            languages: Preferred languages (default: ["en"])
            use_cache: Check cache first
            save_to_cache: Save result to cache

        Returns:
            TranscriptResult with transcript or error info
        """
        video_id = self.extract_video_id(url_or_id)
        if not video_id:
            return TranscriptResult(
                video_id=url_or_id,
                status=TranscriptStatus.ERROR,
                error="Invalid video ID or URL"
            )

        languages = languages or ["en"]
        self._stats["total_extractions"] += 1

        # Check cache first if enabled
        if use_cache:
            cache_result = await self._cache.get_transcript(video_id, languages)
            if cache_result.status == TranscriptStatus.SUCCESS:
                self._stats["cached_hits"] += 1
                self._stats["by_level"][TranscriptLevel.CACHE.name] += 1
                return cache_result

        # Try each backend in priority order
        last_result = None
        for backend in self._backends:
            if backend == self._cache:  # Skip cache in main loop
                continue

            if not await backend.is_available():
                continue

            try:
                logger.info(f"Trying {backend.name()} for {video_id}")
                result = await backend.get_transcript(video_id, languages)

                if result.status == TranscriptStatus.SUCCESS:
                    self._stats["successful_extractions"] += 1
                    self._stats["by_level"][result.transcript.level_used.name] += 1

                    # Save to cache
                    if save_to_cache and result.transcript:
                        self._cache.save_to_cache(result.transcript)

                    # Update index
                    self._update_index(result.transcript)

                    return result

                last_result = result

                # Don't retry on these statuses
                if result.status in [TranscriptStatus.VIDEO_UNAVAILABLE,
                                     TranscriptStatus.PRIVATE,
                                     TranscriptStatus.NO_TRANSCRIPT]:
                    break

            except Exception as e:
                # A backend raising (rather than returning an error
                # result) escalates to the next backend.
                logger.warning(f"{backend.name()} error: {e}")
                self._stats["errors"] += 1
                last_result = TranscriptResult(
                    video_id=video_id,
                    status=TranscriptStatus.ERROR,
                    error=str(e)
                )

        return last_result or TranscriptResult(
            video_id=video_id,
            status=TranscriptStatus.ERROR,
            error="All backends failed"
        )

    async def get_batch_transcripts(
        self,
        video_ids: List[str],
        concurrency: int = 3,
        **kwargs
    ) -> Dict[str, TranscriptResult]:
        """Get transcripts for multiple videos.

        Processes *video_ids* in batches of *concurrency* with a short
        pause between batches; extra kwargs are forwarded to
        get_transcript (languages, use_cache, ...).
        """
        results = {}

        async def process_video(video_id: str):
            # Collect each result keyed by its video id.
            result = await self.get_transcript(video_id, **kwargs)
            results[video_id] = result

        # Process in batches
        for i in range(0, len(video_ids), concurrency):
            batch = video_ids[i:i + concurrency]
            await asyncio.gather(*[process_video(vid) for vid in batch])

            # Small delay between batches
            await asyncio.sleep(0.5)

        return results

    def _update_index(self, transcript: VideoTranscript):
        """Update transcript index with video metadata and persist it."""
        self._index[transcript.video_id] = {
            "title": transcript.title,
            "channel": transcript.channel,
            "language": transcript.language,
            "segment_count": len(transcript.segments),
            "duration_seconds": transcript.duration_seconds,
            "extracted_at": transcript.extracted_at,
            "level_used": transcript.level_used.name
        }
        self._save_index()

    def extract_insights(self, transcript: VideoTranscript) -> VideoInsights:
        """Extract insights from transcript.

        Synchronous; uses keyword/regex heuristics only (no LLM calls).
        """
        full_text = transcript.full_text

        # Extract topics
        topics = self._extract_topics(full_text)

        # Extract key phrases
        key_phrases = self._extract_key_phrases(full_text)

        # Generate summary
        summary = self._generate_summary(full_text)

        # Extract action items
        action_items = self._extract_action_items(full_text)

        # Extract mentioned tools/technologies
        tools = self._extract_tools(full_text)

        # Extract mentioned people
        people = self._extract_people(full_text)

        # Find timestamp highlights
        highlights = self._find_highlights(transcript)

        return VideoInsights(
            video_id=transcript.video_id,
            topics=topics,
            key_phrases=key_phrases,
            summary=summary,
            action_items=action_items,
            mentioned_tools=tools,
            mentioned_people=people,
            timestamp_highlights=highlights
        )

    def _extract_topics(self, text: str) -> List[str]:
        """Extract main topics from text via keyword substring matching."""
        # Technical keywords that indicate topics
        tech_keywords = {
            'ai', 'artificial intelligence', 'machine learning', 'deep learning',
            'llm', 'gpt', 'claude', 'gemini', 'openai', 'anthropic',
            'agent', 'automation', 'workflow', 'pipeline',
            'api', 'integration', 'saas', 'startup',
            'revenue', 'growth', 'marketing', 'sales',
            'voice', 'audio', 'video', 'content',
            'python', 'javascript', 'typescript', 'react',
            'database', 'vector', 'embedding', 'rag'
        }

        text_lower = text.lower()
        found = []

        # NOTE(review): substring match, so 'ai' also matches inside
        # words like "said" — confirm if word-boundary matching is wanted.
        for keyword in tech_keywords:
            if keyword in text_lower:
                found.append(keyword)

        return found[:10]  # Top 10 topics

    def _extract_key_phrases(self, text: str) -> List[str]:
        """Extract key phrases (frequent bigrams occurring more than twice)."""
        # Simple n-gram extraction
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
        bigrams = [' '.join(words[i:i+2]) for i in range(len(words)-1)]

        # Count frequencies
        from collections import Counter
        phrase_counts = Counter(bigrams)

        # Filter common phrases
        common = {'the the', 'and the', 'to the', 'of the', 'in the'}
        top_phrases = [
            phrase for phrase, count in phrase_counts.most_common(20)
            if phrase not in common and count > 2
        ]

        return top_phrases[:10]

    def _generate_summary(self, text: str) -> str:
        """Generate brief summary of text (first sentences, max 500 chars)."""
        # Simple extractive summary - first few sentences
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

        if not sentences:
            return "No summary available"

        # Take first 3 sentences
        summary = '. '.join(sentences[:3]) + '.'
        return summary[:500]  # Limit length

    def _extract_action_items(self, text: str) -> List[str]:
        """Extract up to 10 action items via imperative-phrase patterns."""
        patterns = [
            r'(?:you should|you need to|make sure to|don\'t forget to)\s+([^.!?]+)',
            r'(?:step \d+[:\s]+)([^.!?]+)',
            r'(?:first|second|third|then|next|finally)[,:\s]+([^.!?]+)'
        ]

        actions = []
        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            actions.extend(matches)

        return actions[:10]

    def _extract_tools(self, text: str) -> List[str]:
        """Extract mentioned tools and technologies from a known-names list."""
        tools = {
            'openai', 'anthropic', 'claude', 'gpt', 'gemini', 'llama',
            'langchain', 'llamaindex', 'autogen', 'crewai',
            'python', 'javascript', 'typescript', 'react', 'nextjs',
            'postgresql', 'mongodb', 'redis', 'qdrant', 'pinecone',
            'aws', 'gcp', 'azure', 'vercel', 'railway',
            'stripe', 'twilio', 'sendgrid', 'zapier', 'n8n',
            'github', 'vscode', 'cursor', 'copilot',
            'notion', 'slack', 'discord', 'linear'
        }

        text_lower = text.lower()
        found = [tool for tool in tools if tool in text_lower]
        return found

    def _extract_people(self, text: str) -> List[str]:
        """Extract mentioned people names (capitalized First Last pairs)."""
        # Simple proper noun extraction
        pattern = r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b'
        names = re.findall(pattern, text)

        # Filter common false positives
        false_positives = {'New York', 'San Francisco', 'United States', 'Open Source'}
        names = [n for n in names if n not in false_positives]

        # Deduplicate; set order is arbitrary, so results are unordered
        return list(set(names))[:10]

    def _find_highlights(self, transcript: VideoTranscript) -> List[Tuple[float, str]]:
        """Find timestamp highlights: segments containing emphasis markers."""
        highlights = []

        highlight_markers = [
            'important', 'key point', 'remember', 'crucial', 'critical',
            'here\'s the trick', 'pro tip', 'secret', 'hack'
        ]

        for seg in transcript.segments:
            text_lower = seg.text.lower()
            for marker in highlight_markers:
                if marker in text_lower:
                    # Keep at most one highlight per segment
                    highlights.append((seg.start, seg.text[:100]))
                    break

        return highlights[:10]

    def search_transcripts(self, query: str, limit: int = 10) -> List[Dict]:
        """Search across all indexed transcripts.

        Case-insensitive substring search over cached transcript text;
        results are sorted by number of matching segments.
        """
        results = []
        query_lower = query.lower()

        for video_id, meta in self._index.items():
            # Check cache for full transcript
            cache_file = self._cache.cache_dir / f"{video_id}.json"
            if not cache_file.exists():
                continue

            try:
                with open(cache_file, 'r') as f:
                    data = json.load(f)

                full_text = ' '.join(seg['text'] for seg in data.get('segments', []))

                if query_lower in full_text.lower():
                    # Find matching segments
                    matching = [
                        seg for seg in data.get('segments', [])
                        if query_lower in seg['text'].lower()
                    ]

                    results.append({
                        "video_id": video_id,
                        "title": data.get('title'),
                        "matches": len(matching),
                        "first_match": matching[0] if matching else None
                    })

            except Exception:
                # Unreadable cache entries are skipped silently
                continue

        # Sort by match count
        results.sort(key=lambda x: x['matches'], reverse=True)
        return results[:limit]

    def get_stats(self) -> Dict[str, Any]:
        """Get system statistics (extraction counters plus index/cache sizes)."""
        return {
            **self._stats,
            "indexed_videos": len(self._index),
            "cache_size": len(list(self._cache.cache_dir.glob("*.json")))
        }


async def main():
    """CLI for YouTube Transcripts system.

    Parses one sub-command plus its options and dispatches to the
    corresponding YouTubeTranscripts operation, printing results to stdout.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Genesis YouTube Transcript System")
    parser.add_argument("command", choices=["transcript", "batch", "search", "insights", "backends", "stats"])
    parser.add_argument("--video", help="Video URL or ID")
    parser.add_argument("--videos", nargs="+", help="Multiple video IDs")
    parser.add_argument("--query", help="Search query")
    parser.add_argument("--lang", default="en", help="Language code")
    args = parser.parse_args()

    system = YouTubeTranscripts()

    if args.command == "backends":
        available = await system.get_available_backends()
        print("Available Transcript Backends:")
        print("=" * 40)
        for backend in available:
            print(f"  - {backend}")
        return

    if args.command == "transcript":
        if not args.video:
            print("Usage: --video VIDEO_URL_OR_ID")
            return

        print(f"Extracting transcript for: {args.video}")
        result = await system.get_transcript(args.video, languages=[args.lang])

        print(f"\nStatus: {result.status.value}")
        if result.transcript:
            print(f"Level: {result.transcript.level_used.name}")
            print(f"Segments: {len(result.transcript.segments)}")
            print(f"\nPreview:\n{result.transcript.full_text[:500]}...")
        elif result.error:
            print(f"Error: {result.error}")
        return

    if args.command == "batch":
        if not args.videos:
            print("Usage: --videos VIDEO_ID1 VIDEO_ID2 ...")
            return

        print(f"Processing {len(args.videos)} videos...")
        results = await system.get_batch_transcripts(args.videos)

        for vid, result in results.items():
            # Tick for success, cross for any failure status.
            status = "✓" if result.status == TranscriptStatus.SUCCESS else "✗"
            print(f"  {status} {vid}: {result.status.value}")
        return

    if args.command == "search":
        if not args.query:
            print("Usage: --query SEARCH_TERMS")
            return

        results = system.search_transcripts(args.query)
        print(f"Search results for '{args.query}':")
        print("=" * 40)
        for r in results:
            print(f"  [{r['video_id']}] {r['title']} ({r['matches']} matches)")
        return

    if args.command == "insights":
        if not args.video:
            print("Usage: --video VIDEO_URL_OR_ID")
            return

        result = await system.get_transcript(args.video)
        if not result.transcript:
            print(f"Failed to get transcript: {result.error}")
            return

        insights = system.extract_insights(result.transcript)
        print(f"\nInsights for {args.video}:")
        print("=" * 40)
        print(f"Topics: {', '.join(insights.topics)}")
        print(f"Tools: {', '.join(insights.mentioned_tools)}")
        print(f"\nSummary:\n{insights.summary}")
        print("\nAction Items:")
        for item in insights.action_items[:5]:
            print(f"  - {item}")
        return

    if args.command == "stats":
        stats = system.get_stats()
        print("YouTube Transcript Statistics:")
        print("=" * 40)
        print(json.dumps(stats, indent=2))

if __name__ == "__main__":
    # Script entry point: run the async CLI on a fresh event loop.
    asyncio.run(main())
