#!/usr/bin/env python3
"""
YouTube Intelligence Engine
============================
Genesis core module for extracting knowledge from YouTube videos.

Capabilities:
- Extract transcripts via youtube-transcript-api (NO API key required)
- Fetch video metadata via yt-dlp (NO API key required)
- Batch process entire channels or playlists
- Single video, playlist, channel, or search-term input
- Output: structured transcript chunks + metadata JSON
- Feeds into youtube_kg_builder.py for KG entity extraction

Dependencies (all installed):
    youtube-transcript-api  — transcript extraction, no auth
    yt-dlp                  — metadata, channel/playlist enumeration
    requests                — fallback HTTP

Storage:
    Raw transcripts:  E:/genesis-system/data/youtube_knowledge_base/raw/{video_id}.json
    Metadata:         E:/genesis-system/data/youtube_knowledge_base/metadata/{video_id}.json
    Processed:        E:/genesis-system/data/youtube_knowledge_base/processed/{video_id}.md

NO SQLite. All outputs are flat JSON/JSONL files per Genesis rules.

Usage (CLI):
    python youtube_intelligence.py --video "https://youtu.be/VIDEO_ID"
    python youtube_intelligence.py --channel "@NickPontesOfficial" --limit 10
    python youtube_intelligence.py --playlist "https://youtube.com/playlist?list=PLxxx"
    python youtube_intelligence.py --video-id "abc123def456"

Usage (Python API):
    from core.youtube_intelligence import YouTubeIntelligence
    yt = YouTubeIntelligence()
    result = yt.process_video("https://youtu.be/VIDEO_ID")
    results = yt.process_channel("@NickPontesOfficial", limit=5)
"""

import argparse
import json
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple

# ── Dependency checks ──────────────────────────────────────────────────────────
try:
    # Soft dependency: transcript fetching degrades gracefully when the
    # package is missing (get_transcript then returns an error dict).
    from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
    TRANSCRIPT_API_AVAILABLE = True
except ImportError:
    TRANSCRIPT_API_AVAILABLE = False
    print("[WARN] youtube-transcript-api not installed. Run: pip install youtube-transcript-api")

# ── Constants ──────────────────────────────────────────────────────────────────
# Default storage root; override per instance via YouTubeIntelligence(base_path=...).
BASE_PATH = Path("E:/genesis-system")
RAW_DIR = BASE_PATH / "data" / "youtube_knowledge_base" / "raw"
META_DIR = BASE_PATH / "data" / "youtube_knowledge_base" / "metadata"
PROCESSED_DIR = BASE_PATH / "data" / "youtube_knowledge_base" / "processed"
# NOTE(review): KG_DIR is not referenced anywhere in this module — presumably
# consumed by youtube_kg_builder.py (see module docstring); confirm before removing.
KG_DIR = BASE_PATH / "KNOWLEDGE_GRAPH" / "entities"

# Priority channels for Genesis intelligence gathering.
# Keys are internal slugs; "handle" is the public YouTube handle passed to
# get_channel_videos / process_channel.
PRIORITY_CHANNELS = {
    "nick_pontes": {
        "handle": "@NickPontesOfficial",
        "rationale": "GHL agency mastery — snapshots, SaaS mode, pricing, sales tactics",
        "category": "ghl_agency",
        "priority": "critical"
    },
    "ghl_official": {
        "handle": "@GoHighLevel",
        "rationale": "Platform updates, new features, official product direction",
        "category": "ghl_platform",
        "priority": "high"
    },
    "shaun_clark": {
        "handle": "@ShaunClarkGHL",
        "rationale": "GHL co-founder — product roadmap, insider tips",
        "category": "ghl_platform",
        "priority": "high"
    },
    "tradie_success_au": {
        "handle": "@TradieSuccess",
        "rationale": "Australian tradie business — pain points, language, buyer psychology",
        "category": "tradie_market",
        "priority": "high"
    },
    "receptionist_ai_industry": {
        "handle": "@VAPIai",
        "rationale": "Voice agent industry — competitor analysis, feature benchmarking",
        "category": "voice_ai",
        "priority": "medium"
    }
}


class YouTubeIntelligence:
    """
    Core YouTube intelligence engine for Genesis.
    Extracts transcripts and metadata without requiring any API keys.

    Backends (both keyless):
      - youtube-transcript-api for caption/transcript text
      - the yt-dlp CLI (via subprocess) for metadata and channel/playlist
        enumeration

    All results are cached as flat JSON/markdown files under the instance's
    data directories — no database.
    """

    def __init__(self, base_path: Optional[str] = None) -> None:
        """
        Args:
            base_path: Root directory for all storage. Defaults to the
                module-level BASE_PATH (E:/genesis-system).
        """
        # Mirror the module-level directory layout under the chosen base.
        self.base = Path(base_path) if base_path else BASE_PATH
        self.raw_dir = self.base / "data" / "youtube_knowledge_base" / "raw"
        self.meta_dir = self.base / "data" / "youtube_knowledge_base" / "metadata"
        self.processed_dir = self.base / "data" / "youtube_knowledge_base" / "processed"
        # Side effect: creates the storage tree on disk if missing.
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Create storage directories if missing."""
        for d in [self.raw_dir, self.meta_dir, self.processed_dir]:
            d.mkdir(parents=True, exist_ok=True)

    def _log(self, msg: str):
        """Emit a console log line prefixed with the current wall-clock HH:MM:SS."""
        print(f"[{datetime.now():%H:%M:%S}] {msg}")

    # ── Video ID extraction ────────────────────────────────────────────────────

    def extract_video_id(self, url_or_id: str) -> Optional[str]:
        """
        Extract the 11-character YouTube video ID from a URL or raw ID.

        Accepts bare IDs, watch/embed/shorts/live URLs, youtu.be short
        links, and legacy /v/ URLs. Input is stripped of surrounding
        whitespace first, so copy-pasted values with stray spaces work.

        Args:
            url_or_id: Any YouTube video URL format, or a bare video ID.

        Returns:
            The 11-char video ID, or None if none could be extracted.
        """
        candidate = url_or_id.strip()

        # Already a bare ID (11 chars: alphanumeric + dash + underscore).
        if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
            return candidate

        patterns = [
            # watch URLs, short links, embeds, shorts, live streams
            r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/shorts/|youtube\.com/live/)([a-zA-Z0-9_-]{11})',
            # legacy flash-player URLs
            r'youtube\.com/v/([a-zA-Z0-9_-]{11})',
        ]
        for pattern in patterns:
            m = re.search(pattern, candidate)
            if m:
                return m.group(1)
        return None

    # ── Transcript extraction ──────────────────────────────────────────────────

    def get_transcript(self, video_id: str, lang: str = "en") -> Dict[str, Any]:
        """
        Fetch transcript for a video using youtube-transcript-api.
        No API key required. Falls back to auto-generated captions if manual unavailable.

        Args:
            video_id: 11-char YouTube video ID.
            lang: Preferred transcript language code (default "en").

        Returns dict with:
            video_id, transcript (list of {text, start, duration}), full_text, word_count,
            duration_seconds, lang, source (manual|auto_generated), fetched_at
        On failure returns {"error": ..., "video_id": ...} instead of raising.
        """
        if not TRANSCRIPT_API_AVAILABLE:
            return {"error": "youtube-transcript-api not installed", "video_id": video_id}

        self._log(f"Fetching transcript for {video_id} ...")

        try:
            try:
                transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

                # Prefer manual (human) transcripts over auto-generated
                transcript = None
                source = "unknown"  # defensive default; overwritten when a transcript is found
                try:
                    transcript = transcript_list.find_manually_created_transcript([lang, "en"])
                    source = "manual"
                except Exception:
                    pass

                if transcript is None:
                    try:
                        transcript = transcript_list.find_generated_transcript([lang, "en"])
                        source = "auto_generated"
                    except Exception:
                        # Last resort: take whatever is available
                        available = list(transcript_list)
                        if available:
                            transcript = available[0]
                            source = "auto_generated"

                if transcript is None:
                    return {"error": "No transcript available", "video_id": video_id}

                # BUG FIX: youtube-transcript-api >= 1.0 returns a
                # FetchedTranscript of snippet objects rather than a plain
                # list of dicts, which broke c["text"] indexing below and
                # json.dump downstream. Normalize to dicts for both versions.
                chunks = self._chunks_to_dicts(transcript.fetch())

            except TranscriptsDisabled:
                return {"error": "Transcripts disabled for this video", "video_id": video_id}
            except NoTranscriptFound:
                return {"error": "No transcript found", "video_id": video_id}

            # Build full text and calculate duration
            full_text_parts = [c["text"] for c in chunks]
            full_text = " ".join(full_text_parts)
            # Clean up common transcript artifacts
            full_text = re.sub(r'\[.*?\]', '', full_text)  # Remove [Music], [Applause] etc
            full_text = re.sub(r'\s+', ' ', full_text).strip()

            total_duration = 0
            if chunks:
                # End of the last caption chunk approximates video length.
                last = chunks[-1]
                total_duration = int(last.get("start", 0) + last.get("duration", 0))

            result = {
                "video_id": video_id,
                "transcript": chunks,
                "full_text": full_text,
                "word_count": len(full_text.split()),
                "chunk_count": len(chunks),
                "duration_seconds": total_duration,
                "duration_formatted": self._format_duration(total_duration),
                "lang": lang,
                "source": source,
                # tz-aware replacement for deprecated datetime.utcnow();
                # output format is unchanged (naive ISO string + "Z").
                "fetched_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
            }

            self._log(f"  Got {len(chunks)} chunks, {result['word_count']} words, {result['duration_formatted']}")
            return result

        except Exception as e:
            self._log(f"  ERROR: {e}")
            return {"error": str(e), "video_id": video_id}

    @staticmethod
    def _chunks_to_dicts(fetched) -> List[Dict[str, Any]]:
        """
        Normalize transcript chunks to plain {text, start, duration} dicts.

        Older youtube-transcript-api versions already return list[dict];
        newer (>= 1.0) versions return FetchedTranscript snippet objects
        exposing .text/.start/.duration and a .to_raw_data() method.
        """
        if hasattr(fetched, "to_raw_data"):
            return fetched.to_raw_data()
        normalized = []
        for c in fetched:
            if isinstance(c, dict):
                normalized.append(c)
            else:
                normalized.append({
                    "text": getattr(c, "text", ""),
                    "start": getattr(c, "start", 0.0),
                    "duration": getattr(c, "duration", 0.0),
                })
        return normalized

    def _format_duration(self, seconds: int) -> str:
        """Render a second count as H:MM:SS, or M:SS when under an hour."""
        minutes, secs = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        if hours > 0:
            return f"{hours}:{minutes:02d}:{secs:02d}"
        return f"{minutes}:{secs:02d}"

    # ── Metadata extraction ────────────────────────────────────────────────────

    def get_metadata(self, video_id: str) -> Dict[str, Any]:
        """
        Fetch video metadata using yt-dlp (no API key required).

        Args:
            video_id: 11-char YouTube video ID.

        Returns:
            Dict with title, channel, upload_date, view_count, like_count,
            description, tags, duration, thumbnail_url, url — or
            {"error": ..., "video_id": ...} on any failure.
        """
        url = f"https://www.youtube.com/watch?v={video_id}"
        self._log(f"Fetching metadata for {video_id} ...")

        try:
            cmd = [
                "yt-dlp",
                "--dump-json",
                "--no-download",
                "--quiet",
                "--no-warnings",
                url
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

            if result.returncode != 0:
                self._log(f"  yt-dlp error: {result.stderr[:200]}")
                return {"error": result.stderr[:200], "video_id": video_id}

            data = json.loads(result.stdout)

            # BUG FIX: yt-dlp emits null (None) for duration/counts on some
            # videos (live streams, hidden like counts). Coerce to 0 so
            # _format_duration (None // 3600) and downstream f"{views:,}"
            # formatting never see None.
            duration = data.get("duration") or 0
            metadata = {
                "video_id": video_id,
                "url": url,
                "title": data.get("title", ""),
                "channel": data.get("uploader", data.get("channel", "")),
                "channel_id": data.get("channel_id", ""),
                "upload_date": data.get("upload_date", ""),  # YYYYMMDD format
                "upload_date_iso": self._parse_date(data.get("upload_date", "")),
                "view_count": data.get("view_count") or 0,
                "like_count": data.get("like_count") or 0,
                "duration_seconds": duration,
                "duration_formatted": self._format_duration(duration),
                "description": (data.get("description") or "")[:2000],  # Cap at 2000 chars
                "tags": (data.get("tags") or [])[:20],  # Cap at 20 tags
                "thumbnail_url": data.get("thumbnail", ""),
                "categories": data.get("categories", []),
                # tz-aware replacement for deprecated datetime.utcnow();
                # output format unchanged (naive ISO string + "Z").
                "fetched_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
            }

            self._log(f"  Title: {metadata['title'][:60]}")
            return metadata

        except subprocess.TimeoutExpired:
            return {"error": "Metadata fetch timed out", "video_id": video_id}
        except json.JSONDecodeError as e:
            return {"error": f"JSON parse error: {e}", "video_id": video_id}
        except Exception as e:
            return {"error": str(e), "video_id": video_id}

    def _parse_date(self, yyyymmdd: str) -> str:
        """
        Convert a yt-dlp YYYYMMDD date string to ISO format (YYYY-MM-DD).

        Non-date input (empty string, wrong length, non-digit characters)
        is returned unchanged instead of being mangled into a fake ISO
        date (previously any 8-char string was sliced blindly).
        """
        if len(yyyymmdd) == 8 and yyyymmdd.isdigit():
            return f"{yyyymmdd[:4]}-{yyyymmdd[4:6]}-{yyyymmdd[6:8]}"
        return yyyymmdd

    # ── Channel/Playlist enumeration ───────────────────────────────────────────

    def get_channel_videos(self, channel: str, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Enumerate the most recent videos of a YouTube channel via yt-dlp.

        Args:
            channel: Handle (@Username), channel URL, or bare channel ID.
            limit: Maximum number of videos to return.

        Returns:
            List of {video_id, title, url, upload_date, duration, view_count}
            dicts; empty list on any failure.
        """
        self._log(f"Enumerating channel: {channel} (limit={limit}) ...")

        # Normalize the input into a ".../videos" channel URL.
        if channel.startswith("@"):
            url = f"https://www.youtube.com/{channel}/videos"
        elif "youtube.com" in channel:
            url = channel if "/videos" in channel else channel + "/videos"
        else:
            url = f"https://www.youtube.com/channel/{channel}/videos"

        try:
            proc = subprocess.run(
                [
                    "yt-dlp",
                    "--flat-playlist",
                    "--dump-json",
                    "--quiet",
                    "--no-warnings",
                    f"--playlist-end={limit}",
                    url,
                ],
                capture_output=True,
                text=True,
                timeout=60,
            )

            if proc.returncode != 0:
                self._log(f"  Error: {proc.stderr[:200]}")
                return []

            # --flat-playlist emits one JSON object per line.
            videos = []
            for line in proc.stdout.strip().split("\n"):
                if not line.strip():
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                vid = entry.get("id", "")
                if not vid:
                    continue
                videos.append({
                    "video_id": vid,
                    "title": entry.get("title", ""),
                    "url": f"https://www.youtube.com/watch?v={vid}",
                    "upload_date": entry.get("upload_date", ""),
                    "duration": entry.get("duration", 0),
                    "view_count": entry.get("view_count", 0),
                })

            self._log(f"  Found {len(videos)} videos")
            return videos

        except subprocess.TimeoutExpired:
            self._log("  Channel enumeration timed out")
            return []
        except Exception as e:
            self._log(f"  Error: {e}")
            return []

    def get_playlist_videos(self, playlist_url: str, limit: int = 50) -> List[Dict[str, Any]]:
        """
        Enumerate videos from a YouTube playlist.

        Same output format as get_channel_videos (minus view_count).

        Args:
            playlist_url: Full playlist URL.
            limit: Maximum number of videos to return.

        Returns:
            List of {video_id, title, url, upload_date, duration} dicts;
            empty list on any failure.
        """
        self._log(f"Enumerating playlist (limit={limit}) ...")

        try:
            cmd = [
                "yt-dlp",
                "--flat-playlist",
                "--dump-json",
                "--quiet",
                "--no-warnings",
                f"--playlist-end={limit}",
                playlist_url
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

            # BUG FIX: a non-zero exit previously fell through to the parse
            # loop and silently reported "Found 0 videos"; surface the yt-dlp
            # error instead (consistent with get_channel_videos).
            if result.returncode != 0:
                self._log(f"  Error: {result.stderr[:200]}")
                return []

            videos = []
            for line in result.stdout.strip().split("\n"):
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                    vid_id = data.get("id", "")
                    if vid_id:
                        videos.append({
                            "video_id": vid_id,
                            "title": data.get("title", ""),
                            "url": f"https://www.youtube.com/watch?v={vid_id}",
                            "upload_date": data.get("upload_date", ""),
                            "duration": data.get("duration", 0),
                        })
                except json.JSONDecodeError:
                    continue

            self._log(f"  Found {len(videos)} videos in playlist")
            return videos

        except subprocess.TimeoutExpired:
            # Explicit timeout handling, matching get_channel_videos.
            self._log("  Playlist enumeration timed out")
            return []
        except Exception as e:
            self._log(f"  Error: {e}")
            return []

    # ── Full video processing ──────────────────────────────────────────────────

    def process_video(
        self,
        url_or_id: str,
        save: bool = True,
        skip_if_exists: bool = True
    ) -> Dict[str, Any]:
        """
        Full processing pipeline for a single video:
        1. Extract video ID
        2. Check if already processed (skip_if_exists)
        3. Fetch metadata via yt-dlp
        4. Fetch transcript via youtube-transcript-api
        5. Save raw JSON to disk
        6. Return combined result

        Args:
            url_or_id: Any YouTube URL format, or a bare 11-char video ID.
            save: Write raw JSON + processed markdown to disk.
            skip_if_exists: Return the cached on-disk result when present.

        Returns combined dict with metadata + transcript, or
        {"error": ...} when the video ID cannot be parsed.
        """
        video_id = self.extract_video_id(url_or_id)
        if not video_id:
            self._log(f"Could not extract video ID from: {url_or_id}")
            return {"error": f"Invalid URL or video ID: {url_or_id}"}

        # Check cache.
        # BUG FIX: the file is written below as UTF-8 (ensure_ascii=False),
        # so it must be read back as UTF-8 too — the previous bare open()
        # used the platform default and failed on Windows (cp1252) for any
        # non-ASCII transcript. A corrupt/unreadable cache entry now falls
        # through to a re-fetch instead of crashing.
        raw_path = self.raw_dir / f"{video_id}.json"
        if skip_if_exists and raw_path.exists():
            try:
                with open(raw_path, encoding="utf-8") as f:
                    cached = json.load(f)
                self._log(f"  {video_id}: already cached, loading from disk")
                return cached
            except (json.JSONDecodeError, UnicodeDecodeError, OSError):
                self._log(f"  {video_id}: cache unreadable, re-fetching")

        # Fetch metadata + transcript in sequence
        metadata = self.get_metadata(video_id)
        transcript_data = self.get_transcript(video_id)

        # Merge into unified result
        result = {
            **metadata,
            "transcript": transcript_data.get("transcript", []),
            "full_text": transcript_data.get("full_text", ""),
            "word_count": transcript_data.get("word_count", 0),
            "chunk_count": transcript_data.get("chunk_count", 0),
            "transcript_source": transcript_data.get("source", "unknown"),
            "transcript_error": transcript_data.get("error"),
            # tz-aware replacement for deprecated datetime.utcnow();
            # output format unchanged (naive ISO string + "Z").
            "processed_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
        }

        # Save to disk
        if save:
            with open(raw_path, "w", encoding="utf-8") as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            self._log(f"  Saved: {raw_path}")

            # Also save a human-readable processed markdown version
            self._save_processed_md(result)

        return result

    def _save_processed_md(self, result: Dict[str, Any]):
        """Save a human-readable markdown version of the processed video."""
        video_id = result.get("video_id", "unknown")
        md_path = self.processed_dir / f"{video_id}.md"

        title = result.get("title", "Unknown Title")
        channel = result.get("channel", "Unknown Channel")
        date = result.get("upload_date_iso", result.get("upload_date", ""))
        duration = result.get("duration_formatted", "")
        views = result.get("view_count", 0)
        url = result.get("url", "")
        full_text = result.get("full_text", "")
        word_count = result.get("word_count", 0)

        md = f"""# {title}

**Channel**: {channel}
**Date**: {date}
**Duration**: {duration}
**Views**: {views:,}
**URL**: {url}
**Words**: {word_count:,}

---

## Transcript

{full_text}
"""
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(md)

    # ── Batch channel processing ───────────────────────────────────────────────

    def process_channel(
        self,
        channel: str,
        limit: int = 10,
        skip_if_exists: bool = True,
        channel_name: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Process the most recent *limit* videos from a channel.

        Each video runs through the full process_video pipeline; the channel
        handle and optional display name are stamped onto every result
        (keys "_channel_handle" / "_channel_name").

        Returns the list of processed video dicts (empty if enumeration fails).
        """
        videos = self.get_channel_videos(channel, limit=limit)
        if not videos:
            self._log(f"No videos found for channel: {channel}")
            return []

        self._log(f"Processing {len(videos)} videos from {channel_name or channel} ...")

        results: List[Dict[str, Any]] = []
        total = len(videos)
        for idx, entry in enumerate(videos, 1):
            label = entry.get("title", entry["video_id"])[:60]
            self._log(f"[{idx}/{total}] {label}")
            processed = self.process_video(entry["video_id"], skip_if_exists=skip_if_exists)
            processed["_channel_handle"] = channel
            processed["_channel_name"] = channel_name
            results.append(processed)

        with_transcripts = sum(
            1 for r in results if not r.get("transcript_error") and r.get("full_text")
        )
        self._log(f"Channel processing complete: {with_transcripts}/{len(results)} with transcripts")
        return results

    def process_playlist(self, playlist_url: str, limit: int = 50) -> List[Dict[str, Any]]:
        """Run the full process_video pipeline over every video in a playlist."""
        videos = self.get_playlist_videos(playlist_url, limit=limit)
        total = len(videos)
        processed: List[Dict[str, Any]] = []
        for idx, entry in enumerate(videos, 1):
            self._log(f"[{idx}/{total}] {entry.get('title', entry['video_id'])[:60]}")
            processed.append(self.process_video(entry["video_id"]))
        return processed

    # ── Transcript chunking for LLM processing ────────────────────────────────

    def chunk_transcript(
        self,
        full_text: str,
        chunk_size: int = 3000,
        overlap: int = 200
    ) -> List[str]:
        """
        Split full transcript text into overlapping chunks for LLM processing.
        Splits on sentence boundaries where possible.

        Args:
            full_text: The full transcript text
            chunk_size: Target characters per chunk
            overlap: Overlap between chunks for context continuity

        Returns list of text chunks.
        """
        if len(full_text) <= chunk_size:
            return [full_text]

        chunks = []
        start = 0

        while start < len(full_text):
            end = min(start + chunk_size, len(full_text))

            # Try to find a sentence boundary near the end
            if end < len(full_text):
                # Look for '. ' or '? ' or '! ' near the end
                boundary = max(
                    full_text.rfind(". ", start, end),
                    full_text.rfind("? ", start, end),
                    full_text.rfind("! ", start, end),
                )
                # Only snap to the boundary if the chunk stays reasonably
                # large (at least half of chunk_size).
                if boundary > start + chunk_size // 2:
                    end = boundary + 2  # Include the period and space

            chunks.append(full_text[start:end].strip())

            # BUG FIX: once a chunk reaches the end of the text we are done.
            # Previously `start = max(start + 1, end - overlap)` stepped back
            # inside the already-emitted final chunk and the loop then
            # crawled forward one character per iteration, emitting up to
            # `overlap` near-duplicate tail chunks.
            if end >= len(full_text):
                break

            # Step back `overlap` chars for context continuity; max()
            # guarantees forward progress even when overlap >= the advance.
            start = max(start + 1, end - overlap)

        return chunks

    # ── Timestamp extraction ───────────────────────────────────────────────────

    def get_timestamped_segments(
        self,
        transcript: List[Dict],
        segment_minutes: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Group raw transcript chunks into fixed-length time segments.

        Args:
            transcript: List of {text, start, duration} chunk dicts.
            segment_minutes: Target segment length in minutes.

        Returns:
            List of {start_seconds, end_seconds, timestamp_label, text} dicts.
        """
        window = segment_minutes * 60
        segments: List[Dict[str, Any]] = []
        pending: List[Dict] = []
        window_start = 0

        def flush(end_seconds):
            # Emit the accumulated chunks as one segment.
            combined = " ".join(c["text"] for c in pending)
            segments.append({
                "start_seconds": int(window_start),
                "end_seconds": int(end_seconds),
                "timestamp_label": self._format_duration(int(window_start)),
                "text": combined.strip(),
            })

        for chunk in transcript:
            begin = chunk.get("start", 0)
            # Close the current window once it spans `window` seconds.
            if pending and begin - window_start >= window:
                flush(begin)
                pending = [chunk]
                window_start = begin
            else:
                pending.append(chunk)

        # Emit whatever remains, ending at the last chunk's end time.
        if pending:
            tail = pending[-1]
            flush(tail.get("start", 0) + tail.get("duration", 0))

        return segments

    # ── Stats ──────────────────────────────────────────────────────────────────

    def get_cache_stats(self) -> Dict[str, Any]:
        """Return stats about cached videos."""
        raw_files = list(self.raw_dir.glob("*.json"))
        processed_files = list(self.processed_dir.glob("*.md"))

        return {
            "cached_videos": len(raw_files),
            "processed_markdowns": len(processed_files),
            "raw_dir": str(self.raw_dir),
            "processed_dir": str(self.processed_dir),
            "video_ids": [f.stem for f in raw_files]
        }


# ── CLI entry point ────────────────────────────────────────────────────────────

def main():
    """CLI entry point: parse arguments, dispatch to the engine, report results."""
    parser = argparse.ArgumentParser(
        description="YouTube Intelligence Engine — Genesis knowledge extraction",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python youtube_intelligence.py --video "https://youtu.be/abc123"
  python youtube_intelligence.py --video-id "abc123def456"
  python youtube_intelligence.py --channel "@NickPontesOfficial" --limit 5
  python youtube_intelligence.py --playlist "https://youtube.com/playlist?list=PLxxx" --limit 20
  python youtube_intelligence.py --stats
        """
    )
    parser.add_argument("--video", help="Single YouTube video URL")
    parser.add_argument("--video-id", help="Single YouTube video ID (11 chars)")
    parser.add_argument("--channel", help="YouTube channel handle or URL")
    parser.add_argument("--playlist", help="YouTube playlist URL")
    parser.add_argument("--limit", type=int, default=10, help="Max videos for channel/playlist (default: 10)")
    parser.add_argument("--no-cache", action="store_true", help="Re-fetch even if already cached")
    parser.add_argument("--stats", action="store_true", help="Show cache stats and exit")
    parser.add_argument("--output", help="Save results JSON to this file path")

    args = parser.parse_args()
    engine = YouTubeIntelligence()

    # --stats short-circuits everything else.
    if args.stats:
        print(json.dumps(engine.get_cache_stats(), indent=2))
        return

    use_cache = not args.no_cache
    results: List[Dict[str, Any]] = []

    single = args.video or args.video_id
    if single:
        outcome = engine.process_video(single, skip_if_exists=use_cache)
        results = [outcome]
        if outcome.get("title"):
            print(f"\nTitle: {outcome['title']}")
        if outcome.get("word_count"):
            print(f"Words: {outcome['word_count']:,}")
        if outcome.get("full_text"):
            print(f"\nTranscript preview:\n{outcome['full_text'][:300]}...")
    elif args.channel:
        results = engine.process_channel(args.channel, limit=args.limit, skip_if_exists=use_cache)
        print(f"\nProcessed {len(results)} videos from {args.channel}")
    elif args.playlist:
        results = engine.process_playlist(args.playlist, limit=args.limit)
        print(f"\nProcessed {len(results)} videos from playlist")
    else:
        # No mode selected: show usage and exit.
        parser.print_help()
        return

    if args.output and results:
        # A single result is saved as an object, multiple as an array.
        payload = results[0] if len(results) == 1 else results
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        print(f"\nSaved to: {args.output}")


# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
