#!/usr/bin/env python3
"""
Genesis YouTube Learner
========================
Fetches transcripts from YouTube videos to learn about new AI tools.

Usage:
    python youtube_learner.py transcript "https://youtube.com/watch?v=..."
    python youtube_learner.py learn "https://youtube.com/watch?v=..."
"""

import sys
import json
import re
from datetime import datetime
from typing import List, Dict, Optional
from pathlib import Path

try:
    from youtube_transcript_api import YouTubeTranscriptApi
    from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
except ImportError:
    print("Install: pip install youtube-transcript-api")
    sys.exit(1)


def extract_video_id(url: str) -> Optional[str]:
    """Pull the 11-character video ID out of a YouTube URL or bare ID.

    Handles watch URLs (?v=), short youtu.be links, /v/ and /embed/ paths,
    and a raw 11-character video ID passed directly.

    Returns:
        The video ID string, or None when nothing matches.
    """
    candidates = (
        r'(?:v=|/v/|youtu\.be/|/embed/)([a-zA-Z0-9_-]{11})',
        r'^([a-zA-Z0-9_-]{11})$',  # Direct video ID
    )
    for candidate in candidates:
        found = re.search(candidate, url)
        if found is not None:
            return found.group(1)
    return None


def get_transcript(video_url: str, languages: Optional[List[str]] = None) -> Dict:
    """
    Fetch transcript for a YouTube video.

    Args:
        video_url: YouTube URL or video ID
        languages: Preferred languages in priority order
            (default: ['en', 'en-US', 'en-GB'])

    Returns:
        Dict with transcript text and metadata, or {"error": ...} on failure.
    """
    if languages is None:
        languages = ['en', 'en-US', 'en-GB']

    video_id = extract_video_id(video_url)
    if not video_id:
        return {"error": f"Could not extract video ID from: {video_url}"}

    try:
        # New API uses instance method. Bug fix: actually pass the language
        # preference through instead of building it and ignoring it.
        api = YouTubeTranscriptApi()
        transcript_data = api.fetch(video_id, languages=languages)

        # Combine into full text
        full_text = " ".join(entry.text for entry in transcript_data)

        # Also create timestamped version ([MM:SS] prefix per snippet)
        timestamped = []
        for entry in transcript_data:
            minutes = int(entry.start // 60)
            seconds = int(entry.start % 60)
            timestamped.append(f"[{minutes:02d}:{seconds:02d}] {entry.text}")

        return {
            "video_id": video_id,
            # Report what was actually fetched rather than assuming English;
            # getattr keeps this safe if the library object lacks the
            # attribute (older API versions) — falls back to old behavior.
            "language": getattr(transcript_data, "language_code", "en"),
            "is_generated": getattr(transcript_data, "is_generated", False),
            "full_text": full_text,
            "timestamped": "\n".join(timestamped),
            "word_count": len(full_text.split()),
            "duration_seconds": transcript_data[-1].start + transcript_data[-1].duration if transcript_data else 0
        }

    except TranscriptsDisabled:
        return {"error": "Transcripts are disabled for this video"}
    except NoTranscriptFound:
        return {"error": "No transcript found for this video"}
    except Exception as e:
        # Catch-all boundary: surface any other library/network failure as data
        return {"error": str(e)}


def summarize_for_learning(transcript: Dict, topic: str = "AI tools") -> Dict:
    """
    Prepare a fetched transcript for Genesis learning.

    Scores the transcript's relevance to AI/development tooling by keyword
    hits and pulls out the sentences that mention those keywords.
    (``topic`` is accepted for interface compatibility but not used.)
    Error dicts from get_transcript() are passed through unchanged.
    """
    if "error" in transcript:
        return transcript

    lowered = transcript["full_text"].lower()

    # Keywords that indicate useful AI tool information
    tool_indicators = [
        "mcp", "model context protocol", "claude", "anthropic",
        "openai", "gpt", "llm", "api", "sdk", "framework",
        "agent", "autonomous", "automation", "integration",
        "docker", "kubernetes", "deployment", "server",
        "memory", "context", "embedding", "vector",
        "github", "repository", "package", "install"
    ]

    # Relevance = number of distinct indicator keywords present anywhere
    relevance_score = sum(kw in lowered for kw in tool_indicators)

    # Naive sentence split: insert a newline after every period, then split
    candidate_sentences = transcript["full_text"].replace(".", ".\n").split("\n")
    tool_mentions = [
        sentence.strip()
        for sentence in candidate_sentences
        if any(kw in sentence.lower() for kw in tool_indicators)
    ]

    return {
        "video_id": transcript["video_id"],
        "word_count": transcript["word_count"],
        "duration_seconds": transcript["duration_seconds"],
        "relevance_score": relevance_score,
        "is_relevant": relevance_score >= 3,
        "tool_mentions_count": len(tool_mentions),
        "key_excerpts": tool_mentions[:20],  # Top 20 relevant sentences
        "full_transcript_available": True
    }


def save_learning(video_id: str, content: Dict, output_dir: Optional[str] = None) -> str:
    """Save learned content to the Genesis YouTube knowledge base.

    Args:
        video_id: 11-character YouTube video ID (used as the file stem).
        content: Analysis dict; should include "full_transcript" for
            topic detection to find anything.
        output_dir: Base directory for the knowledge base. Bug fix: this
            parameter was previously accepted but silently ignored — it
            now overrides the default Genesis location.

    Returns:
        Path of the saved transcript JSON file, as a string.
    """
    kb_dir = Path(output_dir) if output_dir else Path("e:/genesis-system/youtube_knowledge_base")
    kb_dir.mkdir(parents=True, exist_ok=True)

    # Save the raw payload (analysis + full transcript)
    transcript_dir = kb_dir / "transcripts"
    transcript_dir.mkdir(exist_ok=True)
    transcript_file = transcript_dir / f"{video_id}.json"
    with open(transcript_file, 'w', encoding='utf-8') as f:
        json.dump(content, f, indent=2)

    # Save the derived learning analysis separately
    learning_dir = kb_dir / "learnings"
    learning_dir.mkdir(exist_ok=True)
    learning_file = learning_dir / f"{video_id}_learning.json"
    learning_data = {
        "video_id": video_id,
        "analyzed": datetime.now().isoformat(),
        "relevance_score": content.get("relevance_score", 0),
        "is_relevant": content.get("is_relevant", False),
        "key_excerpts": content.get("key_excerpts", []),
        "topics_detected": detect_topics(content.get("full_transcript", ""))
    }
    with open(learning_file, 'w', encoding='utf-8') as f:
        json.dump(learning_data, f, indent=2)

    # Keep the master index in sync. NOTE(review): the index writer uses its
    # own fixed path, so a custom output_dir does not relocate index.json.
    update_knowledge_base_index(video_id, content, learning_data)

    return str(transcript_file)


def detect_topics(text: str) -> List[str]:
    """Return the known topic labels whose keywords appear in *text*.

    Matching is case-insensitive substring containment; labels come back
    in the fixed order they are declared below.
    """
    lowered = text.lower()
    topic_keywords = {
        "ai_agents": ["agent", "autonomous", "agentic"],
        "mcp_servers": ["mcp", "model context protocol", "server"],
        "claude_code": ["claude code", "claude-code", "anthropic"],
        "memory_systems": ["memory", "remember", "context", "knowledge"],
        "autonomous_systems": ["autonomous", "self-improving", "learning"]
    }
    return [
        label
        for label, words in topic_keywords.items()
        if any(word in lowered for word in words)
    ]


def update_knowledge_base_index(video_id: str, content: Dict, learning: Dict):
    """Insert or refresh a video's entry in the knowledge base master index.

    Rewrites index.json with the video entry, the topic -> video_ids reverse
    index, and aggregate stats. An existing entry for the same video_id is
    replaced rather than duplicated.

    Args:
        video_id: YouTube video ID being indexed.
        content: Transcript/analysis dict (word_count is read from it).
        learning: Learning-analysis dict (relevance_score, topics_detected).
    """
    index_path = Path("e:/genesis-system/youtube_knowledge_base/index.json")
    # Make sure the directory exists even when called standalone.
    index_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        with open(index_path, 'r', encoding='utf-8') as f:
            index = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Missing or corrupt index: start fresh. (Was a bare `except:`,
        # which also hid unrelated errors like KeyboardInterrupt.)
        index = {"videos": [], "topics": {}, "stats": {}}

    # Add video entry
    video_entry = {
        "video_id": video_id,
        "added": datetime.now().isoformat(),
        "word_count": content.get("word_count", 0),
        "relevance_score": learning.get("relevance_score", 0),
        "topics": learning.get("topics_detected", [])
    }

    # Remove duplicate if exists
    index["videos"] = [v for v in index.get("videos", []) if v["video_id"] != video_id]
    index["videos"].append(video_entry)

    # Update topic indexes. setdefault fixes a latent KeyError when a loaded
    # index file lacks the "topics" key entirely.
    topics_index = index.setdefault("topics", {})
    for topic in learning.get("topics_detected", []):
        topics_index.setdefault(topic, [])
        if video_id not in topics_index[topic]:
            topics_index[topic].append(video_id)

    # Update stats
    index["stats"] = {
        "total_videos": len(index["videos"]),
        "total_words": sum(v.get("word_count", 0) for v in index["videos"]),
        "last_updated": datetime.now().isoformat()
    }

    with open(index_path, 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=2)


# CLI Interface
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # No command given: show usage and exit cleanly.
        print("""
Genesis YouTube Learner
=======================

Commands:
  transcript <url>     Get full transcript
  learn <url>          Get transcript + analyze for AI tools

Examples:
  python youtube_learner.py transcript "https://youtube.com/watch?v=xxxxx"
  python youtube_learner.py learn "https://youtube.com/watch?v=xxxxx"
        """)
        sys.exit(0)

    command = sys.argv[1]

    if command in ("transcript", "learn") and len(sys.argv) < 3:
        # Bug fix: a recognized command without a URL previously fell through
        # to the misleading "Unknown command" branch.
        print(f"Error: '{command}' requires a video URL or ID")
        sys.exit(1)

    if command == "transcript":
        url = sys.argv[2]
        result = get_transcript(url)
        if "error" in result:
            print(f"Error: {result['error']}")
        else:
            print(f"Video ID: {result['video_id']}")
            print(f"Language: {result['language']}")
            print(f"Words: {result['word_count']}")
            print(f"Duration: {result['duration_seconds']:.0f}s")
            print("\n--- TRANSCRIPT ---\n")
            # Truncate very long transcripts so the terminal stays usable.
            print(result['full_text'][:5000])
            if len(result['full_text']) > 5000:
                print(f"\n... [{len(result['full_text']) - 5000} more characters]")

    elif command == "learn":
        url = sys.argv[2]
        print(f"Fetching transcript from: {url}")
        transcript = get_transcript(url)

        if "error" in transcript:
            print(f"Error: {transcript['error']}")
        else:
            analysis = summarize_for_learning(transcript)
            print(f"\nVideo ID: {analysis['video_id']}")
            print(f"Relevance Score: {analysis['relevance_score']}")
            print(f"Is Relevant to AI Tools: {analysis['is_relevant']}")
            print(f"Tool Mentions: {analysis['tool_mentions_count']}")

            if analysis['key_excerpts']:
                print("\n--- KEY EXCERPTS ---\n")
                for i, excerpt in enumerate(analysis['key_excerpts'][:10], 1):
                    print(f"{i}. {excerpt[:200]}...")

            # Persist transcript + analysis to the knowledge base for later.
            saved_path = save_learning(
                analysis['video_id'],
                {**analysis, "full_transcript": transcript['full_text']}
            )
            print(f"\nSaved to: {saved_path}")

    else:
        print(f"Unknown command: {command}")
        sys.exit(1)