#!/usr/bin/env python3
"""
YouTube Scout Skill - Nightly Knowledge Harvester
==================================================
Scouts YouTube for breakthrough insights and converts them to actionable RWL tasks.

This skill:
1. Monitors 15+ AI/coding channels for new videos
2. Scores relevance to Genesis priorities
3. Extracts transcripts via Supadata API
4. Analyzes for actionable insights
5. Creates RWL tasks for implementation

Run nightly between 8pm-4am for maximum value extraction.

Usage:
    from skills.youtube_scout_skill import YouTubeScoutSkill

    scout = YouTubeScoutSkill()
    results = scout.run_nightly_scout()
"""

import json
import os
import re
import hashlib
from pathlib import Path
from datetime import datetime, timedelta
from datetime import timezone
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, field, asdict
import requests

# Paths
BASE_PATH = Path("/mnt/e/genesis-system")
CHANNELS_FILE = BASE_PATH / "data/youtube_scout_channels.json"
SCOUT_OUTPUT = BASE_PATH / "data/youtube_scout"
RWL_TASKS_FILE = BASE_PATH / "loop/youtube_insight_tasks.json"
SCOUT_LOG = BASE_PATH / "logs/youtube_scout.jsonl"


@dataclass
class VideoCandidate:
    """A video candidate for scouting.

    Produced by RSS discovery; relevance_score and matched_keywords are
    filled in later (in place) by the ranking phase.
    """
    video_id: str      # YouTube video ID (the "v=" URL parameter)
    title: str         # Video title as published in the channel feed
    channel_name: str  # Human-readable channel name from configuration
    channel_id: str    # YouTube channel ID the video was discovered on
    published_at: str  # ISO-8601 publication timestamp from the feed
    relevance_score: float = 0.0  # 0.0-1.0, assigned by _rank_by_relevance
    matched_keywords: List[str] = field(default_factory=list)  # keywords hit in the title
    priority: str = "medium"  # channel priority from configuration (critical/high/medium/low)


@dataclass
class InsightExtraction:
    """Extracted insight from a video.

    Created by pattern-matching transcripts; one instance per regex hit
    that survives the noise filter.
    """
    video_id: str     # Source YouTube video ID
    video_title: str  # Title of the source video
    insight_type: str  # tool, technique, pattern, integration, architecture
    summary: str      # Matched text, truncated to 200 chars
    actionable_items: List[str]  # Suggested follow-up steps for this insight
    genesis_relevance: str       # "high" / "medium" / "low" (keyword-based)
    implementation_complexity: str  # low, medium, high
    confidence: float  # Heuristic confidence in the extraction (0.0-1.0)
    source_quotes: List[str] = field(default_factory=list)  # raw matched snippets (<=300 chars)


@dataclass
class RWLTask:
    """Task for Ralph Wiggum Loop execution.

    Serialized (via asdict) into RWL_TASKS_FILE for downstream pickup.
    """
    id: str           # Stable content-derived ID, prefixed "yt_"
    title: str        # Short actionable title ("Implement: ...")
    description: str  # Multi-line context: source video, insight, type
    source_video: str    # Full YouTube watch URL of the source video
    source_channel: str  # Channel the insight came from
    insight_type: str    # Mirrors InsightExtraction.insight_type
    priority: str        # "high" or "medium", derived from relevance
    estimated_complexity: str   # low / medium / high
    acceptance_criteria: List[str]  # Actionable items acting as done-criteria
    created_at: str      # ISO-8601 creation timestamp
    status: str = "pending"  # Lifecycle state; new tasks start pending


class YouTubeScoutSkill:
    """
    Nightly YouTube Scout for Genesis knowledge harvesting.

    Discovers breakthrough techniques from top AI/coding channels
    and converts them into actionable RWL tasks.

    Pipeline (see run_nightly_scout): discover via channel RSS feeds,
    rank by keyword relevance, fetch transcripts through the Supadata
    API, pattern-match transcripts for insights, and persist RWL tasks.
    """

    def __init__(self,
                 supadata_api_key: Optional[str] = None,
                 youtube_api_key: Optional[str] = None):
        """Initialize the scout and prepare output directories.

        Args:
            supadata_api_key: Key for the Supadata transcript API; falls
                back to the SUPADATA_API_KEY environment variable.
            youtube_api_key: YouTube Data API key; falls back to the
                YOUTUBE_API_KEY environment variable. Discovery currently
                uses free RSS feeds, so this key is stored but not used here.
        """
        self.supadata_api_key = supadata_api_key or os.getenv("SUPADATA_API_KEY")
        self.youtube_api_key = youtube_api_key or os.getenv("YOUTUBE_API_KEY")

        # Channel list, search queries and relevance keywords.
        self.channels = self._load_channels()

        # Create output locations up front so later phases can write freely.
        SCOUT_OUTPUT.mkdir(parents=True, exist_ok=True)
        SCOUT_LOG.parent.mkdir(parents=True, exist_ok=True)

        # Per-run counters, summarized at the end of run_nightly_scout().
        self.stats = {
            "videos_scanned": 0,
            "videos_relevant": 0,
            "transcripts_fetched": 0,
            "insights_extracted": 0,
            "tasks_created": 0
        }

    def _load_channels(self) -> Dict:
        """Load channel configuration from CHANNELS_FILE.

        Returns:
            Parsed configuration dict, or an empty skeleton when the
            file does not exist yet.
        """
        if CHANNELS_FILE.exists():
            with open(CHANNELS_FILE) as f:
                return json.load(f)
        return {"channels": [], "search_queries": [], "relevance_keywords": []}

    def run_nightly_scout(self,
                          max_videos: int = 10,
                          hours_lookback: int = 48) -> Dict[str, Any]:
        """
        Run the complete nightly scout pipeline.

        Args:
            max_videos: Maximum videos to process (5-10 recommended)
            hours_lookback: How far back to look for new videos

        Returns:
            Summary of scout results
        """
        print("\n" + "=" * 70)
        print("YOUTUBE SCOUT - Nightly Knowledge Harvester")
        print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 70)

        # Phase 1: Discover relevant videos
        print("\n[Phase 1] Discovering relevant videos...")
        candidates = self._discover_videos(hours_lookback)
        print(f"  Found {len(candidates)} candidates")

        # Phase 2: Score and rank by relevance
        print("\n[Phase 2] Scoring relevance to Genesis...")
        ranked = self._rank_by_relevance(candidates)
        top_videos = ranked[:max_videos]
        print(f"  Top {len(top_videos)} selected for processing")

        # Phase 3: Fetch transcripts
        print("\n[Phase 3] Fetching transcripts via Supadata...")
        transcripts = self._fetch_transcripts(top_videos)
        print(f"  Retrieved {len(transcripts)} transcripts")

        # Phase 4: Extract insights
        print("\n[Phase 4] Extracting actionable insights...")
        insights = self._extract_insights(transcripts)
        print(f"  Extracted {len(insights)} insights")

        # Phase 5: Create RWL tasks. Transcripts are passed along so the
        # task's source_channel can be resolved from video metadata.
        print("\n[Phase 5] Creating RWL tasks...")
        tasks = self._create_rwl_tasks(insights, transcripts)
        print(f"  Created {len(tasks)} tasks")

        # Save results
        self._save_results(top_videos, transcripts, insights, tasks)

        # Log scout run
        self._log_scout_run()

        print("\n" + "=" * 70)
        print("SCOUT COMPLETE")
        print(f"  Videos scanned: {self.stats['videos_scanned']}")
        print(f"  Relevant videos: {self.stats['videos_relevant']}")
        print(f"  Insights extracted: {self.stats['insights_extracted']}")
        print(f"  RWL tasks created: {self.stats['tasks_created']}")
        print("=" * 70)

        return {
            "timestamp": datetime.now().isoformat(),
            "stats": self.stats,
            "top_videos": [asdict(v) for v in top_videos],
            "insights": [asdict(i) for i in insights],
            "tasks_created": len(tasks)
        }

    def _discover_videos(self, hours_lookback: int) -> List[VideoCandidate]:
        """
        Discover recent videos from monitored channels.

        Uses YouTube RSS feeds (no API key needed). Channels whose ID is
        missing or still "PENDING_LOOKUP" are skipped; per-channel fetch
        failures are reported and do not abort the scan.

        Args:
            hours_lookback: Only keep videos published within this window.

        Returns:
            Unranked candidates from all reachable channels.
        """
        candidates = []
        # Aware UTC cutoff: RSS <published> timestamps are UTC, and
        # comparing them against a naive local datetime.now() would
        # silently shift the lookback window by the local UTC offset.
        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours_lookback)

        for channel in self.channels.get("channels", []):
            channel_id = channel.get("id")
            channel_name = channel.get("name")
            priority = channel.get("priority", "medium")

            # Skip channels with pending lookup IDs
            if not channel_id or channel_id == "PENDING_LOOKUP":
                print(f"    Skipping {channel_name}: ID not yet configured")
                continue

            try:
                # Try RSS feed first (free, no API key)
                videos = self._fetch_channel_rss(channel_id, cutoff)

                for video in videos:
                    candidates.append(VideoCandidate(
                        video_id=video["video_id"],
                        title=video["title"],
                        channel_name=channel_name,
                        channel_id=channel_id,
                        published_at=video["published"],
                        priority=priority
                    ))
                    self.stats["videos_scanned"] += 1

            except Exception as e:
                # Best-effort per channel: report and keep scanning others.
                print(f"    Warning: Could not fetch {channel_name}: {e}")
                continue

        return candidates

    def _fetch_channel_rss(self, channel_id: str,
                           cutoff: datetime) -> List[Dict]:
        """Fetch recent videos from a channel's RSS feed.

        Args:
            channel_id: YouTube channel ID.
            cutoff: Timezone-aware UTC datetime; older videos are dropped.

        Returns:
            List of {"video_id", "title", "published"} dicts.

        Raises:
            Exception: Wraps any network/parse failure, chained to the cause.
        """
        import xml.etree.ElementTree as ET

        rss_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"

        try:
            response = requests.get(rss_url, timeout=10)
            response.raise_for_status()

            root = ET.fromstring(response.content)
            ns = {"atom": "http://www.w3.org/2005/Atom",
                  "yt": "http://www.youtube.com/xml/schemas/2015"}

            videos = []
            for entry in root.findall("atom:entry", ns):
                video_id = entry.find("yt:videoId", ns)
                title = entry.find("atom:title", ns)
                published = entry.find("atom:published", ns)

                # Guard <published> too: a missing element would otherwise
                # raise AttributeError on .text below.
                if video_id is not None and title is not None and published is not None:
                    # Keep the parsed datetime timezone-aware so the
                    # comparison against the aware UTC cutoff is exact.
                    pub_date = datetime.fromisoformat(
                        published.text.replace("Z", "+00:00")
                    )

                    if pub_date > cutoff:
                        videos.append({
                            "video_id": video_id.text,
                            "title": title.text,
                            "published": published.text
                        })

            return videos

        except Exception as e:
            # Preserve the original traceback for the caller's warning log.
            raise Exception(f"RSS fetch failed: {e}") from e

    def _rank_by_relevance(self,
                           candidates: List[VideoCandidate]) -> List[VideoCandidate]:
        """Score candidates in place and return them sorted by score.

        Scoring: +0.15 per configured keyword found in the title, a
        channel-priority bonus, and +0.1 per high-value title phrasing.
        The stored score is capped at 1.0; the relevance counter uses the
        uncapped score with a 0.3 threshold.
        """
        keywords = self.channels.get("relevance_keywords", [])

        # Title phrasings that usually signal actionable content.
        # Compiled once here instead of re-scanned per candidate.
        high_value_patterns = [re.compile(p) for p in (
            r"tutorial", r"how to", r"guide", r"setup",
            r"breakthrough", r"game.?changer", r"secret",
            r"new feature", r"update", r"release"
        )]
        # Channels marked higher priority get a head start.
        priority_bonus = {"critical": 0.3, "high": 0.2, "medium": 0.1, "low": 0.0}

        for candidate in candidates:
            score = 0.0
            matched = []
            title_lower = candidate.title.lower()

            # Keyword matching
            for kw in keywords:
                if kw.lower() in title_lower:
                    score += 0.15
                    matched.append(kw)

            # Unknown priorities fall back to the "medium" bonus.
            score += priority_bonus.get(candidate.priority, 0.1)

            for pattern in high_value_patterns:
                if pattern.search(title_lower):
                    score += 0.1

            candidate.relevance_score = min(score, 1.0)
            candidate.matched_keywords = matched

            if score >= 0.3:
                self.stats["videos_relevant"] += 1

        # Sort by relevance score, best first.
        candidates.sort(key=lambda x: x.relevance_score, reverse=True)
        return candidates

    def _fetch_transcripts(self,
                           videos: List[VideoCandidate]) -> Dict[str, Dict]:
        """Fetch transcripts for each video using the Supadata API.

        Failures are reported per video and skipped; a video with no
        transcript simply does not appear in the result.

        Returns:
            Mapping of video_id -> {"video", "transcript", "word_count"}.
        """
        transcripts = {}

        for video in videos:
            try:
                transcript = self._fetch_supadata_transcript(video.video_id)
                if transcript:
                    # Split once; used for both the record and the log line.
                    word_count = len(transcript.split())
                    transcripts[video.video_id] = {
                        "video": asdict(video),
                        "transcript": transcript,
                        "word_count": word_count
                    }
                    self.stats["transcripts_fetched"] += 1
                    print(f"    Got: {video.title[:50]}... ({word_count} words)")
            except Exception as e:
                print(f"    Failed: {video.video_id} - {e}")
                continue

        return transcripts

    def _fetch_supadata_transcript(self, video_id: str) -> Optional[str]:
        """Fetch one transcript from the Supadata API.

        Handles the response shapes observed from the API: a dict with a
        "content" segment list, a dict with a plain "transcript" string,
        or a bare segment list.

        Returns:
            The transcript as a single space-joined string, or None when
            the response has no recognizable transcript payload.

        Raises:
            Exception: If no API key is configured.
            requests.HTTPError: On a non-2xx API response.
        """
        if not self.supadata_api_key:
            raise Exception("Supadata API key not configured")

        url = "https://api.supadata.ai/v1/youtube/transcript"
        headers = {"x-api-key": self.supadata_api_key}
        params = {"videoId": video_id}

        response = requests.get(url, headers=headers, params=params, timeout=120)
        response.raise_for_status()

        result = response.json()

        # Check the container type first so a list response is never
        # subjected to dict-style key membership tests.
        if isinstance(result, list):
            return " ".join(seg.get("text", "") for seg in result)
        if isinstance(result, dict):
            content = result.get("content")
            if isinstance(content, list):
                return " ".join(seg.get("text", "") for seg in content)
            if "transcript" in result:
                return result["transcript"]

        return None

    def _extract_insights(self,
                          transcripts: Dict[str, Dict]) -> List[InsightExtraction]:
        """
        Extract actionable insights from transcripts.

        Uses pattern matching and heuristics. For production,
        integrate with LLM for deeper analysis.

        Returns:
            One InsightExtraction per pattern hit that passes the
            minimum-length noise filter (max 2 hits per pattern per video).
        """
        insights = []

        # Insight patterns to detect, grouped by insight type.
        insight_patterns = {
            "tool": [
                r"(?:new|just released|announcing|introducing)\s+(\w+(?:\s+\w+)?)",
                r"(?:tool called|using|try)\s+(\w+(?:\.\w+)?)",
                r"(?:mcp server|claude code|gemini)\s+(?:for|with)\s+(\w+)"
            ],
            "technique": [
                r"(?:technique|approach|method|pattern)\s+(?:called|named|is)\s+(\w+(?:\s+\w+)?)",
                r"(?:key insight|trick|secret)\s+(?:is|to)\s+(.{20,80})",
                r"(?:best practice|pro tip)\s*[:\-]\s*(.{20,80})"
            ],
            "integration": [
                r"(?:integrate|connect|combine)\s+(\w+)\s+(?:with|and)\s+(\w+)",
                r"(?:api|mcp|workflow)\s+(?:for|to)\s+(\w+)"
            ],
            "architecture": [
                r"(?:architecture|system|framework)\s+(?:for|to)\s+(.{20,80})",
                r"(?:multi-agent|orchestration|pipeline)\s+(.{20,60})"
            ]
        }

        for video_id, data in transcripts.items():
            transcript = data["transcript"]
            video = data["video"]

            # Look for insights
            for insight_type, patterns in insight_patterns.items():
                for pattern in patterns:
                    matches = re.findall(pattern, transcript, re.IGNORECASE)
                    for match in matches[:2]:  # Max 2 per pattern
                        # Multi-group patterns yield tuples; flatten them.
                        if isinstance(match, tuple):
                            match = " ".join(match)

                        if len(match) > 10:  # Filter noise
                            insight = InsightExtraction(
                                video_id=video_id,
                                video_title=video["title"],
                                insight_type=insight_type,
                                summary=match[:200],
                                actionable_items=self._generate_actionable_items(
                                    insight_type, match
                                ),
                                genesis_relevance=self._assess_genesis_relevance(match),
                                implementation_complexity=self._assess_complexity(match),
                                confidence=0.7,
                                source_quotes=[match[:300]]
                            )
                            insights.append(insight)
                            self.stats["insights_extracted"] += 1

        return insights

    def _generate_actionable_items(self, insight_type: str,
                                    content: str) -> List[str]:
        """Generate follow-up action items for an insight.

        Args:
            insight_type: One of tool/technique/integration/architecture.
            content: The matched insight text.

        Returns:
            A short checklist; empty for unrecognized insight types
            (e.g. "pattern", which no template currently covers).
        """
        items = []

        if insight_type == "tool":
            items = [
                f"Research {content} capabilities and pricing",
                f"Test {content} integration with Genesis stack",
                f"Document {content} usage patterns"
            ]
        elif insight_type == "technique":
            items = [
                f"Implement technique: {content[:50]}",
                "Test in controlled environment",
                "Measure improvement vs baseline"
            ]
        elif insight_type == "integration":
            items = [
                f"Design integration architecture for {content[:50]}",
                "Create MCP server or skill wrapper",
                "Add to Genesis capability matrix"
            ]
        elif insight_type == "architecture":
            items = [
                f"Evaluate architecture: {content[:50]}",
                "Compare with existing Genesis patterns",
                "Plan incremental adoption if beneficial"
            ]

        return items

    def _assess_genesis_relevance(self, content: str) -> str:
        """Classify insight relevance to Genesis by keyword lookup.

        Returns:
            "high", "medium", or "low".
        """
        content_lower = content.lower()

        high_relevance = ["claude", "gemini", "mcp", "agent", "autonomous",
                         "voice ai", "memory", "orchestration"]
        medium_relevance = ["api", "automation", "workflow", "n8n", "coding"]

        if any(kw in content_lower for kw in high_relevance):
            return "high"
        elif any(kw in content_lower for kw in medium_relevance):
            return "medium"
        return "low"

    def _assess_complexity(self, content: str) -> str:
        """Estimate implementation complexity by keyword lookup.

        High-complexity keywords win over low-complexity ones when both
        appear. Returns "high", "medium", or "low".
        """
        content_lower = content.lower()

        high_complexity = ["architecture", "framework", "system", "multi-agent"]
        low_complexity = ["tool", "api", "simple", "quick"]

        if any(kw in content_lower for kw in high_complexity):
            return "high"
        elif any(kw in content_lower for kw in low_complexity):
            return "low"
        return "medium"

    def _create_rwl_tasks(self,
                          insights: List[InsightExtraction],
                          transcripts: Optional[Dict[str, Dict]] = None) -> List[RWLTask]:
        """Create RWL tasks from high/medium relevance insights.

        Args:
            insights: Extracted insights to convert.
            transcripts: Optional transcript map (as returned by
                _fetch_transcripts), used to resolve the real source
                channel name. Previously source_channel was incorrectly
                set to the video title; the title remains the fallback
                for backward compatibility when metadata is unavailable.

        Returns:
            Newly created tasks (low-relevance insights are dropped).
        """
        transcripts = transcripts or {}
        tasks = []

        for insight in insights:
            if insight.genesis_relevance not in ("high", "medium"):
                continue

            # Stable content-derived ID; md5 is used only for dedupe,
            # never for security.
            task_id = hashlib.md5(
                f"{insight.video_id}_{insight.summary[:50]}".encode()
            ).hexdigest()[:12]

            # Resolve the actual channel name from transcript metadata.
            video_meta = transcripts.get(insight.video_id, {}).get("video", {})
            channel_name = video_meta.get("channel_name") or insight.video_title

            task = RWLTask(
                id=f"yt_{task_id}",
                title=f"Implement: {insight.summary[:60]}",
                description=f"Source: {insight.video_title}\n\n"
                           f"Insight: {insight.summary}\n\n"
                           f"Type: {insight.insight_type}",
                source_video=f"https://youtube.com/watch?v={insight.video_id}",
                source_channel=channel_name,
                insight_type=insight.insight_type,
                priority="high" if insight.genesis_relevance == "high" else "medium",
                estimated_complexity=insight.implementation_complexity,
                acceptance_criteria=insight.actionable_items,
                created_at=datetime.now().isoformat(),
                status="pending"
            )
            tasks.append(task)
            self.stats["tasks_created"] += 1

        return tasks

    def _save_results(self, videos: List[VideoCandidate],
                      transcripts: Dict, insights: List[InsightExtraction],
                      tasks: List[RWLTask]) -> None:
        """Persist transcripts, insights, and the merged RWL task file.

        Args:
            videos: Ranked candidates (currently unused; kept for
                interface stability and future reporting).
            transcripts: video_id -> transcript record to write per video.
            insights: Insights to write as a timestamped batch file.
            tasks: New tasks to merge (deduped by ID) into RWL_TASKS_FILE.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save transcripts, one JSON file per video.
        for video_id, data in transcripts.items():
            output_file = SCOUT_OUTPUT / f"{video_id}.json"
            with open(output_file, 'w') as f:
                json.dump(data, f, indent=2)

        # Save insights as a timestamped batch.
        insights_file = SCOUT_OUTPUT / f"insights_{timestamp}.json"
        with open(insights_file, 'w') as f:
            json.dump([asdict(i) for i in insights], f, indent=2)

        # Save/append RWL tasks.
        existing_tasks = []
        if RWL_TASKS_FILE.exists():
            with open(RWL_TASKS_FILE) as f:
                existing_tasks = json.load(f).get("tasks", [])

        # Dedupe by task ID so re-runs never duplicate work items.
        existing_ids = {t.get("id") for t in existing_tasks}
        new_tasks = [asdict(t) for t in tasks if t.id not in existing_ids]

        all_tasks = existing_tasks + new_tasks
        # __init__ only creates the scout output/log dirs, so make sure the
        # task file's directory exists before writing.
        RWL_TASKS_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(RWL_TASKS_FILE, 'w') as f:
            json.dump({
                "generated_at": datetime.now().isoformat(),
                "total_tasks": len(all_tasks),
                "pending": len([t for t in all_tasks if t.get("status") == "pending"]),
                "tasks": all_tasks
            }, f, indent=2)

    def _log_scout_run(self) -> None:
        """Append this run's stats as one JSONL line to SCOUT_LOG."""
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "stats": self.stats
        }

        with open(SCOUT_LOG, 'a') as f:
            f.write(json.dumps(log_entry) + "\n")


def main():
    """Entry point: run the nightly scout with default settings."""
    scout = YouTubeScoutSkill()
    return scout.run_nightly_scout(max_videos=10, hours_lookback=48)


# Allow running this module directly as a script (normally via a nightly cron).
if __name__ == "__main__":
    main()
