#!/usr/bin/env python3
"""
Video Entity Extractor for Knowledge Graph

Extracts structured entities from video transcripts for ingestion
into the Genesis knowledge graph.

Entity Types:
- person: Named individuals mentioned
- organization: Companies, groups, institutions
- product: Products, services, platforms
- technology: Technologies, frameworks, tools
- concept: Ideas, methodologies, patterns
- action_item: Actionable advice or steps
- quote: Notable quotes or phrases
- metric: Statistics, numbers, benchmarks

Usage:
    extractor = VideoEntityExtractor()
    entities = extractor.extract(transcript_text, video_metadata)

Author: Genesis System
Version: 1.0.0
"""

import json
import logging
import re
import sys
from collections import Counter
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

# Make project-local modules importable when this file is run as a script.
# NOTE(review): hard-coded absolute path — assumes the Genesis checkout lives
# at /mnt/e/genesis-system; confirm before running on another machine.
sys.path.insert(0, '/mnt/e/genesis-system')

# basicConfig is a no-op if the embedding application already configured logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Root of the Genesis project tree; used to build default output locations.
GENESIS_ROOT = Path("/mnt/e/genesis-system")


@dataclass
class ExtractedEntity:
    """An entity extracted from video transcript."""
    entity_type: str  # one of the types listed in the module docstring, e.g. "person", "technology"
    value: str  # surface form of the entity as it appears (or is normalized) in the transcript
    confidence: float  # 0.0 to 1.0
    mentions: int = 1  # number of occurrences counted in the transcript
    timestamp_refs: List[float] = field(default_factory=list)  # seconds into the video, when known
    context: Optional[str] = None  # surrounding snippet of transcript text, if captured
    metadata: Dict[str, Any] = field(default_factory=dict)  # extractor-specific extras

    def to_dict(self) -> Dict[str, Any]:
        # Recursive dataclass -> plain dict conversion, suitable for json.dumps.
        return asdict(self)


@dataclass
class ExtractedInsight:
    """An insight extracted from video transcript."""
    insight_type: str  # e.g. "key_point", "recommendation", "warning", "trend", "statistic"
    content: str  # the insight text captured by the matching pattern
    score: float  # Relevance/value score 0.0 to 1.0
    source_text: Optional[str] = None  # original sentence/passage the insight came from, if kept
    timestamp_ref: Optional[float] = None  # seconds into the video, when known
    metadata: Dict[str, Any] = field(default_factory=dict)  # extractor-specific extras

    def to_dict(self) -> Dict[str, Any]:
        # Recursive dataclass -> plain dict conversion, suitable for json.dumps.
        return asdict(self)


@dataclass
class ExtractionResult:
    """Complete extraction result from a video."""
    video_id: str  # source video identifier ("unknown" when metadata is absent)
    entities: List[ExtractedEntity]
    insights: List[ExtractedInsight]
    topics: List[str]  # topic keywords ordered by frequency
    key_phrases: List[str]  # frequent bigrams from the transcript
    summary: str  # short extractive summary
    # UTC timestamp rendered as ISO-8601 with a trailing "Z".
    # datetime.utcnow() is deprecated since Python 3.12, so use an aware
    # datetime and keep the exact same "...Z" output format.
    extracted_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    )

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict with nested entities/insights expanded."""
        return {
            "video_id": self.video_id,
            "entities": [e.to_dict() for e in self.entities],
            "insights": [i.to_dict() for i in self.insights],
            "topics": self.topics,
            "key_phrases": self.key_phrases,
            "summary": self.summary,
            "extracted_at": self.extracted_at
        }

    def to_jsonl(self) -> str:
        """Convert to JSONL format for knowledge graph ingestion.

        One JSON object per line; each line is tagged with "type"
        ("entity" or "insight") and the video_id so lines are
        self-describing when streamed into the graph.
        """
        lines = []

        # Entity lines
        for entity in self.entities:
            lines.append(json.dumps({
                "type": "entity",
                "video_id": self.video_id,
                **entity.to_dict()
            }))

        # Insight lines
        for insight in self.insights:
            lines.append(json.dumps({
                "type": "insight",
                "video_id": self.video_id,
                **insight.to_dict()
            }))

        return "\n".join(lines)


class VideoEntityExtractor:
    """Extracts entities and insights from video transcripts.

    Extraction is purely lexical (regex and keyword tables) — no ML
    models, no network calls.  The class-level pattern tables below may
    be extended by subclasses before instantiation.
    """

    # Technology/tool patterns: regex -> entity_type assigned to each match.
    TECH_PATTERNS = {
        # AI/ML
        r'\b(claude|gpt-?4|gemini|llama|mistral|anthropic|openai)\b': 'technology',
        r'\b(langchain|llamaindex|autogen|crewai|semantic kernel)\b': 'technology',
        r'\b(rag|retrieval augmented|vector database|embedding)\b': 'concept',

        # Platforms
        r'\b(github|gitlab|vercel|railway|aws|gcp|azure)\b': 'technology',
        r'\b(supabase|firebase|postgresql|mongodb|redis)\b': 'technology',
        r'\b(n8n|zapier|make\.com|pipedream)\b': 'technology',

        # Frameworks
        r'\b(react|nextjs|vue|angular|svelte)\b': 'technology',
        r'\b(fastapi|flask|django|express)\b': 'technology',
        r'\b(typescript|python|javascript|rust|go)\b': 'technology',

        # Business tools
        r'\b(stripe|twilio|sendgrid|mailchimp)\b': 'product',
        r'\b(notion|slack|discord|linear)\b': 'product',
        r'\b(gohighlevel|highlevel|ghl)\b': 'product',
    }

    # Phrases that introduce actionable advice; group 1 captures the advice text.
    ACTION_PATTERNS = [
        r"(?:you should|you need to|make sure to|don't forget to)\s+([^.!?]+)",
        r"(?:step \d+[:\s]+)([^.!?]+)",
        r"(?:first|second|third|then|next|finally)[,:\s]+([^.!?]+)",
        r"(?:the key is to|important to|critical to)\s+([^.!?]+)",
        r"(?:always|never)\s+([^.!?]+)",
    ]

    # Insight type -> phrases that introduce that kind of insight.
    INSIGHT_PATTERNS = {
        "key_point": [
            r"(?:the main|the key|the important|crucial)\s+(?:thing|point|takeaway)\s+(?:is|here)\s+([^.!?]+)",
            r"(?:remember|note that|keep in mind)\s+([^.!?]+)",
        ],
        "recommendation": [
            r"(?:i recommend|i suggest|my recommendation)\s+([^.!?]+)",
            r"(?:the best way|the easiest way|the fastest way)\s+(?:to|is)\s+([^.!?]+)",
        ],
        "warning": [
            r"(?:be careful|watch out|avoid|don't)\s+([^.!?]+)",
            r"(?:common mistake|biggest mistake|pitfall)\s+(?:is|here)\s+([^.!?]+)",
        ],
        "trend": [
            r"(?:trending|becoming popular|on the rise|growing)\s+([^.!?]+)",
            r"(?:in 202\d|this year|next year)\s+([^.!?]+)",
        ],
        "statistic": [
            r"(\d+(?:\.\d+)?%?\s+(?:of|percent|times|faster|slower|more|less))[^.!?]*",
            r"(?:on average|typically|usually)\s+([^.!?]+)",
        ],
    }

    def __init__(self):
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Pre-compile regex patterns once so the extraction loops stay fast."""
        self.tech_patterns = {
            re.compile(pattern, re.IGNORECASE): entity_type
            for pattern, entity_type in self.TECH_PATTERNS.items()
        }

        self.action_patterns = [
            re.compile(pattern, re.IGNORECASE)
            for pattern in self.ACTION_PATTERNS
        ]

        self.insight_patterns = {
            insight_type: [re.compile(p, re.IGNORECASE) for p in patterns]
            for insight_type, patterns in self.INSIGHT_PATTERNS.items()
        }

    def extract(
        self,
        transcript_text: str,
        video_metadata: Optional[Dict[str, Any]] = None
    ) -> ExtractionResult:
        """
        Extract all entities and insights from transcript.

        Args:
            transcript_text: Full transcript text
            video_metadata: Optional metadata (video_id, title, channel, etc.)

        Returns:
            ExtractionResult with entities, insights, topics, and summary
        """
        video_metadata = video_metadata or {}
        video_id = video_metadata.get("video_id", "unknown")

        # Extract different entity types
        entities = []
        entities.extend(self._extract_technologies(transcript_text))
        entities.extend(self._extract_people(transcript_text))
        entities.extend(self._extract_organizations(transcript_text))
        entities.extend(self._extract_action_items(transcript_text))

        # Extract insights
        insights = self._extract_insights(transcript_text)

        # Extract topics
        topics = self._extract_topics(transcript_text)

        # Extract key phrases
        key_phrases = self._extract_key_phrases(transcript_text)

        # Generate summary
        summary = self._generate_summary(transcript_text)

        return ExtractionResult(
            video_id=video_id,
            entities=entities,
            insights=insights,
            topics=topics,
            key_phrases=key_phrases,
            summary=summary
        )

    def _extract_technologies(self, text: str) -> List[ExtractedEntity]:
        """Extract technology/tool mentions.

        Mentions are tallied from the whole-word regex matches themselves.
        A plain substring count (str.count) would overcount short names
        such as "go" (inside "good") or "rag" (inside "average"), and
        iterating set(matches) would emit a duplicate entity per case
        variant ("Claude" vs "claude").
        """
        entities = []

        for pattern, entity_type in self.tech_patterns.items():
            # Group case variants into one entity, keeping the first
            # surface form seen as the display value.
            surface: Dict[str, str] = {}
            tallies: Counter = Counter()
            for match in pattern.findall(text):
                key = match.lower()
                surface.setdefault(key, match)
                tallies[key] += 1

            for key, count in tallies.items():
                # Confidence grows with repetition, capped at 1.0.
                confidence = min(0.5 + (count * 0.1), 1.0)
                entities.append(ExtractedEntity(
                    entity_type=entity_type,
                    value=surface[key],
                    confidence=confidence,
                    mentions=count
                ))

        return entities

    def _extract_people(self, text: str) -> List[ExtractedEntity]:
        """Extract person names.

        Heuristic: two adjacent capitalized words mentioned at least
        twice, minus a small list of known non-person phrases.  This
        will still miss single-name mentions and can pass unlisted
        place/product names — acceptable for this lexical extractor.
        """
        entities = []

        # Pattern for proper names (two capitalized words)
        pattern = r'\b([A-Z][a-z]+\s+[A-Z][a-z]+)\b'
        matches = re.findall(pattern, text)

        # Filter false positives
        false_positives = {
            'New York', 'San Francisco', 'Los Angeles', 'United States',
            'Open Source', 'Machine Learning', 'Deep Learning',
            'Natural Language', 'Computer Vision'
        }

        name_counts = Counter(matches)
        for name, count in name_counts.items():
            # Require >= 2 mentions to cut one-off capitalization noise.
            if name not in false_positives and count >= 2:
                entities.append(ExtractedEntity(
                    entity_type="person",
                    value=name,
                    confidence=min(0.4 + (count * 0.1), 0.9),
                    mentions=count
                ))

        return entities

    def _extract_organizations(self, text: str) -> List[ExtractedEntity]:
        """Extract mentions of well-known organizations.

        Matching is whole-word: a substring count would wrongly find
        "intel" inside "intelligence" or "meta" inside "metadata".
        """
        entities = []

        # Known organizations
        orgs = {
            'google', 'microsoft', 'amazon', 'apple', 'meta', 'facebook',
            'anthropic', 'openai', 'deepmind', 'nvidia', 'tesla',
            'y combinator', 'sequoia', 'andreessen horowitz', 'a16z',
            'salesforce', 'oracle', 'ibm', 'intel', 'amd'
        }

        text_lower = text.lower()
        # sorted() makes output order deterministic (set iteration varies
        # across runs under hash randomization).
        for org in sorted(orgs):
            count = len(re.findall(r'\b' + re.escape(org) + r'\b', text_lower))
            if count > 0:
                entities.append(ExtractedEntity(
                    entity_type="organization",
                    value=org.title(),
                    confidence=0.9,
                    mentions=count
                ))

        return entities

    def _extract_action_items(self, text: str) -> List[ExtractedEntity]:
        """Extract actionable advice phrases captured by ACTION_PATTERNS."""
        entities = []

        for pattern in self.action_patterns:
            matches = pattern.findall(text)
            for match in matches[:5]:  # Limit to 5 per pattern
                # Drop fragments that are too short to be advice or too
                # long to be a single actionable item.
                if len(match) > 10 and len(match) < 200:
                    entities.append(ExtractedEntity(
                        entity_type="action_item",
                        value=match.strip(),
                        confidence=0.7
                    ))

        return entities

    def _extract_insights(self, text: str) -> List[ExtractedInsight]:
        """Extract insights by type using INSIGHT_PATTERNS."""
        insights = []

        for insight_type, patterns in self.insight_patterns.items():
            for pattern in patterns:
                matches = pattern.findall(text)
                for match in matches[:3]:  # Limit per type
                    if len(match) > 10:
                        insights.append(ExtractedInsight(
                            insight_type=insight_type,
                            content=match.strip(),
                            score=0.6
                        ))

        return insights

    def _extract_topics(self, text: str) -> List[str]:
        """Extract the most frequent known topic keywords (top 10).

        Whole-word matching avoids false hits such as "ai" inside "main"
        or "rag" inside "storage".
        """
        topic_keywords = {
            'ai', 'artificial intelligence', 'machine learning', 'deep learning',
            'llm', 'large language model', 'agent', 'automation',
            'saas', 'startup', 'revenue', 'growth', 'marketing', 'sales',
            'api', 'integration', 'workflow', 'pipeline',
            'voice', 'audio', 'video', 'content',
            'database', 'vector', 'embedding', 'rag',
            'security', 'authentication', 'deployment', 'scaling'
        }

        text_lower = text.lower()
        found_topics = []

        for keyword in topic_keywords:
            count = len(re.findall(r'\b' + re.escape(keyword) + r'\b', text_lower))
            if count > 0:
                found_topics.append((keyword, count))

        # Most frequent first; alphabetical tie-break keeps output
        # deterministic across runs despite set iteration order.
        found_topics.sort(key=lambda kv: (-kv[1], kv[0]))
        return [topic for topic, _ in found_topics[:10]]

    def _extract_key_phrases(self, text: str) -> List[str]:
        """Extract key phrases (frequent bigrams over 3+-letter words)."""
        # Clean and tokenize; words shorter than 3 letters are dropped,
        # so most function-word bigrams never occur.
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

        # Generate bigrams
        bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words) - 1)]

        # Count and filter
        phrase_counts = Counter(bigrams)

        # Filter common phrases (some entries are unreachable given the
        # 3-letter token minimum; kept as a harmless safety net).
        stopword_phrases = {
            'the the', 'and the', 'to the', 'of the', 'in the',
            'it is', 'this is', 'that is', 'there is', 'they are'
        }

        top_phrases = [
            phrase for phrase, count in phrase_counts.most_common(30)
            if phrase not in stopword_phrases and count > 2
        ]

        return top_phrases[:15]

    def _generate_summary(self, text: str, max_sentences: int = 3) -> str:
        """Generate a short extractive summary.

        Sentences are scored by position, topic-keyword density, and
        length; the top-scoring ones are emitted in their ORIGINAL
        document order so the summary reads coherently (emitting them in
        score order produced scrambled summaries).
        """
        # Split into sentences
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

        if not sentences:
            return "No summary available."

        keywords = set(self._extract_topics(text))

        # Score sentences by position and content
        scored = []
        for i, sentence in enumerate(sentences):
            # Position score (first sentences tend to state the thesis)
            position_score = 1.0 / (i + 1)

            # Whole-word keyword hits; substring checks would count "ai"
            # inside "main".
            sentence_lower = sentence.lower()
            keyword_score = sum(
                1 for kw in keywords
                if re.search(r'\b' + re.escape(kw) + r'\b', sentence_lower)
            )

            # Length score (prefer medium-length sentences)
            length = len(sentence.split())
            length_score = 1.0 if 10 <= length <= 30 else 0.5

            total_score = position_score + (keyword_score * 0.3) + length_score
            scored.append((i, sentence, total_score))

        # Pick the best sentences, then restore document order.
        top = sorted(scored, key=lambda t: t[2], reverse=True)[:max_sentences]
        top.sort(key=lambda t: t[0])

        return '. '.join(s for _, s, _ in top) + '.'


def save_extraction_results(
    result: ExtractionResult,
    output_dir: Optional[Path] = None
) -> Path:
    """Save extraction results to files.

    Writes ``<video_id>_entities.json`` (the full result) and
    ``<video_id>_entities.jsonl`` (one record per entity/insight, for
    knowledge-graph ingestion) into *output_dir*.

    Args:
        result: Extraction result to persist.
        output_dir: Target directory; defaults to the Genesis data tree.

    Returns:
        Path to the JSON file that was written.
    """
    output_dir = output_dir or GENESIS_ROOT / "data" / "youtube" / "extractions"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save as JSON.  Explicit UTF-8 so output does not depend on the
    # platform's locale encoding.
    json_path = output_dir / f"{result.video_id}_entities.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result.to_dict(), f, indent=2)

    # Save as JSONL for knowledge graph
    jsonl_path = output_dir / f"{result.video_id}_entities.jsonl"
    with open(jsonl_path, 'w', encoding='utf-8') as f:
        f.write(result.to_jsonl())

    logger.info(f"Saved extraction results to {output_dir}")
    return json_path


def main():
    """CLI for entity extraction.

    Reads a transcript from --file or --text (falling back to a built-in
    demo), runs the extractor, prints a report, and optionally saves the
    results with --output.

    Note: this used to be an ``async def`` run via ``asyncio.run`` even
    though nothing here awaits; it is now a plain function.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Extract entities from video transcript")
    parser.add_argument("--text", help="Transcript text to process")
    parser.add_argument("--file", type=Path, help="File containing transcript")
    parser.add_argument("--video-id", default="test", help="Video ID")
    parser.add_argument("--output", type=Path, help="Output directory")

    args = parser.parse_args()

    if args.file:
        text = args.file.read_text(encoding="utf-8")
    elif args.text:
        text = args.text
    else:
        # Demo text
        text = """
        Welcome to this tutorial on building AI agents with Claude and LangChain.
        I'm going to show you how to create a powerful automation system.
        The key thing to remember is that you should always test your prompts carefully.
        We'll be using Python and FastAPI for the backend.
        Companies like Anthropic and OpenAI are leading the way in this space.
        Step 1: Set up your development environment.
        Step 2: Install the required dependencies.
        On average, these systems can handle 100 requests per second.
        The biggest mistake people make is not handling errors properly.
        I recommend using structured outputs for better reliability.
        """

    extractor = VideoEntityExtractor()
    result = extractor.extract(text, {"video_id": args.video_id})

    print(f"\n{'='*60}")
    print("Entity Extraction Results")
    print(f"{'='*60}")
    print(f"Video ID: {result.video_id}")
    print(f"Entities: {len(result.entities)}")
    print(f"Insights: {len(result.insights)}")
    print(f"Topics: {', '.join(result.topics[:5])}")

    print("\nTop Entities:")
    for e in result.entities[:5]:
        print(f"  [{e.entity_type}] {e.value} (conf: {e.confidence:.2f})")

    print("\nInsights:")
    for i in result.insights[:3]:
        print(f"  [{i.insight_type}] {i.content[:60]}...")

    print("\nSummary:")
    print(f"  {result.summary}")

    if args.output:
        save_extraction_results(result, args.output)


if __name__ == "__main__":
    main()
