#!/usr/bin/env python3
"""
YouTube KG Builder
===================
Takes processed YouTube video transcripts and extracts structured Knowledge Graph
entities — tools, tactics, prices, workflows, quotes, and frameworks.

Pipeline:
    YouTubeIntelligence.process_video()
        → YouTubeKGBuilder.extract_entities()
            → KG entities (JSONL)
            → Summary document (MD in .gemini/knowledge/)

Entity types extracted:
    - tool         : Software, apps, platforms mentioned
    - tactic       : Strategies, methods, techniques
    - price        : Pricing data, costs, margins, fees
    - workflow     : Step-by-step processes
    - quote        : Key quotes worth preserving verbatim
    - insight      : High-value one-liner takeaways
    - framework    : Conceptual models and decision frameworks
    - case_study   : Real examples with numbers/outcomes

Output:
    KG entities:  E:/genesis-system/KNOWLEDGE_GRAPH/entities/youtube_intel_{date}.jsonl
    KB summary:   E:/genesis-system/.gemini/knowledge/{channel}_{video_id}_KB.md

NO SQLite. All storage is JSONL flat files.

Usage:
    from core.youtube_kg_builder import YouTubeKGBuilder
    from core.youtube_intelligence import YouTubeIntelligence

    yt = YouTubeIntelligence()
    raw = yt.process_video("https://youtu.be/VIDEO_ID")

    builder = YouTubeKGBuilder()
    entities = builder.extract_entities(raw, channel_context="nick_pontes_ghl")
    builder.save_entities(entities)
    builder.save_kb_summary(entities, raw)
"""

import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# ── Storage paths ──────────────────────────────────────────────────────────────
# Module-level defaults for the genesis-system layout. Note that
# YouTubeKGBuilder.__init__ rebuilds its own kg/kb paths from BASE_PATH (or a
# caller-supplied root); the *_DIR constants below are not referenced anywhere
# else in this module — presumably convenience exports for importers (TODO confirm).
BASE_PATH = Path("E:/genesis-system")  # system root (Windows drive layout)
KG_ENTITIES_DIR = BASE_PATH / "KNOWLEDGE_GRAPH" / "entities"  # JSONL entity files
KB_SUMMARY_DIR = BASE_PATH / ".gemini" / "knowledge"  # human-readable MD summaries
PROCESSED_DIR = BASE_PATH / "data" / "youtube_knowledge_base" / "processed"  # upstream transcripts — TODO confirm


class YouTubeKGBuilder:
    """
    Extracts structured KG entities from YouTube video transcripts.

    Uses pattern matching + heuristic extraction. For deeper extraction,
    pass the transcript through Gemini/Claude via the RLM pipeline.
    """

    def __init__(self, base_path: Optional[str] = None):
        """Root the builder at *base_path*; fall back to the module default root.

        Derives the KG entities dir and KB summary dir from the root and
        creates both immediately so later writes cannot fail on a missing dir.
        """
        if base_path:
            self.base = Path(base_path)
        else:
            self.base = BASE_PATH
        self.kg_dir = self.base / "KNOWLEDGE_GRAPH" / "entities"
        self.kb_dir = self.base / ".gemini" / "knowledge"
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Create both output directories if missing (idempotent)."""
        self.kg_dir.mkdir(parents=True, exist_ok=True)
        self.kb_dir.mkdir(parents=True, exist_ok=True)

    def _log(self, msg: str):
        """Print a timestamped status line, prefixed with the [KG] tag."""
        stamp = datetime.now().strftime("%H:%M:%S")
        print(" ".join(("[KG]", stamp, msg)))

    # ── Entity ID generation ───────────────────────────────────────────────────

    def _make_entity_id(self, video_id: str, entity_type: str, index: int) -> str:
        """Build a stable entity ID of the form YT-{TYPE3}-{VIDEO6}-{NNN}."""
        type_code = entity_type[:3].upper()
        video_code = video_id[:6].upper()
        return "YT-{}-{}-{:03d}".format(type_code, video_code, index)

    def _entity_hash(self, content: str) -> str:
        """Return the first 8 hex chars of MD5(*content*) — a dedup key, not security."""
        digest = hashlib.md5(content.encode())
        return digest.hexdigest()[:8]

    # ── Pattern-based extraction ───────────────────────────────────────────────

    def _extract_prices(self, text: str) -> List[Dict[str, str]]:
        """
        Extract price mentions from transcript text.

        Matches patterns like "$97/mo", "$1,500", "AUD $299", "30% recurring",
        "500 USD". Returns up to 20 dicts of {"raw": matched text, "context": ""}
        (the caller fills in context later). Duplicate raw strings are dropped
        in first-seen order — previously the same mention was appended once per
        pattern that matched it.
        """
        # NOTE: the first pattern originally repeated "(?:per\s+)?" twice — a
        # copy/paste slip; one optional "per " suffices.
        patterns = [
            r'\$[\d,]+(?:\.\d{2})?\s*(?:per\s+)?(?:mo(?:nth)?|yr|year|day|week|month)?',
            r'AUD\s*\$?[\d,]+',
            r'[\d]+%\s*(?:commission|margin|markup|recurring|off|discount)',
            r'[\d,]+\s*(?:USD|AUD|dollars?)\s*(?:per\s+)?(?:mo(?:nth)?|year)?',
        ]

        prices: List[Dict[str, str]] = []
        seen = set()
        for pattern in patterns:
            for match in re.findall(pattern, text, re.IGNORECASE):
                clean = match.strip()
                # len > 2 discards tiny fragments (e.g. a lone "$9"); dedupe on raw.
                if len(clean) > 2 and clean not in seen:
                    seen.add(clean)
                    prices.append({"raw": clean, "context": ""})

        return prices[:20]  # Cap at 20 price mentions

    def _extract_tools(self, text: str) -> List[str]:
        """
        Extract known tool/platform names from transcript text.

        Matching is naive case-insensitive substring search against a fixed
        allowlist, so short aliases can fire inside longer names (e.g.
        "GoHighLevel" contains both "ghl" and "highlevel").

        Results are returned in allowlist order. The previous list(set(...))
        round-trip made the ordering nondeterministic across runs (str hash
        randomization), which made entity summaries and logs unstable; the
        allowlist contains no duplicates, so an ordered scan needs no dedupe.
        """
        known_tools = [
            "GoHighLevel", "GHL", "HighLevel", "High Level",
            "Telnyx", "Vapi", "Bland AI", "Retell AI", "ElevenLabs",
            "Twilio", "UpHex", "FormWise", "Extendly", "HL Pro Tools",
            "Growthable", "Loom", "Calendly", "n8n", "Zapier",
            "Gemini", "ChatGPT", "OpenAI", "Claude", "Anthropic",
            "Stripe", "PayPal", "Xero", "QuickBooks",
            "Google Business Profile", "Google Maps", "Perplexity",
            "Hootsuite", "Buffer", "Podium", "Birdeye",
            "WordPress", "Elementor", "Wix", "Squarespace",
            "Facebook Ads", "Google Ads", "Meta Ads",
            "Mailchimp", "ActiveCampaign", "HubSpot",
            "Slack", "Notion", "ClickUp", "Asana",
            "YouTube", "TikTok", "Instagram", "LinkedIn"
        ]

        text_lower = text.lower()
        # Preserve allowlist order for reproducible output.
        return [tool for tool in known_tools if tool.lower() in text_lower]

    def _extract_numbers(self, text: str) -> List[str]:
        """
        Extract key metric numbers WITH their context phrase — "87 clients",
        "30 days", "41% close rate" — rather than bare digits.

        Bug fixed: the previous re.findall() version used capturing groups, so
        it returned only the captured digits ("87") and dropped the unit/context
        this docstring promises, producing useless downstream summaries like
        "Key numbers: 87, 30". finditer()/group(0) keeps the whole phrase.
        Results are deduplicated in first-seen order (previously a set() made
        ordering nondeterministic) and capped at 10.
        """
        patterns = [
            r'\d+\s*(?:clients?|customers?|accounts?|users?)',
            r'\d+\s*(?:days?|weeks?|months?|years?)',
            r'\d+%\s*(?:close rate|conversion|churn|margin|commission|satisfaction)',
            r'\d[\d,]*\s*(?:subscribers?|members?|followers?)',
            r'MRR\s*(?:of\s*)?\$[\d,]+',
        ]
        numbers: List[str] = []
        seen = set()
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                phrase = match.group(0)  # full match, context included
                if phrase not in seen:
                    seen.add(phrase)
                    numbers.append(phrase)
        return numbers[:10]

    def _extract_key_sentences(self, text: str, n: int = 10) -> List[str]:
        """
        Return the n highest-scoring sentences from *text*.

        Scoring heuristics (a sentence needs total score >= 2 to qualify):
          +1  length in the 40-180 char "punchy" range
          +2  contains a dollar amount
          +1  for each signal phrase it contains
          +1  contains a percentage
        Ties keep transcript order (the sort is stable).
        """
        signal_words = [
            "you should", "the key", "most important", "biggest", "mistake",
            "secret", "formula", "strategy", "never", "always", "every",
            "the reason", "that's why", "here's the thing", "the truth",
            "what i", "what we", "i learned", "discovered", "realized",
            "result", "outcome", "close rate", "conversion", "revenue",
            "per month", "per year", "annually", "clients", "mrr"
        ]

        candidates = []
        for sent in re.split(r'(?<=[.!?])\s+', text):
            if not sent.strip():
                continue
            lowered = sent.lower()

            score = 0
            if 40 <= len(sent) <= 180:           # length sweet spot
                score += 1
            if re.search(r'\$[\d,]+', sent):     # pricing signal
                score += 2
            score += sum(1 for phrase in signal_words if phrase in lowered)
            if re.search(r'\d+%', sent):         # percentage signal
                score += 1

            if score >= 2:
                candidates.append((score, sent.strip()))

        # Highest score first; stable sort keeps transcript order for ties.
        candidates.sort(key=lambda pair: pair[0], reverse=True)
        return [sentence for _, sentence in candidates[:n]]

    # ── Main entity extraction ─────────────────────────────────────────────────

    def extract_entities(
        self,
        video_data: Dict[str, Any],
        channel_context: str = "youtube_intel",
        source_label: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract KG entities from a processed video result.

        Args:
            video_data: Result from YouTubeIntelligence.process_video()
            channel_context: Short label for the channel (e.g., "nick_pontes_ghl")
            source_label: Human-readable source name override

        Returns:
            List of KG entity dicts ready to write to JSONL. The first entry is
            always the video_source metadata record; tools/pricing/insight/
            metrics entities follow only when the transcript yields them.
            Returns [] when the video has no transcript text.
        """
        video_id = video_data.get("video_id", "unknown")
        title = video_data.get("title", "Unknown")
        channel = video_data.get("channel", "Unknown")
        upload_date = video_data.get("upload_date_iso", datetime.now().strftime("%Y-%m-%d"))
        url = video_data.get("url", f"https://youtube.com/watch?v={video_id}")
        full_text = video_data.get("full_text", "")
        source = source_label or f"{channel_context}/{video_id}"

        if not full_text:
            self._log(f"No transcript for {video_id} — skipping entity extraction")
            return []

        self._log(f"Extracting entities from: {title[:60]}")

        entities = []
        entity_index = 1  # shared counter so every entity ID in this video is unique

        # ── 1. Video metadata entity ───────────────────────────────────────────
        # Always emitted; anchors every other entity to its source video.
        meta_entity = {
            "id": self._make_entity_id(video_id, "video", entity_index),
            "type": "video_source",
            "name": f"YT_{video_id}_{channel_context}",
            "source": source,
            "date": upload_date,
            "channel": channel,
            "channel_context": channel_context,
            "video_id": video_id,
            "video_title": title,
            "video_url": url,
            "word_count": video_data.get("word_count", 0),
            "duration": video_data.get("duration_formatted", ""),
            "view_count": video_data.get("view_count", 0),
            "tags": video_data.get("tags", [])[:10],
            # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated
            # (Python 3.12+) and naive; keep the trailing "Z" marker.
            "ingested_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "confidence": 1.0
        }
        entities.append(meta_entity)
        entity_index += 1

        # ── 2. Tool entities ───────────────────────────────────────────────────
        tools = self._extract_tools(full_text)
        if tools:
            tools_entity = {
                "id": self._make_entity_id(video_id, "tool", entity_index),
                "type": "tools_mentioned",
                "name": f"Tools_{video_id}",
                "source": source,
                "date": upload_date,
                "video_id": video_id,
                "video_title": title,
                "video_url": url,
                "tools": tools,
                "summary": f"Tools/platforms mentioned in '{title}': {', '.join(tools)}",
                "confidence": 0.85,
                "tags": [channel_context, "tools", "platforms"]
            }
            entities.append(tools_entity)
            entity_index += 1
            self._log(f"  Found {len(tools)} tools: {', '.join(tools[:5])}")

        # ── 3. Price/revenue entities ──────────────────────────────────────────
        prices = self._extract_prices(full_text)
        if prices:
            # Recover the sentence around each price mention so the raw figure
            # carries meaning ("$97" → "...the Starter plan is $97/mo...").
            price_contexts = []
            for price_data in prices[:10]:
                raw_price = price_data["raw"]
                # First short-enough sentence containing this price wins.
                for sentence in re.split(r'(?<=[.!?])\s+', full_text):
                    if raw_price.lower() in sentence.lower() and len(sentence) < 300:
                        price_contexts.append(sentence.strip())
                        break

            if price_contexts:
                price_entity = {
                    "id": self._make_entity_id(video_id, "price", entity_index),
                    "type": "pricing_data",
                    "name": f"Pricing_{video_id}",
                    "source": source,
                    "date": upload_date,
                    "video_id": video_id,
                    "video_title": title,
                    "video_url": url,
                    "price_mentions": prices[:10],
                    "price_contexts": price_contexts[:5],
                    "summary": f"Pricing data from '{title}': {'; '.join(price_contexts[:3])}",
                    "confidence": 0.80,
                    "tags": [channel_context, "pricing", "revenue"]
                }
                entities.append(price_entity)
                entity_index += 1
                self._log(f"  Found {len(prices)} price mentions")

        # ── 4. Key insight entities ────────────────────────────────────────────
        # One entity per top-scored sentence (at most 5 of the 8 extracted).
        key_sentences = self._extract_key_sentences(full_text, n=8)
        if key_sentences:
            for i, sentence in enumerate(key_sentences[:5]):
                insight_entity = {
                    "id": self._make_entity_id(video_id, "insight", entity_index),
                    "type": "insight",
                    "name": f"Insight_{video_id}_{i+1:02d}",
                    "source": source,
                    "date": upload_date,
                    "video_id": video_id,
                    "video_title": title,
                    "video_url": url,
                    "summary": sentence,
                    "key_facts": [sentence],
                    "confidence": 0.75,
                    "tags": [channel_context, "insight", "tactic"]
                }
                entities.append(insight_entity)
                entity_index += 1

            self._log(f"  Extracted {min(5, len(key_sentences))} key insights")

        # ── 5. Numbers/metrics entity ──────────────────────────────────────────
        numbers = self._extract_numbers(full_text)
        if numbers:
            numbers_entity = {
                "id": self._make_entity_id(video_id, "metric", entity_index),
                "type": "metrics",
                "name": f"Metrics_{video_id}",
                "source": source,
                "date": upload_date,
                "video_id": video_id,
                "video_title": title,
                "video_url": url,
                "key_numbers": numbers,
                "summary": f"Key numbers from '{title}': {', '.join(numbers[:5])}",
                "confidence": 0.82,
                "tags": [channel_context, "metrics", "social_proof"]
            }
            entities.append(numbers_entity)
            entity_index += 1
            self._log(f"  Found {len(numbers)} key metrics")

        self._log(f"  Total entities extracted: {len(entities)}")
        return entities

    # ── Batch extraction ───────────────────────────────────────────────────────

    def extract_from_batch(
        self,
        video_data_list: List[Dict[str, Any]],
        channel_context: str = "youtube_intel"
    ) -> List[Dict[str, Any]]:
        """Run extract_entities() over a channel batch, skipping failed videos."""
        collected: List[Dict[str, Any]] = []
        for item in video_data_list:
            if item.get("error"):
                continue  # video failed upstream; nothing to extract
            collected.extend(self.extract_entities(item, channel_context=channel_context))
        return collected

    # ── Save to JSONL ──────────────────────────────────────────────────────────

    def save_entities(
        self,
        entities: List[Dict[str, Any]],
        filename: Optional[str] = None
    ) -> Path:
        """
        Append entities to a dated JSONL file under KNOWLEDGE_GRAPH/entities/.

        Returns the file path written to, or the entities directory itself
        when there was nothing to write.
        """
        if not entities:
            self._log("No entities to save")
            return self.kg_dir

        default_name = f"youtube_intel_{datetime.now().strftime('%Y_%m_%d')}.jsonl"
        out_path = self.kg_dir / (filename or default_name)

        # One JSON object per line; append mode so repeated runs accumulate.
        rows = [json.dumps(entity, ensure_ascii=False) + "\n" for entity in entities]
        with out_path.open("a", encoding="utf-8") as fh:
            fh.writelines(rows)

        self._log(f"Saved {len(entities)} entities to: {out_path}")
        return out_path

    def save_to_named_file(
        self,
        entities: List[Dict[str, Any]],
        name: str
    ) -> Path:
        """Write *entities* to an explicitly named JSONL file, overwriting any existing content."""
        target = self.kg_dir / name
        rows = [json.dumps(entity, ensure_ascii=False) + "\n" for entity in entities]
        with target.open("w", encoding="utf-8") as fh:
            fh.writelines(rows)
        self._log(f"Saved {len(entities)} entities to: {target}")
        return target

    # ── KB Summary document ────────────────────────────────────────────────────

    def save_kb_summary(
        self,
        entities: List[Dict[str, Any]],
        video_data: Dict[str, Any],
        channel_context: str = "youtube_intel"
    ) -> Path:
        """
        Generate a human-readable KB summary document from extracted entities.
        Saved to .gemini/knowledge/{channel_context}_{video_id}_KB.md

        Args:
            entities: Entity dicts produced by extract_entities() for this
                video. Every entity must carry an "id" key (extract_entities()
                always sets one); other keys are read defensively via .get().
            video_data: The processed-video dict the entities came from; used
                only for the header metadata (title, channel, URL, etc.).
            channel_context: Label embedded in the output filename; sanitized
                to alphanumerics/underscores before use.

        Returns the Path of the markdown file written.
        """
        # Header metadata — every field falls back to a safe default so a
        # sparse video_data dict still renders.
        video_id = video_data.get("video_id", "unknown")
        title = video_data.get("title", "Unknown")
        channel = video_data.get("channel", "Unknown")
        url = video_data.get("url", "")
        date = video_data.get("upload_date_iso", "")
        duration = video_data.get("duration_formatted", "")
        word_count = video_data.get("word_count", 0)

        # Pull out entity types: at most one tools/pricing/metrics entity per
        # video (first match wins), but any number of insight entities.
        tools_entity = next((e for e in entities if e.get("type") == "tools_mentioned"), None)
        price_entity = next((e for e in entities if e.get("type") == "pricing_data"), None)
        insights = [e for e in entities if e.get("type") == "insight"]
        metrics_entity = next((e for e in entities if e.get("type") == "metrics"), None)

        # Sanitize only the context for the filename; video IDs are assumed
        # filename-safe (YouTube-style IDs) — TODO confirm against callers.
        safe_context = re.sub(r'[^a-zA-Z0-9_]', '_', channel_context)
        out_path = self.kb_dir / f"{safe_context}_{video_id}_KB.md"

        # Markdown is assembled as a list of lines and joined once at the end.
        lines = [
            f"# Knowledge Base: {title}",
            "",
            f"**Channel**: {channel}",
            f"**Date**: {date}",
            f"**Duration**: {duration}",
            f"**Words**: {word_count:,}",
            f"**URL**: {url}",
            f"**Context**: {channel_context}",
            f"**Ingested**: {datetime.now().strftime('%Y-%m-%d')}",
            "",
            "---",
            "",
        ]

        # Optional sections: each renders only when its entity exists.
        if tools_entity:
            lines += [
                "## Tools & Platforms Mentioned",
                "",
                ", ".join(tools_entity.get("tools", [])),
                "",
            ]

        if price_entity:
            lines += [
                "## Pricing Data",
                "",
            ]
            for ctx in price_entity.get("price_contexts", []):
                lines.append(f"- {ctx}")
            lines.append("")

        if insights:
            lines += [
                "## Key Insights",
                "",
            ]
            for ins in insights:
                summary = ins.get("summary", "")
                if summary:
                    lines.append(f"- {summary}")
            lines.append("")

        if metrics_entity:
            lines += [
                "## Key Numbers & Metrics",
                "",
            ]
            for num in metrics_entity.get("key_numbers", []):
                lines.append(f"- {num}")
            lines.append("")

        # Trailing index of all entity IDs so the MD cross-references the JSONL.
        lines += [
            "## Entity IDs",
            "",
            f"Total entities extracted: {len(entities)}",
            "",
        ]
        for e in entities:
            lines.append(f"- `{e['id']}` — {e.get('type', '?')}: {e.get('name', '?')}")

        md_content = "\n".join(lines)

        with open(out_path, "w", encoding="utf-8") as f:
            f.write(md_content)

        self._log(f"Saved KB summary: {out_path}")
        return out_path

    # ── Full pipeline ──────────────────────────────────────────────────────────

    def run_pipeline(
        self,
        video_data: Dict[str, Any],
        channel_context: str = "youtube_intel",
        output_filename: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Complete KG pipeline for one processed video: extract entities, append
        them to the JSONL store, then write the markdown KB summary.

        Returns a report dict with keys: entities, entity_count, jsonl_path,
        kb_path, video_id, title.
        """
        extracted = self.extract_entities(video_data, channel_context=channel_context)

        entity_file = self.save_entities(extracted, filename=output_filename)
        summary_file = self.save_kb_summary(extracted, video_data, channel_context=channel_context)

        return {
            "entities": extracted,
            "entity_count": len(extracted),
            "jsonl_path": str(entity_file),
            "kb_path": str(summary_file),
            "video_id": video_data.get("video_id"),
            "title": video_data.get("title")
        }

    def run_channel_pipeline(
        self,
        video_data_list: List[Dict[str, Any]],
        channel_context: str,
        output_filename: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Run the KG pipeline over an entire channel batch, writing every
        entity into a single JSONL file (dated + channel-labelled by default).

        Returns a report dict: entities, entity_count, video_count,
        jsonl_path, channel_context.
        """
        batch_entities = self.extract_from_batch(video_data_list, channel_context=channel_context)

        stamp = datetime.now().strftime("%Y_%m_%d")
        default_name = f"youtube_intel_{channel_context}_{stamp}.jsonl"
        written = self.save_entities(batch_entities, filename=output_filename or default_name)

        return {
            "entities": batch_entities,
            "entity_count": len(batch_entities),
            "video_count": len(video_data_list),
            "jsonl_path": str(written),
            "channel_context": channel_context
        }
