#!/usr/bin/env python3
"""
YouTube to Genesis Memory Pipeline
====================================
Reads today's transcripts from PostgreSQL, extracts insights via Gemini Flash,
and commits knowledge to all Genesis memory systems.

Memory targets:
    1. PostgreSQL (structured records -- extracted_insights column)
    2. Qdrant (semantic embeddings -- already done by extractor, verified here)
    3. Supermemory (cross-session memory for key insights)
    4. Daily digest generation for Telegram notification

Usage:
    # Process today's transcripts
    python youtube_to_genesis_memory.py

    # Process a specific date
    python youtube_to_genesis_memory.py --date 2026-02-15

    # Dry run (extract insights but don't commit)
    python youtube_to_genesis_memory.py --dry-run

    # Skip Gemini extraction (just commit raw transcripts)
    python youtube_to_genesis_memory.py --skip-extraction

Author: Genesis System
Version: 1.0.0
"""

import argparse
import json
import logging
import os
import subprocess
import sys
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

# Make local Genesis memory modules importable (e.g. elestio_config used by
# get_db_connection below). Path is environment-specific (WSL mount).
sys.path.insert(0, "/mnt/e/genesis-system/data/genesis-memory")

# Root logging config for the whole pipeline: timestamped, level-tagged lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger("yt_memory_pipeline")

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# SECURITY FIX: API keys were previously hardcoded in this file as fallback
# defaults. Committed secrets must be treated as compromised -- rotate them,
# then supply replacements exclusively via the environment. With no default,
# a missing key now fails loudly at the first API call instead of silently
# using a leaked credential.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = "gemini-2.0-flash"

SUPERMEMORY_API_KEY = os.environ.get("SUPERMEMORY_API_KEY", "")
# Supermemory container tag under which all pipeline memories are filed.
SUPERMEMORY_CONTAINER = "genesis-kinan"

# Where daily markdown digests are written (created on demand in run_pipeline).
DIGEST_OUTPUT_DIR = Path("/mnt/e/genesis-system/data/youtube_digests")


# ---------------------------------------------------------------------------
# Database layer
# ---------------------------------------------------------------------------

def get_db_connection():
    """Open and return a new PostgreSQL connection from the Elestio config.

    Imports are deferred to call time so the module can be loaded (e.g. for
    --help) without psycopg2 or the Genesis config package installed.
    """
    import psycopg2
    from elestio_config import PostgresConfig

    params = PostgresConfig.get_connection_params()
    return psycopg2.connect(**params)


def get_transcripts_for_date(conn, date_str: Optional[str] = None) -> List[Dict[str, Any]]:
    """Load watch-history rows (with any transcript data) for one UTC day.

    date_str is "YYYY-MM-DD"; when omitted, today's UTC date is used. Rows
    come from yt_watch_history LEFT JOINed to yt_transcripts, so transcript
    fields may be None for videos without an extracted transcript.
    """
    if date_str:
        anchor = datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
    else:
        anchor = datetime.now(timezone.utc)

    # Half-open UTC day window: [midnight, next midnight).
    window_start = anchor.replace(hour=0, minute=0, second=0, microsecond=0)
    window_end = window_start + timedelta(days=1)

    query = """
            SELECT
                wh.video_id,
                wh.title,
                wh.channel_name,
                wh.channel_id,
                wh.watched_at,
                wh.duration_seconds,
                t.transcript,
                t.language,
                t.word_count,
                t.extraction_method,
                t.extracted_insights
            FROM yt_watch_history wh
            LEFT JOIN yt_transcripts t ON wh.video_id = t.video_id
            WHERE wh.watched_at >= %s AND wh.watched_at < %s
            ORDER BY wh.watched_at
        """

    # Column names in SELECT order, zipped against each row tuple below.
    columns = (
        "video_id", "title", "channel_name", "channel_id", "watched_at",
        "duration_seconds", "transcript", "language", "word_count",
        "extraction_method", "extracted_insights",
    )

    records: List[Dict[str, Any]] = []
    with conn.cursor() as cur:
        cur.execute(query, (window_start, window_end))
        for row in cur.fetchall():
            rec = dict(zip(columns, row))
            # Normalize: timestamps to ISO strings, missing insights to {}.
            rec["watched_at"] = rec["watched_at"].isoformat() if rec["watched_at"] else None
            rec["extracted_insights"] = rec["extracted_insights"] if rec["extracted_insights"] else {}
            records.append(rec)

    logger.info(f"Loaded {len(records)} records for {window_start.date()}")
    return records


def update_insights(conn, video_id: str, insights: Dict[str, Any]):
    """Persist the extracted-insights JSON for one transcript row.

    Serializes *insights* and writes yt_transcripts.extracted_insights,
    committing on success. On any database error the transaction is rolled
    back and the error logged, leaving the connection usable for the next
    video instead of aborting the whole pipeline.
    """
    payload = json.dumps(insights)
    try:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE yt_transcripts
                SET extracted_insights = %s
                WHERE video_id = %s
            """, (payload, video_id))
            conn.commit()
    except Exception as exc:
        logger.error(f"Failed to update insights for {video_id}: {exc}")
        conn.rollback()


def update_topics(conn, video_id: str, topics: List[str]):
    """Persist the topic list for one transcript row.

    The Python list is passed straight through as the parameter for
    extracted_topics (psycopg2 adapts lists to PostgreSQL arrays).
    Commits on success; logs and rolls back on failure so the connection
    stays healthy.
    """
    params = (topics, video_id)
    try:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE yt_transcripts
                SET extracted_topics = %s
                WHERE video_id = %s
            """, params)
            conn.commit()
    except Exception as exc:
        logger.error(f"Failed to update topics for {video_id}: {exc}")
        conn.rollback()


# ---------------------------------------------------------------------------
# Gemini Flash insight extraction
# ---------------------------------------------------------------------------

def extract_insights_with_gemini(
    video_id: str,
    title: str,
    channel: str,
    transcript: str,
    max_transcript_chars: int = 30000,
) -> Optional[Dict[str, Any]]:
    """
    Use Gemini Flash to extract structured insights from a transcript.

    Returns dict with:
        - summary: 2-3 sentence summary
        - topics: list of topics/categories
        - key_insights: list of actionable insights
        - decisions: any decisions or recommendations mentioned
        - entities: people, companies, tools mentioned
        - relevance_score: 1-10 relevance to Genesis/business
        - action_items: specific things to follow up on

    Returns None when the SDK is missing, the API call fails, or the model
    reply is not valid JSON. The field list above is what the prompt asks
    for; the model's JSON is not schema-validated here, so callers should
    use .get() with defaults (as run_pipeline and the digest builders do).
    """
    # Import lazily so the rest of the pipeline works without the SDK.
    try:
        import google.generativeai as genai
    except ImportError:
        logger.error("google-generativeai required: pip install google-generativeai")
        return None

    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel(GEMINI_MODEL)

    # Truncate very long transcripts to bound token usage; the marker tells
    # the model (and anyone reading the prompt) that content was cut.
    truncated = transcript[:max_transcript_chars]
    if len(transcript) > max_transcript_chars:
        truncated += f"\n\n[TRUNCATED - full transcript is {len(transcript)} chars]"

    prompt = f"""You are an insight extraction engine for a knowledge management system.

Analyze this YouTube video transcript and extract structured insights.

**Video**: "{title}" by {channel}
**Video ID**: {video_id}

**Transcript**:
{truncated}

---

Return a JSON object with EXACTLY these fields:

{{
    "summary": "2-3 sentence summary of the video content and its key message",
    "topics": ["list", "of", "main", "topics", "discussed"],
    "key_insights": [
        "Insight 1: specific actionable takeaway",
        "Insight 2: another key learning",
        "..."
    ],
    "decisions": ["Any decisions, recommendations, or strategic advice mentioned"],
    "entities": {{
        "people": ["Names of people mentioned"],
        "companies": ["Companies or brands mentioned"],
        "tools": ["Software, tools, platforms mentioned"],
        "concepts": ["Key concepts, frameworks, methodologies"]
    }},
    "relevance_to_business": "How this relates to AI agents, SaaS, Australian market, or business growth (1-2 sentences)",
    "relevance_score": 7,
    "action_items": ["Specific follow-up actions based on the content"]
}}

IMPORTANT:
- Return ONLY valid JSON, no markdown or explanation
- Be specific and actionable in insights
- Rate relevance_score 1-10 based on relevance to: AI agents, SaaS businesses, Australian market, voice AI, automation
- Extract ALL named entities (people, companies, tools)
- Keep summary concise but informative
"""

    try:
        # Low temperature for consistency; response_mime_type asks the API
        # to constrain output to JSON directly.
        response = model.generate_content(
            prompt,
            generation_config={
                "temperature": 0.2,
                "max_output_tokens": 2000,
                "response_mime_type": "application/json",
            },
        )

        result_text = response.text.strip()

        # Defensive: strip markdown code fences in case the model wraps the
        # JSON despite the mime-type hint.
        if result_text.startswith("```"):
            result_text = result_text.split("\n", 1)[1]
            if result_text.endswith("```"):
                result_text = result_text[:-3]
            result_text = result_text.strip()

        insights = json.loads(result_text)
        # Provenance metadata so stored insights record how/when they were made.
        insights["extraction_model"] = GEMINI_MODEL
        insights["extracted_at"] = datetime.now(timezone.utc).isoformat()

        logger.info(
            f"  Extracted insights for {video_id}: "
            f"{len(insights.get('key_insights', []))} insights, "
            f"relevance={insights.get('relevance_score', '?')}/10"
        )
        return insights

    except json.JSONDecodeError as e:
        # `response` is always bound here: json.loads only runs after
        # generate_content succeeded.
        logger.error(f"Gemini returned invalid JSON for {video_id}: {e}")
        logger.debug(f"Raw response: {response.text[:500]}")
        return None
    except Exception as e:
        logger.error(f"Gemini extraction failed for {video_id}: {e}")
        return None


# ---------------------------------------------------------------------------
# Supermemory commitment
# ---------------------------------------------------------------------------

def commit_to_supermemory(content: str, tag: str = "youtube_insight") -> bool:
    """Save one memory entry to the Supermemory API.

    The content is prefixed with "[<tag>]" and filed under the Genesis
    container. Returns True on HTTP 200/201, False on any error (network
    failures are logged, never raised -- memory commits are best-effort).
    """
    import requests  # deferred: module stays importable without requests

    try:
        body = {
            "content": f"[{tag}] {content}",
            "containerTags": [SUPERMEMORY_CONTAINER],
        }
        headers = {
            "Authorization": f"Bearer {SUPERMEMORY_API_KEY}",
            "Content-Type": "application/json",
        }

        resp = requests.post(
            "https://api.supermemory.ai/v3/memories",
            headers=headers,
            json=body,
            timeout=30,
        )

        if resp.status_code not in (200, 201):
            logger.warning(f"Supermemory error {resp.status_code}: {resp.text[:200]}")
            return False

        logger.debug(f"Supermemory save OK: {content[:80]}...")
        return True

    except Exception as e:
        logger.error(f"Supermemory commit failed: {e}")
        return False


# ---------------------------------------------------------------------------
# Daily digest generation
# ---------------------------------------------------------------------------

def generate_daily_digest(
    date_str: str,
    records: List[Dict[str, Any]],
    all_insights: Dict[str, Dict[str, Any]],
) -> str:
    """Render the full markdown digest for one day of watch history.

    Sections: headline stats, topics, high-relevance videos (score >= 7),
    top key insights, action items, and a flagged list of every video
    ([T]ranscript / [I]nsights markers). Returns the digest as one string.
    """
    # First-wins lookup by video id (mirrors the original first-match scan).
    by_id: Dict[str, Dict[str, Any]] = {}
    for r in records:
        by_id.setdefault(r["video_id"], r)

    topic_set = set()
    insight_bullets = []
    action_bullets = []
    top_videos = []

    for vid, data in all_insights.items():
        topic_set.update(data.get("topics", []))
        rec = by_id.get(vid, {})

        # At most three key insights per video, labeled with its title.
        for item in data.get("key_insights", [])[:3]:
            insight_bullets.append(f"- [{rec.get('title', vid)}] {item}")

        for item in data.get("action_items", []):
            action_bullets.append(f"- {item}")

        score = data.get("relevance_score", 0)
        if score >= 7:
            top_videos.append({
                "title": rec.get("title", vid),
                "channel": rec.get("channel_name", ""),
                "score": score,
                "summary": data.get("summary", ""),
            })

    # Aggregate watch time (duration may be None -> treated as 0).
    seconds = sum(r.get("duration_seconds") or 0 for r in records)
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60

    out = [
        f"# YouTube Watch Digest - {date_str}",
        "",
        f"**Videos watched**: {len(records)}",
        f"**Transcripts extracted**: {sum(1 for r in records if r.get('transcript'))}",
        f"**Insights generated**: {len(all_insights)}",
        f"**Total watch time**: {hours}h {minutes}m",
        "",
    ]

    if topic_set:
        out += ["## Topics", ", ".join(sorted(topic_set)[:20]), ""]

    if top_videos:
        out.append("## High-Relevance Videos (7+/10)")
        for hv in sorted(top_videos, key=lambda v: -v["score"]):
            out.append(f"- **{hv['title']}** ({hv['channel']}) - Score: {hv['score']}/10")
            out.append(f"  {hv['summary']}")
        out.append("")

    if insight_bullets:
        out.append("## Key Insights")
        out.extend(insight_bullets[:15])
        out.append("")

    if action_bullets:
        out.append("## Action Items")
        out.extend(action_bullets[:10])
        out.append("")

    # Full video list with transcript/insight flags.
    out.append("## All Videos")
    for rec in records:
        vid = rec["video_id"]
        dur = rec.get("duration_seconds")
        dur_str = f" ({dur // 60}m)" if dur else ""
        flags = ("T" if rec.get("transcript") else "-") + ("I" if vid in all_insights else "-")
        out.append(f"- [{flags}] **{rec.get('title', 'Unknown')}** - {rec.get('channel_name', '')}{dur_str}")

    return "\n".join(out)


def generate_telegram_digest(
    date_str: str,
    records: List[Dict[str, Any]],
    all_insights: Dict[str, Dict[str, Any]],
) -> str:
    """Build the short plain-text summary intended for a Telegram message.

    Includes headline counts, up to 8 topics, and the top 5 videos scoring
    7+ relevance (titles clipped to 60 chars).
    """
    topic_set = set()
    top = []
    for vid, data in all_insights.items():
        topic_set.update(data.get("topics", []))
        score = data.get("relevance_score", 0)
        if score >= 7:
            match = next((r for r in records if r["video_id"] == vid), {})
            top.append({"title": match.get("title", vid)[:60], "score": score})

    seconds = sum(r.get("duration_seconds") or 0 for r in records)
    h = seconds // 3600
    m = (seconds % 3600) // 60

    out = [
        f"YouTube Digest - {date_str}",
        f"{len(records)} videos | {h}h{m}m | {len(all_insights)} insights",
        "",
    ]

    if topic_set:
        out += [f"Topics: {', '.join(sorted(topic_set)[:8])}", ""]

    if top:
        out.append("High relevance:")
        out += [
            f"  {entry['score']}/10 - {entry['title']}"
            for entry in sorted(top, key=lambda e: -e["score"])[:5]
        ]
        out.append("")

    out.append("Full digest saved to Genesis memory.")
    return "\n".join(out)


# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------

def run_pipeline(
    date_str: Optional[str] = None,
    dry_run: bool = False,
    skip_extraction: bool = False,
    skip_supermemory: bool = False,
) -> Dict[str, Any]:
    """
    Run the full memory commitment pipeline.

    Steps:
    1. Load today's transcripts from PostgreSQL
    2. Extract insights via Gemini Flash
    3. Update insights in PostgreSQL
    4. Commit key insights to Supermemory
    5. Generate daily digest

    Args:
        date_str: "YYYY-MM-DD" target day; defaults to today (UTC).
        dry_run: extract insights but write nothing (no DB updates, no
            Supermemory commits, no digest file).
        skip_extraction: skip Gemini calls and reuse any insights already
            stored on the records.
        skip_supermemory: suppress Supermemory commits only.

    Returns:
        Summary dict (counts, telegram digest text, digest file path), or a
        minimal {"status": "no_data"} dict when the day has no watch history.
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    conn = get_db_connection()
    try:
        # Step 1: Load transcripts
        records = get_transcripts_for_date(conn, date_str)
        if not records:
            logger.info(f"No watch history found for {date_str}")
            return {"date": date_str, "videos": 0, "status": "no_data"}

        # Step 2: Extract insights
        # Maps video_id -> insights dict (freshly extracted or pre-existing).
        all_insights = {}

        if not skip_extraction:
            for i, rec in enumerate(records, 1):
                vid = rec["video_id"]

                # Skip if no transcript
                if not rec.get("transcript"):
                    logger.info(f"[{i}/{len(records)}] {vid} - No transcript, skipping insights")
                    continue

                # Skip if insights already exist (idempotent re-runs: only
                # records with a populated key_insights list are reused).
                if rec.get("extracted_insights") and isinstance(rec["extracted_insights"], dict):
                    if rec["extracted_insights"].get("key_insights"):
                        logger.info(f"[{i}/{len(records)}] {vid} - Insights already exist, skipping")
                        all_insights[vid] = rec["extracted_insights"]
                        continue

                logger.info(f"[{i}/{len(records)}] Extracting insights for: {rec.get('title', vid)}")

                insights = extract_insights_with_gemini(
                    video_id=vid,
                    title=rec.get("title", ""),
                    channel=rec.get("channel_name", ""),
                    transcript=rec["transcript"],
                )

                if insights:
                    all_insights[vid] = insights

                    if not dry_run:
                        # Step 3: Update PostgreSQL
                        update_insights(conn, vid, insights)
                        topics = insights.get("topics", [])
                        if topics:
                            update_topics(conn, vid, topics)

                    # Step 4: Commit to Supermemory
                    if not dry_run and not skip_supermemory:
                        # Save high-relevance insights (threshold 6, slightly
                        # looser than the digest's 7+ "high relevance" cut).
                        score = insights.get("relevance_score", 0)
                        if score >= 6:
                            summary = insights.get("summary", "")
                            key_insights_text = "; ".join(insights.get("key_insights", [])[:3])
                            memory_content = (
                                f"YouTube: \"{rec.get('title', vid)}\" by {rec.get('channel_name', '')}. "
                                f"Summary: {summary} "
                                f"Key insights: {key_insights_text}"
                            )
                            commit_to_supermemory(memory_content, tag="youtube_daily")

                # Rate limit: Gemini Flash -- pause between videos, but not
                # after the last one.
                if i < len(records):
                    time.sleep(1)
        else:
            # Load existing insights (no key_insights check here, unlike the
            # extraction path above -- any stored dict is reused as-is).
            for rec in records:
                if rec.get("extracted_insights") and isinstance(rec["extracted_insights"], dict):
                    all_insights[rec["video_id"]] = rec["extracted_insights"]

        # Step 5: Generate digest
        full_digest = generate_daily_digest(date_str, records, all_insights)
        telegram_digest = generate_telegram_digest(date_str, records, all_insights)

        if not dry_run:
            # Save digest to file
            DIGEST_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
            digest_path = DIGEST_OUTPUT_DIR / f"digest_{date_str}.md"
            with open(digest_path, "w", encoding="utf-8") as f:
                f.write(full_digest)
            logger.info(f"Digest saved to {digest_path}")

            # Commit digest summary to Supermemory
            if not skip_supermemory:
                commit_to_supermemory(
                    f"Daily YouTube Digest ({date_str}): Watched {len(records)} videos. "
                    f"Key topics: {', '.join(sorted({t for i in all_insights.values() for t in i.get('topics', [])})[:10])}. "
                    f"Generated {len(all_insights)} insight reports.",
                    tag="youtube_digest"
                )

        # NOTE(review): digest_path is reported even on --dry-run, when no
        # file was actually written; supermemory_commits is a recount of
        # score>=6 insights, not a tally of successful API calls.
        result = {
            "date": date_str,
            "videos_watched": len(records),
            "transcripts_available": sum(1 for r in records if r.get("transcript")),
            "insights_extracted": len(all_insights),
            "supermemory_commits": sum(
                1 for i in all_insights.values()
                if i.get("relevance_score", 0) >= 6
            ),
            "telegram_digest": telegram_digest,
            "digest_path": str(DIGEST_OUTPUT_DIR / f"digest_{date_str}.md"),
        }

        logger.info(f"Pipeline complete: {json.dumps(result, indent=2, default=str)}")
        return result

    finally:
        # Always release the DB connection, even on mid-pipeline failure.
        conn.close()


def main():
    """CLI entry point: parse flags, run the pipeline, print the summary."""
    cli = argparse.ArgumentParser(
        description="YouTube to Genesis Memory commitment pipeline"
    )
    cli.add_argument(
        "--date",
        help="Process a specific date (YYYY-MM-DD). Default: today"
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Extract insights but don't commit to memory systems"
    )
    cli.add_argument(
        "--skip-extraction",
        action="store_true",
        help="Skip Gemini insight extraction (use existing insights only)"
    )
    cli.add_argument(
        "--skip-supermemory",
        action="store_true",
        help="Skip Supermemory commits"
    )
    opts = cli.parse_args()

    summary = run_pipeline(
        date_str=opts.date,
        dry_run=opts.dry_run,
        skip_extraction=opts.skip_extraction,
        skip_supermemory=opts.skip_supermemory,
    )

    # Machine-readable summary on stdout (default=str covers Paths/datetimes).
    print(json.dumps(summary, indent=2, default=str))


# Script entry point (no side effects on import).
if __name__ == "__main__":
    main()
