#!/usr/bin/env python3
"""
YouTube Transcript Ingester
===========================
Takes a list of YouTube video URLs/IDs, fetches transcripts via youtube-transcript-api,
chunks them into ~500-word segments (a rough token proxy), embeds via Gemini, stores in Qdrant
'research_reports' collection (768-dim), and writes KG entities.

This is the INGESTION side of the YouTube Intel Pipeline. It reads from
the youtube_intel PostgreSQL table (populated by youtube_watchlist_scraper.py)
or accepts explicit video IDs/URLs from the CLI.

Storage targets:
    - Qdrant:      'research_reports' collection (768-dim cosine, nomic-embed / Gemini)
    - PostgreSQL:  youtube_intel.transcript_ingested = TRUE (marks completion)
    - KG entities: E:/genesis-system/KNOWLEDGE_GRAPH/entities/youtube_intel_YYYY-MM-DD.jsonl

Usage:
    # Ingest all pending (not yet ingested) videos from youtube_intel table
    python youtube_transcript_ingester.py --pending

    # Ingest Watch Later items first (flagged by Kinan = important)
    python youtube_transcript_ingester.py --pending --source watch_later

    # Ingest specific video URLs or bare 11-character video IDs
    python youtube_transcript_ingester.py --urls https://youtu.be/dQw4w9WgXcQ abc123xyz00

    # Ingest from a JSON file of video IDs (output from scraper)
    python youtube_transcript_ingester.py --from-json scraped_videos.json

    # Dry run (fetch transcripts, show stats, don't store)
    python youtube_transcript_ingester.py --pending --dry-run

    # Skip Qdrant, only write KG entities (useful for testing)
    python youtube_transcript_ingester.py --pending --no-qdrant

Author: Genesis System
Version: 1.0.0
Date: 2026-02-23
"""

import argparse
import hashlib
import json
import logging
import os
import re
import struct
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# ---------------------------------------------------------------------------
# Genesis path setup
# ---------------------------------------------------------------------------
GENESIS_ROOT = Path("E:/genesis-system")
KG_ENTITIES_DIR = GENESIS_ROOT / "KNOWLEDGE_GRAPH" / "entities"
sys.path.insert(0, str(GENESIS_ROOT / "data" / "genesis-memory"))

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("yt_transcript_ingester")

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
QDRANT_COLLECTION = "research_reports"   # Target collection per task spec
VECTOR_DIM = 768                         # Gemini text-embedding-004 / nomic-embed-text
CHUNK_MAX_WORDS = 500                    # words per chunk (rough proxy for ~500 tokens)
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")  # never hardcode API keys in source
GEMINI_EMBEDDING_MODEL = "models/text-embedding-004"  # 768-dim output


# ---------------------------------------------------------------------------
# URL / ID helpers
# ---------------------------------------------------------------------------

def extract_video_id(url_or_id: str) -> Optional[str]:
    """Extract 11-char YouTube video ID from URL or return as-is if already an ID."""
    if re.fullmatch(r"[a-zA-Z0-9_-]{11}", url_or_id):
        return url_or_id
    patterns = [
        r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})",
        r"youtube\.com/v/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/shorts/([a-zA-Z0-9_-]{11})",
    ]
    for pat in patterns:
        m = re.search(pat, url_or_id)
        if m:
            return m.group(1)
    return None
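

# Illustrative behavior of extract_video_id (the IDs below are placeholders):
#   extract_video_id("https://youtu.be/dQw4w9WgXcQ")                 -> "dQw4w9WgXcQ"
#   extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  -> "dQw4w9WgXcQ"
#   extract_video_id("https://example.com/clip")                     -> None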


def build_video_url(video_id: str) -> str:
    return f"https://www.youtube.com/watch?v={video_id}"


# ---------------------------------------------------------------------------
# Database layer
# ---------------------------------------------------------------------------

def get_db_connection():
    """Connect to Elestio PostgreSQL."""
    import psycopg2
    from elestio_config import PostgresConfig
    return psycopg2.connect(**PostgresConfig.get_connection_params())


def get_pending_records(
    conn, source: Optional[str] = None, limit: int = 500
) -> List[Dict[str, Any]]:
    """
    Fetch pending (not yet ingested) records from youtube_intel.
    Watch Later items (position priority) come first.
    """
    conditions = ["transcript_ingested = FALSE"]
    params: List[Any] = []

    if source:
        conditions.append("source = %s")
        params.append(source)

    where = " AND ".join(conditions)
    sql = f"""
        SELECT video_id, title, channel_name, source, position, video_url
        FROM youtube_intel
        WHERE {where}
        ORDER BY
            CASE source WHEN 'watch_later' THEN 0 ELSE 1 END,
            COALESCE(position, 9999) ASC
        LIMIT %s
    """
    params.append(limit)

    with conn.cursor() as cur:
        cur.execute(sql, params)
        cols = [d[0] for d in cur.description]
        return [dict(zip(cols, row)) for row in cur.fetchall()]
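

# Ordering sketch: given pending rows with (source, position) =
# ('watch_history', 3), ('watch_later', 12), ('watch_later', 1), the query
# returns the watch_later rows first (position 1, then 12), followed by the
# watch_history row.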


def mark_ingested(conn, video_id: str, source: Optional[str] = None) -> None:
    """Mark a video as transcript_ingested in youtube_intel."""
    if source:
        sql = """
            UPDATE youtube_intel
            SET transcript_ingested = TRUE
            WHERE video_id = %s AND source = %s
        """
        params = (video_id, source)
    else:
        sql = """
            UPDATE youtube_intel
            SET transcript_ingested = TRUE
            WHERE video_id = %s
        """
        params = (video_id,)

    with conn.cursor() as cur:
        cur.execute(sql, params)
    conn.commit()
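

# Note: a video can appear under more than one source; passing source=...
# marks only that (video_id, source) row, while source=None marks every
# youtube_intel row for the video.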


# ---------------------------------------------------------------------------
# Transcript fetching (youtube-transcript-api, free, no API key)
# ---------------------------------------------------------------------------

def fetch_transcript(video_id: str) -> Optional[Dict[str, Any]]:
    """
    Fetch transcript using youtube-transcript-api.

    Returns dict with keys: video_id, transcript (str), language, word_count,
    segments (list of {text, start, duration}).
    Returns None if no transcript is available.
    """
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
        from youtube_transcript_api.formatters import TextFormatter
    except ImportError:
        logger.error(
            "youtube-transcript-api is not installed.\n"
            "Fix: pip install youtube-transcript-api"
        )
        return None

    ytt = YouTubeTranscriptApi()
    transcript_data = None
    language = "en"

    # Try English variants first, then fall back to any available
    for lang_list in [["en", "en-US", "en-AU", "en-GB"], None]:
        try:
            if lang_list:
                transcript_data = ytt.fetch(video_id, languages=lang_list)
            else:
                # List the available transcripts and fetch the first that works
                available = ytt.list(video_id)
                for t in available:
                    try:
                        transcript_data = t.fetch()
                        language = t.language_code
                        break
                    except Exception:
                        continue
            if transcript_data:
                break
        except Exception:
            if lang_list is None:
                break
            continue

    if not transcript_data:
        return None

    # v1.x returns a FetchedTranscript that exposes the actual language code
    # (e.g. 'en-GB'); older dict-based results keep the best guess from above.
    language = getattr(transcript_data, "language_code", language)

    formatter = TextFormatter()
    full_text = formatter.format_transcript(transcript_data)

    segments = []
    for entry in transcript_data:
        # v1.x yields snippet objects with attributes; older versions yielded dicts
        if hasattr(entry, "text"):
            segments.append({"text": entry.text, "start": entry.start, "duration": entry.duration})
        else:
            segments.append({
                "text": str(entry.get("text", "")),
                "start": entry.get("start", 0),
                "duration": entry.get("duration", 0),
            })

    return {
        "video_id": video_id,
        "transcript": full_text,
        "language": language,
        "word_count": len(full_text.split()),
        "segments": segments,
    }
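

# Minimal usage sketch (assumes network access and a captioned video):
#   data = fetch_transcript("dQw4w9WgXcQ")
#   if data:
#       print(data["language"], data["word_count"], len(data["segments"]))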


# ---------------------------------------------------------------------------
# Text chunking
# ---------------------------------------------------------------------------

def chunk_text(text: str, max_words: int = CHUNK_MAX_WORDS) -> List[str]:
    """
    Split transcript text into chunks of approximately max_words words.
    Each chunk will be its own Qdrant point.
    """
    words = text.split()
    if not words:
        return []
    chunks = []
    for i in range(0, len(words), max_words):
        chunk = " ".join(words[i : i + max_words])
        if chunk.strip():
            chunks.append(chunk)
    return chunks
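

# Example: a 1,200-word transcript yields three chunks of 500, 500, and 200
# words; each chunk becomes its own Qdrant point in upsert_to_qdrant below.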


# ---------------------------------------------------------------------------
# Embedding generation
# ---------------------------------------------------------------------------

def embed_texts(texts: List[str]) -> List[List[float]]:
    """
    Generate 768-dim embeddings using Gemini text-embedding-004.
    Falls back to deterministic hash-based vectors if Gemini is unavailable.
    """
    try:
        import google.generativeai as genai
        genai.configure(api_key=GEMINI_API_KEY)
        vectors = []
        for text in texts:
            result = genai.embed_content(
                model=GEMINI_EMBEDDING_MODEL,
                content=text,
                task_type="retrieval_document",
            )
            vectors.append(result["embedding"])
        return vectors
    except Exception as e:
        logger.warning(f"Gemini embedding failed ({e}). Using hash-based fallback vectors.")
        return [_hash_vector(t, dim=VECTOR_DIM) for t in texts]
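

# Note: hash-based fallback vectors are placeholders only. They keep the
# pipeline running when Gemini is unreachable, but they are not semantically
# comparable with real embeddings, so retrieval quality drops for chunks
# stored this way.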


def _hash_vector(text: str, dim: int = VECTOR_DIM) -> List[float]:
    """Deterministic pseudo-vector from SHA-512 hash. Last-resort fallback only."""
    h = hashlib.sha512(text.encode("utf-8")).digest()
    repeat = (dim * 4 // len(h)) + 1
    expanded = h * repeat
    # Decode as unsigned 32-bit ints scaled to [-1, 1]; reinterpreting raw hash
    # bytes as float32 can produce NaN/inf values, which Qdrant rejects.
    ints = struct.unpack(f">{dim}I", expanded[: dim * 4])
    values = [(v / 0xFFFFFFFF) * 2.0 - 1.0 for v in ints]
    norm = sum(v * v for v in values) ** 0.5
    if norm == 0:
        return [0.0] * dim
    return [v / norm for v in values]


def _point_id(video_id: str, chunk_idx: int) -> int:
    """Deterministic positive 64-bit integer ID for a Qdrant point."""
    raw = f"{video_id}_{chunk_idx}".encode("utf-8")
    h = hashlib.md5(raw).hexdigest()
    return int(h[:16], 16) & 0x7FFFFFFFFFFFFFFF
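

# Because _point_id is deterministic, re-ingesting the same video overwrites
# its existing chunk points (Qdrant upsert semantics) instead of duplicating them.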


# ---------------------------------------------------------------------------
# Qdrant storage
# ---------------------------------------------------------------------------

def ensure_qdrant_collection(client) -> None:
    """Create 'research_reports' collection if it does not exist."""
    from qdrant_client.models import VectorParams, Distance

    existing = {c.name for c in client.get_collections().collections}
    if QDRANT_COLLECTION not in existing:
        client.create_collection(
            collection_name=QDRANT_COLLECTION,
            vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),
        )
        logger.info(f"Created Qdrant collection '{QDRANT_COLLECTION}' ({VECTOR_DIM}-dim cosine).")
    else:
        logger.debug(f"Qdrant collection '{QDRANT_COLLECTION}' already exists.")


def upsert_to_qdrant(
    client,
    video_id: str,
    chunks: List[str],
    vectors: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """
    Upsert transcript chunks as Qdrant points.
    Returns count of points upserted.
    """
    from qdrant_client.models import PointStruct

    points = []
    source = metadata.get("source", "youtube")
    for i, (chunk, vector) in enumerate(zip(chunks, vectors)):
        points.append(
            PointStruct(
                id=_point_id(video_id, i),
                vector=vector,
                payload={
                    "video_id": video_id,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "text": chunk,
                    "title": metadata.get("title", ""),
                    "channel_name": metadata.get("channel_name", ""),
                    "source": source,  # 'watch_history' | 'watch_later'
                    "video_url": metadata.get("video_url", build_video_url(video_id)),
                    "language": metadata.get("language", "en"),
                    "word_count": metadata.get("word_count", 0),
                    "ingested_at": datetime.now(timezone.utc).isoformat(),
                    "collection_type": "youtube_intel",
                },
            )
        )

    if points:
        client.upsert(collection_name=QDRANT_COLLECTION, points=points)
        logger.debug(f"Upserted {len(points)} chunks for {video_id} into Qdrant.")

    return len(points)
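

# Illustrative retrieval sketch (the query text is a placeholder; assumes the
# chunks were embedded with the same Gemini model):
#
#   query_vec = embed_texts(["agent orchestration patterns"])[0]
#   hits = client.search(
#       collection_name=QDRANT_COLLECTION,
#       query_vector=query_vec,
#       limit=5,
#   )
#   for hit in hits:
#       print(hit.score, hit.payload["video_id"], hit.payload["chunk_index"])
#
# Note that embed_texts uses task_type="retrieval_document"; for query-side
# embeddings Gemini recommends task_type="retrieval_query".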


def get_qdrant_client():
    """Get configured Qdrant client from elestio_config."""
    from qdrant_client import QdrantClient
    from elestio_config import QdrantConfig

    cfg = QdrantConfig()
    return QdrantClient(url=cfg.url, api_key=cfg.api_key)


# ---------------------------------------------------------------------------
# KG entity writer
# ---------------------------------------------------------------------------

def write_kg_entity(record: Dict[str, Any], chunks_count: int) -> None:
    """
    Write a KG entity JSONL entry for this video to
    KNOWLEDGE_GRAPH/entities/youtube_intel_YYYY-MM-DD.jsonl
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    entity_file = KG_ENTITIES_DIR / f"youtube_intel_{today}.jsonl"
    KG_ENTITIES_DIR.mkdir(parents=True, exist_ok=True)

    entity = {
        "id": f"yt_intel_{record['video_id']}_{int(time.time())}",
        "type": "youtube_video_intel",
        "video_id": record["video_id"],
        "title": record.get("title", ""),
        "channel_name": record.get("channel_name", ""),
        "video_url": record.get("video_url", build_video_url(record["video_id"])),
        "source": record.get("source", "unknown"),  # watch_history | watch_later
        "position": record.get("position"),
        "language": record.get("language", "en"),
        "word_count": record.get("word_count", 0),
        "chunks_stored": chunks_count,
        "qdrant_collection": QDRANT_COLLECTION,
        "ingested_at": datetime.now(timezone.utc).isoformat(),
        "tags": ["youtube", "intel", record.get("source", "")],
    }

    with open(entity_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(entity, ensure_ascii=False) + "\n")

    logger.debug(f"KG entity written for {record['video_id']} -> {entity_file.name}")


# ---------------------------------------------------------------------------
# Main ingestion pipeline
# ---------------------------------------------------------------------------

def ingest_videos(
    video_records: List[Dict[str, Any]],
    use_qdrant: bool = True,
    dry_run: bool = False,
    conn=None,
) -> Dict[str, Any]:
    """
    Core ingestion loop.

    Args:
        video_records: List of dicts with at minimum 'video_id'. May also include
                       title, channel_name, source, position, video_url.
        use_qdrant:    Whether to embed and store in Qdrant.
        dry_run:       If True, fetch transcripts but do not store anything.
        conn:          Open PostgreSQL connection. If None, opens its own.

    Returns:
        Summary dict.
    """
    results = {
        "total": len(video_records),
        "transcripts_fetched": 0,
        "no_transcript": 0,
        "qdrant_chunks": 0,
        "kg_entities": 0,
        "failed": 0,
        "video_results": [],
    }

    qdrant_client = None
    if use_qdrant and not dry_run:
        try:
            qdrant_client = get_qdrant_client()
            ensure_qdrant_collection(qdrant_client)
        except Exception as e:
            logger.error(f"Qdrant init failed: {e}. Will skip vector storage.")
            use_qdrant = False

    owns_conn = conn is None
    if owns_conn and not dry_run:
        conn = get_db_connection()

    try:
        for i, record in enumerate(video_records, start=1):
            video_id = record.get("video_id")
            if not video_id:
                logger.warning(f"Record {i} missing video_id, skipping.")
                continue

            title = record.get("title", "")
            source = record.get("source", "unknown")
            logger.info(
                f"[{i}/{len(video_records)}] Ingesting {video_id} "
                f"({source} pos={record.get('position', '?')}): {title[:60]}"
            )

            # --- Fetch transcript ---
            transcript_data = None
            try:
                transcript_data = fetch_transcript(video_id)
            except Exception as e:
                logger.warning(f"  Transcript fetch error for {video_id}: {e}")

            if not transcript_data:
                logger.warning(f"  No transcript available for {video_id}.")
                results["no_transcript"] += 1
                results["video_results"].append({
                    "video_id": video_id,
                    "title": title,
                    "status": "no_transcript",
                })
                continue

            results["transcripts_fetched"] += 1
            word_count = transcript_data["word_count"]
            language = transcript_data["language"]
            logger.info(f"  Transcript: {word_count} words, lang={language}")

            if dry_run:
                results["video_results"].append({
                    "video_id": video_id,
                    "title": title,
                    "status": "dry_run_ok",
                    "word_count": word_count,
                    "language": language,
                })
                continue

            # --- Chunk transcript ---
            chunks = chunk_text(transcript_data["transcript"])
            logger.info(f"  Chunked into {len(chunks)} segments.")

            # --- Embed + store in Qdrant ---
            chunks_upserted = 0
            if use_qdrant and qdrant_client and chunks:
                try:
                    vectors = embed_texts(chunks)
                    meta = {
                        **record,
                        "language": language,
                        "word_count": word_count,
                    }
                    chunks_upserted = upsert_to_qdrant(
                        qdrant_client, video_id, chunks, vectors, meta
                    )
                    results["qdrant_chunks"] += chunks_upserted
                    logger.info(f"  Qdrant: {chunks_upserted} chunks stored.")
                except Exception as e:
                    logger.error(f"  Qdrant upsert failed for {video_id}: {e}")

            # --- Write KG entity ---
            try:
                kg_record = {
                    **record,
                    "language": language,
                    "word_count": word_count,
                }
                write_kg_entity(kg_record, chunks_count=chunks_upserted or len(chunks))
                results["kg_entities"] += 1
            except Exception as e:
                logger.warning(f"  KG entity write failed for {video_id}: {e}")

            # --- Mark as ingested in PostgreSQL ---
            if conn:
                try:
                    mark_ingested(conn, video_id, source=source if source != "unknown" else None)
                except Exception as e:
                    logger.warning(f"  Failed to mark {video_id} as ingested: {e}")

            results["video_results"].append({
                "video_id": video_id,
                "title": title,
                "status": "success",
                "word_count": word_count,
                "language": language,
                "chunks": chunks_upserted,
            })

            # Throttle between videos to reduce the risk of YouTube rate limits / IP blocks
            if i < len(video_records):
                time.sleep(0.4)

    finally:
        if owns_conn and conn:
            conn.close()

    return results
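

# Minimal programmatic usage sketch (only 'video_id' is required per record):
#   summary = ingest_videos(
#       [{"video_id": "dQw4w9WgXcQ", "source": "manual"}],
#       use_qdrant=False,
#       dry_run=True,
#   )
#   print(summary["transcripts_fetched"], summary["no_transcript"])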


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(
        description="Ingest YouTube transcripts into Genesis Qdrant 'research_reports' + KG.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Source selection
    source_group = parser.add_mutually_exclusive_group(required=True)
    source_group.add_argument(
        "--pending",
        action="store_true",
        help="Ingest all videos from youtube_intel table with transcript_ingested=FALSE",
    )
    source_group.add_argument(
        "--urls",
        nargs="+",
        metavar="URL_OR_ID",
        help="Specific YouTube URLs or video IDs to ingest",
    )
    source_group.add_argument(
        "--from-json",
        metavar="FILE",
        help="JSON file with list of video records (output from youtube_watchlist_scraper.py)",
    )

    # Filters
    parser.add_argument(
        "--source",
        choices=["watch_history", "watch_later"],
        help="Filter --pending by source (watch_history or watch_later)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=500,
        help="Max videos to ingest when using --pending (default: 500)",
    )

    # Flags
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Fetch transcripts but do not store to Qdrant or PostgreSQL",
    )
    parser.add_argument(
        "--no-qdrant",
        action="store_true",
        help="Skip Qdrant embedding (only write KG entities)",
    )

    args = parser.parse_args()

    # Build video records list
    video_records: List[Dict[str, Any]] = []

    if args.pending:
        logger.info("Loading pending videos from youtube_intel table...")
        try:
            conn = get_db_connection()
            video_records = get_pending_records(
                conn, source=args.source, limit=args.limit
            )
            conn.close()
        except Exception as e:
            logger.error(f"Failed to load pending records: {e}")
            sys.exit(1)
        logger.info(f"Found {len(video_records)} pending videos.")

    elif args.urls:
        for url_or_id in args.urls:
            vid = extract_video_id(url_or_id)
            if vid:
                video_records.append({
                    "video_id": vid,
                    "title": "",
                    "channel_name": "",
                    "source": "manual",
                    "video_url": build_video_url(vid),
                })
            else:
                logger.warning(f"Could not extract video ID from: {url_or_id}")
        logger.info(f"Prepared {len(video_records)} videos from CLI arguments.")

    elif args.from_json:
        json_path = Path(args.from_json)
        if not json_path.exists():
            logger.error(f"JSON file not found: {json_path}")
            sys.exit(1)
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
        # Accept list of records or {"video_ids": [...]} format
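        # Illustrative shapes (all values are placeholders):
        #   [{"video_id": "dQw4w9WgXcQ", "title": "...", "source": "watch_later"}]
        #   {"video_ids": ["dQw4w9WgXcQ", "..."]}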
        if isinstance(data, list):
            video_records = data
        elif isinstance(data, dict) and "video_ids" in data:
            for vid in data["video_ids"]:
                video_records.append({
                    "video_id": vid,
                    "title": "",
                    "channel_name": "",
                    "source": "json_import",
                    "video_url": build_video_url(vid),
                })
        else:
            logger.error("JSON file must be a list of records or have a 'video_ids' key.")
            sys.exit(1)
        logger.info(f"Loaded {len(video_records)} videos from {json_path.name}.")

    if not video_records:
        logger.warning("No videos to ingest.")
        print(json.dumps({"total": 0, "transcripts_fetched": 0, "qdrant_chunks": 0}))
        return

    # Run ingestion
    results = ingest_videos(
        video_records=video_records,
        use_qdrant=not args.no_qdrant,
        dry_run=args.dry_run,
    )

    # Print summary
    summary = {k: v for k, v in results.items() if k != "video_results"}
    print(json.dumps(summary, indent=2))

    logger.info(
        f"Ingestion complete: {results['transcripts_fetched']}/{results['total']} transcripts, "
        f"{results['qdrant_chunks']} Qdrant chunks, {results['kg_entities']} KG entities."
    )


if __name__ == "__main__":
    main()
