#!/usr/bin/env python3
"""
Nightly YouTube Transcript Ingestion Pipeline
================================================
Automated pipeline that runs nightly to:
1. Check YouTube RSS feeds for each channel in youtube_scout_channels.json
2. Find videos published in the last 24-48 hours
3. Fetch transcripts via Supadata.ai API
4. Save raw JSON + processed markdown to data/youtube_knowledge_base/
5. Update index.json with ingested videos
6. Log all results to overnight_logs/

Designed to run from WSL (where E: drive is accessible at /mnt/e/).
Can also run from any environment where GENESIS_ROOT is accessible.

Usage:
    # Normal nightly run (processes last 24h of videos)
    python3 scripts/nightly_youtube_ingest.py

    # Process last N hours (e.g., catch up after downtime)
    python3 scripts/nightly_youtube_ingest.py --hours 72

    # Dry run (check feeds, don't fetch transcripts)
    python3 scripts/nightly_youtube_ingest.py --dry-run

    # Process specific channel only
    python3 scripts/nightly_youtube_ingest.py --channel "IndyDevDan"

    # Force re-process already-ingested videos
    python3 scripts/nightly_youtube_ingest.py --force

Cron entry (2:00 AM AEST = 16:00 UTC previous day):
    0 16 * * * /bin/bash -c 'set -a; source /mnt/e/genesis-system/config/secrets.env; set +a; python3 /mnt/e/genesis-system/scripts/nightly_youtube_ingest.py >> /mnt/e/genesis-system/overnight_logs/youtube_ingest_$(date +\\%Y-\\%m-\\%d).log 2>&1'

Author: Genesis System
Version: 1.0.0
Date: 2026-02-26
"""

import argparse
import json
import logging
import os
import re
import sys
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# ---------------------------------------------------------------------------
# Environment detection & path setup
# ---------------------------------------------------------------------------

def detect_genesis_root() -> Path:
    """Locate the Genesis root directory.

    Resolution order:
        1. GENESIS_ROOT environment variable (explicit override — checked
           first so it actually overrides the hard-coded locations).
        2. Known WSL / Windows install locations.
        3. The parent of this script's directory (repo-relative fallback).

    A candidate only counts as the root if it contains CLAUDE.md, which
    guards against matching an unrelated directory.

    Returns:
        Path to the Genesis root directory.

    Raises:
        FileNotFoundError: If no candidate contains CLAUDE.md.
    """
    candidates: List[Path] = []
    # Explicit override wins; skip it entirely when unset/empty so that
    # Path("") does not silently become Path(".").
    env_root = os.environ.get("GENESIS_ROOT", "").strip()
    if env_root:
        candidates.append(Path(env_root))
    candidates.append(Path("/mnt/e/genesis-system"))  # WSL
    candidates.append(Path("E:/genesis-system"))      # Windows native

    for p in candidates:
        if p.exists() and (p / "CLAUDE.md").exists():
            return p

    # Fallback: relative to this script (scripts/ -> repo root)
    script_dir = Path(__file__).resolve().parent.parent
    if (script_dir / "CLAUDE.md").exists():
        return script_dir
    raise FileNotFoundError(
        "Cannot find Genesis root. Set GENESIS_ROOT env var or run from within the repo."
    )


GENESIS_ROOT = detect_genesis_root()
# Input: the channel roster checked each night.
CHANNELS_FILE = GENESIS_ROOT / "data" / "youtube_scout_channels.json"
# Output tree: raw API payloads, processed markdown, and the search index.
KB_DIR = GENESIS_ROOT / "data" / "youtube_knowledge_base"
RAW_DIR = KB_DIR / "raw"
PROCESSED_DIR = KB_DIR / "processed"
INDEX_FILE = KB_DIR / "index.json"
# Per-day run logs (also the target of the cron redirection in the header).
LOG_DIR = GENESIS_ROOT / "overnight_logs"
# Knowledge-graph entity JSONL records are appended here per ingested video.
KG_ENTITIES_DIR = GENESIS_ROOT / "KNOWLEDGE_GRAPH" / "entities"

# Supadata API config
SUPADATA_API_URL = "https://api.supadata.ai/v1/transcript"

# YouTube RSS feed template
YT_RSS_TEMPLATE = "https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"

# ---------------------------------------------------------------------------
# Logging setup
# ---------------------------------------------------------------------------

# Log to both stdout (captured by cron) and a dated file in overnight_logs/.
LOG_DIR.mkdir(parents=True, exist_ok=True)
# today_str is also reused by update_index and write_kg_entity for per-day files.
today_str = datetime.now().strftime("%Y-%m-%d")
log_file = LOG_DIR / f"youtube_ingest_{today_str}.log"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(str(log_file), mode="a", encoding="utf-8"),
    ],
)
logger = logging.getLogger("nightly_yt_ingest")


# ---------------------------------------------------------------------------
# Supadata API key loading
# ---------------------------------------------------------------------------

def load_supadata_api_key() -> str:
    """Return the Supadata API key.

    The environment is consulted first (two accepted variable names), then
    config/secrets.env is parsed line by line as a fallback. Surrounding
    whitespace and quote characters are stripped from the value.

    Raises:
        ValueError: If no key is found in either location.
    """
    accepted_names = (
        "SUPADATA_API_KEY",
        "SUPADATA_YOUTUBE_TRANSCRIPT_API_KEY",
    )

    def _clean(raw: str) -> str:
        # Remove whitespace plus any wrapping double/single quotes.
        return raw.strip().strip('"').strip("'")

    # 1) Environment variables take precedence.
    for name in accepted_names:
        value = os.environ.get(name)
        if value:
            return _clean(value)

    # 2) Fall back to the secrets file inside the Genesis tree.
    secrets_file = GENESIS_ROOT / "config" / "secrets.env"
    if secrets_file.exists():
        with open(secrets_file, "r", encoding="utf-8") as f:
            for raw_line in f:
                entry = raw_line.strip()
                if not entry or entry.startswith("#"):
                    continue
                for name in accepted_names:
                    if entry.startswith(f"{name}="):
                        candidate = _clean(entry.split("=", 1)[1])
                        if candidate:
                            return candidate

    raise ValueError(
        "Supadata API key not found. Set SUPADATA_API_KEY or "
        "SUPADATA_YOUTUBE_TRANSCRIPT_API_KEY in environment or config/secrets.env"
    )


# ---------------------------------------------------------------------------
# HTTP helpers (no external deps beyond requests)
# ---------------------------------------------------------------------------

# Hard dependency: 'requests' is the script's only third-party requirement.
# Fail fast with an install hint rather than a bare ImportError mid-run.
try:
    import requests
except ImportError:
    logger.error("'requests' library is required. Install: pip3 install requests")
    sys.exit(1)


def http_get_with_retry(
    url: str,
    headers: Optional[Dict[str, str]] = None,
    params: Optional[Dict[str, str]] = None,
    max_retries: int = 3,
    initial_backoff: float = 2.0,
) -> requests.Response:
    """HTTP GET with exponential backoff retry.

    Retries on 429 (honouring a numeric Retry-After header), 5xx responses,
    and transport-level errors. Any other status is returned immediately.

    Args:
        url: Target URL.
        headers: Optional request headers.
        params: Optional query parameters.
        max_retries: Total number of attempts.
        initial_backoff: First sleep in seconds; doubles after each retry.

    Returns:
        The final requests.Response. When every attempt was rate limited or
        a 5xx, the last such response is returned so the caller can inspect
        its status/body (previously this path could hit an unbound `resp`).

    Raises:
        requests.exceptions.RequestException: If all attempts failed at the
            transport level (no response was ever received).
        RuntimeError: If called with max_retries < 1.
    """
    backoff = initial_backoff
    last_exc: Optional[Exception] = None
    last_resp = None

    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, headers=headers, params=params, timeout=30)
        except requests.exceptions.RequestException as e:
            last_exc = e
            if attempt < max_retries:
                logger.warning(
                    f"  Request error: {e}. Retrying in {backoff}s "
                    f"(attempt {attempt}/{max_retries})"
                )
                time.sleep(backoff)
                backoff *= 2
            continue

        last_resp = resp

        if resp.status_code == 429:
            # Only sleep when another attempt remains — no wasted final delay.
            if attempt < max_retries:
                # Retry-After may be an HTTP date; fall back to our backoff
                # if it is not a plain number of seconds.
                try:
                    retry_after = int(float(resp.headers.get("Retry-After", backoff)))
                except ValueError:
                    retry_after = int(backoff)
                logger.warning(
                    f"  Rate limited (429). Waiting {retry_after}s "
                    f"(attempt {attempt}/{max_retries})"
                )
                time.sleep(retry_after)
                backoff *= 2
            continue

        if resp.status_code >= 500:
            if attempt < max_retries:
                logger.warning(
                    f"  Server error ({resp.status_code}). Retrying in {backoff}s "
                    f"(attempt {attempt}/{max_retries})"
                )
                time.sleep(backoff)
                backoff *= 2
            continue

        return resp

    # Retries exhausted: prefer returning the last bad response so callers
    # can inspect it; raise only when no response was ever received.
    if last_resp is not None:
        return last_resp
    if last_exc is not None:
        raise last_exc
    raise RuntimeError("http_get_with_retry called with max_retries < 1")


# ---------------------------------------------------------------------------
# YouTube handle -> channel ID resolver
# ---------------------------------------------------------------------------

# Cache file for resolved channel IDs (persists across runs).
# Maps "@handle" (as given in the channel config) -> "UC..." channel ID.
CHANNEL_ID_CACHE_FILE = GENESIS_ROOT / "data" / "youtube_channel_id_cache.json"


def load_channel_id_cache() -> Dict[str, str]:
    """Return the persisted handle -> channel_id mapping.

    A missing, unreadable, or corrupt cache file yields an empty dict so
    callers never have to special-case a cold cache.
    """
    if not CHANNEL_ID_CACHE_FILE.exists():
        return {}
    try:
        with CHANNEL_ID_CACHE_FILE.open("r", encoding="utf-8") as fh:
            return json.load(fh)
    except (json.JSONDecodeError, IOError):
        return {}


def save_channel_id_cache(cache: Dict[str, str]) -> None:
    """Persist the handle -> channel_id mapping, creating parent dirs as needed."""
    CHANNEL_ID_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
    with CHANNEL_ID_CACHE_FILE.open("w", encoding="utf-8") as fh:
        json.dump(cache, fh, indent=2)


def resolve_handle_to_channel_id(handle: str) -> Optional[str]:
    """
    Resolve a YouTube @handle to its "UC..." channel ID.

    Fetches the public channel page at youtube.com/@handle and scans the
    HTML for the channel ID, which YouTube embeds in several JSON fields
    and canonical/alternate links. Returns None (with a logged warning)
    on any failure.
    """
    if not handle:
        return None

    # Normalize: accept both "@name" and "name".
    bare = handle.lstrip("@")
    page_url = f"https://www.youtube.com/@{bare}"
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }

    # Known places the channel ID appears in the page source, in order of
    # preference.
    id_patterns = (
        r'"channelId"\s*:\s*"(UC[a-zA-Z0-9_-]+)"',
        r'/channel/(UC[a-zA-Z0-9_-]+)',
        r'"externalId"\s*:\s*"(UC[a-zA-Z0-9_-]+)"',
    )

    try:
        resp = requests.get(page_url, timeout=15, headers=request_headers)
        if resp.status_code != 200:
            logger.warning(f"  Handle resolution failed for @{bare}: HTTP {resp.status_code}")
            return None

        page_html = resp.text
        for pattern in id_patterns:
            found = re.search(pattern, page_html)
            if found:
                return found.group(1)

        logger.warning(f"  Could not extract channel ID from @{bare} page")
        return None

    except Exception as e:
        # Best-effort scrape: network/parse problems are logged, not raised.
        logger.warning(f"  Handle resolution error for @{bare}: {e}")
        return None


def resolve_channel_ids(channels: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Fill in missing channel IDs by resolving YouTube handles.

    Consults the persistent cache first and scrapes YouTube only on cache
    misses; newly resolved IDs are written back to the cache. The channels
    list is mutated in-place and also returned for convenience.
    """
    cache = load_channel_id_cache()
    dirty = False
    n_resolved = 0

    for entry in channels:
        name = entry.get("name", "Unknown")
        handle = entry.get("handle", "")
        existing_id = entry.get("id", "")

        if existing_id:
            # Channel already has an ID: opportunistically backfill the cache.
            if handle and handle not in cache:
                cache[handle] = existing_id
                dirty = True
            continue

        if not handle:
            logger.warning(f"  {name}: No ID and no handle - cannot resolve")
            continue

        # Cache hit: no network needed.
        if handle in cache:
            entry["id"] = cache[handle]
            logger.info(f"  {name}: Resolved from cache -> {cache[handle]}")
            n_resolved += 1
            continue

        # Cache miss: scrape the channel page.
        logger.info(f"  {name}: Resolving @{handle.lstrip('@')} ...")
        resolved = resolve_handle_to_channel_id(handle)

        if resolved:
            entry["id"] = resolved
            cache[handle] = resolved
            dirty = True
            n_resolved += 1
            logger.info(f"  {name}: Resolved -> {resolved}")
            # Small delay between page scrapes to stay polite.
            time.sleep(0.5)
        else:
            logger.warning(f"  {name}: Could not resolve handle @{handle.lstrip('@')}")

    if dirty:
        save_channel_id_cache(cache)
        logger.info(f"  Channel ID cache updated ({len(cache)} entries)")

    if n_resolved > 0:
        logger.info(f"  Resolved {n_resolved} channel IDs from handles")

    return channels


# ---------------------------------------------------------------------------
# YouTube RSS feed parsing
# ---------------------------------------------------------------------------

# XML namespaces used in YouTube Atom feeds:
#   atom  - standard Atom feed elements (title, entry, published, link)
#   yt    - YouTube extensions (videoId)
#   media - Media RSS namespace declared by the feed (unused here but kept
#           so callers can query media:* elements with the same map)
YT_NS = {
    "atom": "http://www.w3.org/2005/Atom",
    "yt": "http://www.youtube.com/xml/schemas/2015",
    "media": "http://search.yahoo.com/mrss/",
}


def fetch_channel_rss(channel_id: str) -> List[Dict[str, Any]]:
    """
    Return recent videos from a channel's public Atom feed.

    Each result dict carries: video_id, title, published (raw string),
    published_dt (parsed datetime or None), channel_name, link.
    YouTube feeds expose at most the 15 newest videos; any error yields [].
    """
    feed_url = YT_RSS_TEMPLATE.format(channel_id=channel_id)
    results: List[Dict[str, Any]] = []

    try:
        resp = http_get_with_retry(feed_url)
        if resp.status_code != 200:
            logger.warning(
                f"  RSS feed returned {resp.status_code} for channel {channel_id}"
            )
            return []

        root = ET.fromstring(resp.content)

        # The channel display name lives in the feed-level <title>.
        title_node = root.find("atom:title", YT_NS)
        channel_name = title_node.text if title_node is not None else ""

        for entry in root.findall("atom:entry", YT_NS):
            id_node = entry.find("yt:videoId", YT_NS)
            if id_node is None:
                # Entry without a video ID is useless to us.
                continue
            video_id = id_node.text

            t_node = entry.find("atom:title", YT_NS)
            p_node = entry.find("atom:published", YT_NS)
            l_node = entry.find("atom:link", YT_NS)

            title = t_node.text if t_node is not None else ""
            published_str = p_node.text if p_node is not None else ""
            link = l_node.get("href", "") if l_node is not None else ""

            # Feed timestamps are ISO 8601; normalize a trailing Z so
            # fromisoformat accepts it. Unparseable dates become None.
            published_dt = None
            if published_str:
                try:
                    published_dt = datetime.fromisoformat(
                        published_str.replace("Z", "+00:00")
                    )
                except ValueError:
                    published_dt = None

            results.append({
                "video_id": video_id,
                "title": title,
                "published": published_str,
                "published_dt": published_dt,
                "channel_name": channel_name,
                "link": link or f"https://www.youtube.com/watch?v={video_id}",
            })

    except ET.ParseError as e:
        logger.error(f"  XML parse error for channel {channel_id}: {e}")
    except Exception as e:
        logger.error(f"  Error fetching RSS for channel {channel_id}: {e}")

    return results


def filter_recent_videos(
    videos: List[Dict[str, Any]],
    hours: int = 48,
) -> List[Dict[str, Any]]:
    """Return only the videos published within the last *hours* hours.

    Entries whose publish date could not be parsed (published_dt is None)
    are kept — over-ingesting beats silently missing a video. Naive
    datetimes are treated as UTC.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)

    def _is_recent(entry: Dict[str, Any]) -> bool:
        stamp = entry.get("published_dt")
        if stamp is None:
            return True
        if stamp.tzinfo is None:
            stamp = stamp.replace(tzinfo=timezone.utc)
        return stamp >= cutoff

    return [v for v in videos if _is_recent(v)]


# ---------------------------------------------------------------------------
# Already-ingested tracking
# ---------------------------------------------------------------------------

def load_ingested_ids() -> set:
    """Collect video IDs that were ingested on previous runs.

    Union of the IDs recorded in index.json and the stems of the raw JSON
    files on disk, so either record alone is enough to skip a video.
    Errors reading the index are ignored (the raw directory still covers us).
    """
    seen: set = set()

    # Source 1: index.json entries.
    if INDEX_FILE.exists():
        try:
            with open(INDEX_FILE, "r", encoding="utf-8") as fh:
                index_data = json.load(fh)
            seen.update(
                entry["video_id"]
                for entry in index_data.get("videos", [])
                if entry.get("video_id")
            )
        except (json.JSONDecodeError, KeyError):
            pass

    # Source 2: raw payload files named {video_id}.json.
    if RAW_DIR.exists():
        seen.update(p.stem for p in RAW_DIR.iterdir() if p.suffix == ".json")

    return seen


# ---------------------------------------------------------------------------
# Supadata transcript fetching
# ---------------------------------------------------------------------------

def fetch_transcript_supadata(
    video_id: str,
    api_key: str,
) -> Optional[Dict[str, Any]]:
    """
    Request a transcript for one video from the Supadata API.

    200 returns the payload (annotated with video_id / fetched_at).
    202 means the job was queued server-side; we poll it to completion.
    404/206 mean no transcript exists; 403 means access denied.
    All failures return None.
    """
    request_headers = {
        "x-api-key": api_key,
    }
    query = {
        "url": f"https://www.youtube.com/watch?v={video_id}",
        "mode": "auto",
        "text": "true",
    }

    try:
        resp = http_get_with_retry(
            SUPADATA_API_URL,
            headers=request_headers,
            params=query,
            max_retries=3,
            initial_backoff=2.0,
        )

        status = resp.status_code

        if status == 200:
            payload = resp.json()
            payload["video_id"] = video_id
            payload["fetched_at"] = datetime.now(timezone.utc).isoformat()
            return payload

        if status == 202:
            # Transcript generation was queued; poll the job endpoint.
            job_id = resp.json().get("jobId")
            if job_id:
                return _poll_supadata_job(job_id, video_id, api_key)
            logger.warning(f"  Got 202 but no jobId for {video_id}")
            return None

        if status in (404, 206):
            logger.info(f"  No transcript available for {video_id} ({status})")
            return None

        if status == 403:
            logger.warning(f"  Access denied for {video_id} (403)")
            return None

        logger.warning(
            f"  Unexpected status {status} for {video_id}: "
            f"{resp.text[:200]}"
        )
        return None

    except Exception as e:
        logger.error(f"  Supadata API error for {video_id}: {e}")
        return None


def _poll_supadata_job(
    job_id: str,
    video_id: str,
    api_key: str,
    max_attempts: int = 60,
) -> Optional[Dict[str, Any]]:
    """Poll a queued Supadata transcript job (one request per second).

    Returns the completed payload (annotated with video_id / fetched_at),
    or None when the job reports failure or max_attempts polls elapse.
    """
    poll_url = f"https://api.supadata.ai/v1/transcript/{job_id}"
    request_headers = {"x-api-key": api_key}

    for _ in range(max_attempts):
        time.sleep(1)
        try:
            resp = requests.get(poll_url, headers=request_headers, timeout=15)
            if resp.status_code == 202:
                continue  # still queued server-side
            if resp.status_code != 200:
                continue  # transient status; keep polling
            payload = resp.json()
        except Exception as e:
            logger.warning(f"  Poll error for {video_id}: {e}")
            continue

        job_state = payload.get("status")
        if job_state in ("queued", "active"):
            continue
        if job_state == "failed":
            logger.warning(f"  Async job failed for {video_id}")
            return None

        # Any other state means the job finished with a result.
        payload["video_id"] = video_id
        payload["fetched_at"] = datetime.now(timezone.utc).isoformat()
        return payload

    logger.warning(f"  Async job timed out for {video_id}")
    return None


# ---------------------------------------------------------------------------
# Output writers
# ---------------------------------------------------------------------------

def save_raw_json(video_id: str, data: Dict[str, Any]) -> Path:
    """Persist the raw Supadata payload as raw/{video_id}.json and return the path.

    default=str keeps serialization from failing on non-JSON types (e.g.
    datetimes) that may have been injected into the payload.
    """
    RAW_DIR.mkdir(parents=True, exist_ok=True)
    target = RAW_DIR / f"{video_id}.json"
    with open(target, "w", encoding="utf-8") as fh:
        json.dump(data, fh, indent=2, ensure_ascii=False, default=str)
    return target


def save_processed_markdown(
    video_id: str,
    title: str,
    channel_name: str,
    transcript_text: str,
    source_url: str,
    published: str = "",
) -> Path:
    """Write the transcript as processed/{video_id}.md and return its path.

    The file starts with a metadata header (channel, URL, IDs, timestamps)
    followed by the full transcript text.
    """
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    target = PROCESSED_DIR / f"{video_id}.md"

    fetched_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    document = (
        f"# {title}\n"
        f"\n"
        f"**Channel**: {channel_name}\n"
        f"**URL**: {source_url}\n"
        f"**Video ID**: `{video_id}`\n"
        f"**Published**: {published}\n"
        f"**Fetched**: {fetched_stamp}\n"
        f"\n"
        f"---\n"
        f"\n"
        f"## Transcript\n"
        f"\n"
        f"{transcript_text}\n"
    )

    with open(target, "w", encoding="utf-8") as fh:
        fh.write(document)
    return target


def extract_plain_text(data: Dict[str, Any]) -> str:
    """Extract the plain-text transcript from a Supadata response payload.

    Handles the shapes the API can return:
      * ``content`` as a non-empty string (text=true mode) — returned as-is.
      * ``content`` as a list of segment dicts — segment texts joined with
        spaces; a blank join now falls through to the fallbacks below
        instead of returning "" (previously a list of empty segments
        short-circuited the fallback keys).
      * top-level ``transcript`` or ``text`` string keys as a fallback.

    Returns:
        The transcript text, or "" when no usable text is found.
    """
    content = data.get("content", "")

    # Shape 1: a plain string (text=true mode).
    if isinstance(content, str) and content.strip():
        return content

    # Shape 2: a list of segment dicts; non-dict entries are ignored.
    if isinstance(content, list):
        joined = " ".join(
            seg.get("text", "") for seg in content if isinstance(seg, dict)
        )
        if joined.strip():
            return joined
        # Blank join: the segments carried no text — keep looking.

    # Shape 3: fallback top-level string keys.
    for key in ("transcript", "text"):
        val = data.get(key, "")
        if isinstance(val, str) and val.strip():
            return val

    return ""


def update_index(
    video_id: str,
    title: str,
    channel_name: str,
    word_count: int,
    topics: List[str],
) -> None:
    """Add a newly ingested video to index.json (no-op if already present).

    Maintains three sections: the flat ``videos`` list, the reverse
    ``topics`` -> [video_id] map, and aggregate ``stats``. Missing sections
    are recreated via setdefault, so an older or hand-edited index file
    cannot crash the run (the previous code raised KeyError when "topics"
    or "stats" was absent from an existing file).

    Args:
        video_id: YouTube video ID.
        title: Video title.
        channel_name: Channel display name.
        word_count: Transcript word count.
        topics: Topic tags inferred from title/transcript.
    """
    INDEX_FILE.parent.mkdir(parents=True, exist_ok=True)

    if INDEX_FILE.exists():
        with open(INDEX_FILE, "r", encoding="utf-8") as f:
            index = json.load(f)
    else:
        index = {
            "created": today_str,
            "description": "Genesis YouTube Knowledge Base - AI Tool Learning",
            "videos": [],
            "topics": {},
            "stats": {"total_videos": 0, "total_words": 0, "last_updated": ""},
        }

    # Tolerate an index file that predates any of these sections.
    videos = index.setdefault("videos", [])
    topic_map = index.setdefault("topics", {})
    stats = index.setdefault("stats", {})

    # Idempotence: never add the same video twice.
    if video_id in {v.get("video_id") for v in videos}:
        return

    # Add video entry
    videos.append({
        "video_id": video_id,
        "title": title,
        "channel": channel_name,
        "added": datetime.now(timezone.utc).isoformat(),
        "word_count": word_count,
        "topics": topics,
    })

    # Maintain the topic -> video_ids reverse index.
    for topic in topics:
        topic_map.setdefault(topic, [])
        if video_id not in topic_map[topic]:
            topic_map[topic].append(video_id)

    # Recompute aggregate stats from scratch (cheap and always consistent).
    stats["total_videos"] = len(videos)
    stats["total_words"] = sum(v.get("word_count", 0) for v in videos)
    stats["last_updated"] = datetime.now(timezone.utc).isoformat()

    with open(INDEX_FILE, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2, ensure_ascii=False)


def write_kg_entity(
    video_id: str,
    title: str,
    channel_name: str,
    word_count: int,
    topics: List[str],
    source_url: str,
) -> None:
    """Append a knowledge-graph entity record for this video.

    Records go into a per-day JSONL file under KNOWLEDGE_GRAPH/entities/,
    one JSON object per line. The record id carries a unix timestamp so
    re-ingests of the same video get distinct ids.
    """
    KG_ENTITIES_DIR.mkdir(parents=True, exist_ok=True)
    out_file = KG_ENTITIES_DIR / f"youtube_nightly_ingest_{today_str}.jsonl"

    record = {
        "id": f"yt_nightly_{video_id}_{int(time.time())}",
        "type": "youtube_video_nightly_ingest",
        "video_id": video_id,
        "title": title,
        "channel_name": channel_name,
        "video_url": source_url,
        "word_count": word_count,
        "topics": topics,
        "source": "nightly_rss_ingest",
        "ingested_at": datetime.now(timezone.utc).isoformat(),
        "tags": ["youtube", "nightly_ingest", channel_name.lower().replace(" ", "_")],
    }

    with open(out_file, "a", encoding="utf-8") as fh:
        fh.write(json.dumps(record, ensure_ascii=False) + "\n")


# ---------------------------------------------------------------------------
# Topic inference (simple keyword-based)
# ---------------------------------------------------------------------------

def infer_topics(title: str, transcript: str) -> List[str]:
    """Assign coarse topic tags by keyword matching.

    The title plus the first 2000 characters of the transcript are lowercased
    and scanned for each topic's keyword list (plain substring match). Topics
    come back in the table's declaration order; ["general_ai"] is returned
    when nothing matches.
    """
    haystack = (title + " " + transcript[:2000]).lower()
    topic_keywords = {
        "claude_code": ["claude code", "claude-code", "claude agent", "anthropic cli"],
        "mcp": ["mcp", "model context protocol", "mcp server", "mcp tool"],
        "ai_agents": ["ai agent", "agentic", "autonomous agent", "multi-agent"],
        "voice_ai": ["voice ai", "voice agent", "telnyx", "vapi", "voice bot", "webrtc"],
        "memory_systems": ["memory system", "long-term memory", "mem0", "letta", "zep", "knowledge graph"],
        "ghl": ["gohighlevel", "highlevel", "ghl", "go high level"],
        "n8n": ["n8n", "workflow automation"],
        "cloudflare": ["cloudflare", "workers", "durable objects", "cloudflare workers"],
        "llm": ["llm", "large language model", "gpt", "gemini", "ollama", "fine-tun"],
        "coding": ["coding", "programming", "developer", "software", "code"],
        "automation": ["automation", "automate", "workflow"],
        "open_source": ["open source", "open-source", "github", "hugging face"],
        "ai_news": ["ai news", "ai update", "new release", "just released", "announced"],
        "tutorial": ["tutorial", "how to", "step by step", "walkthrough", "guide"],
        "browser_use": ["browser use", "browser automation", "playwright", "puppeteer", "selenium"],
        "rag": ["rag", "retrieval augmented", "vector database", "embedding"],
        "telnyx": ["telnyx", "sip", "telephony"],
        "openclaw": ["openclaw", "open claw"],
    }

    tags = [
        topic
        for topic, keywords in topic_keywords.items()
        if any(kw in haystack for kw in keywords)
    ]
    return tags or ["general_ai"]


# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------

def run_pipeline(
    hours: int = 48,
    dry_run: bool = False,
    channel_filter: Optional[str] = None,
    force: bool = False,
) -> Dict[str, Any]:
    """
    Main nightly ingestion pipeline.

    Flow: load channel config -> resolve missing channel IDs from handles ->
    fetch each channel's RSS feed -> for every new video inside the lookback
    window, fetch its transcript via Supadata, save raw JSON + processed
    markdown, update index.json, and append a knowledge-graph entity.

    Args:
        hours: Look back this many hours for new videos (default 48 for safety)
        dry_run: If True, check feeds but don't fetch transcripts
        channel_filter: Only process this channel name (case-insensitive)
        force: Re-process even if video already ingested

    Returns:
        Summary dict with counts and details. On a fatal setup failure
        (missing config file, no API key, no channel matching the filter)
        a dict containing only an "error" key is returned instead.
    """
    logger.info("=" * 70)
    logger.info("NIGHTLY YOUTUBE TRANSCRIPT INGESTION PIPELINE")
    logger.info(f"  Genesis root: {GENESIS_ROOT}")
    logger.info(f"  Lookback: {hours} hours")
    logger.info(f"  Dry run: {dry_run}")
    logger.info(f"  Channel filter: {channel_filter or 'ALL'}")
    logger.info(f"  Force re-process: {force}")
    logger.info("=" * 70)

    # Load channel config
    if not CHANNELS_FILE.exists():
        logger.error(f"Channel config not found: {CHANNELS_FILE}")
        return {"error": "channels file not found"}

    with open(CHANNELS_FILE, "r", encoding="utf-8") as f:
        config = json.load(f)

    channels = config.get("channels", [])
    # Filter is a case-insensitive substring match on the channel name.
    if channel_filter:
        channels = [
            c for c in channels
            if channel_filter.lower() in c.get("name", "").lower()
        ]
        if not channels:
            logger.warning(f"No channels match filter: {channel_filter}")
            return {"error": f"no channels match: {channel_filter}"}

    logger.info(f"Processing {len(channels)} channels")

    # Resolve missing channel IDs from handles
    logger.info("Resolving channel IDs from handles...")
    channels = resolve_channel_ids(channels)

    # Load already-ingested video IDs (--force pretends nothing was ingested)
    ingested_ids = set() if force else load_ingested_ids()
    logger.info(f"Already ingested: {len(ingested_ids)} videos")

    # Load API key — only needed when we will actually call Supadata
    api_key = None
    if not dry_run:
        try:
            api_key = load_supadata_api_key()
            logger.info("Supadata API key loaded")
        except ValueError as e:
            logger.error(str(e))
            return {"error": str(e)}

    # Results tracking
    # NOTE(review): "transcripts_skipped" is never incremented anywhere below —
    # confirm whether it was meant to count dry-run or already-ingested videos.
    results = {
        "start_time": datetime.now(timezone.utc).isoformat(),
        "channels_checked": 0,
        "new_videos_found": 0,
        "transcripts_fetched": 0,
        "transcripts_failed": 0,
        "transcripts_skipped": 0,
        "already_ingested": 0,
        "total_words": 0,
        "videos": [],
        "errors": [],
    }

    # Process each channel
    for channel in channels:
        ch_name = channel.get("name", "Unknown")
        ch_id = channel.get("id", "")
        ch_priority = channel.get("priority", "medium")

        # Channels whose handle could not be resolved still lack an ID.
        if not ch_id:
            logger.warning(f"  Skipping {ch_name}: no channel ID")
            continue

        logger.info(f"\n--- Channel: {ch_name} (priority={ch_priority}) ---")
        results["channels_checked"] += 1

        # Fetch RSS feed
        rss_videos = fetch_channel_rss(ch_id)
        if not rss_videos:
            logger.info(f"  No videos in RSS feed for {ch_name}")
            continue

        logger.info(f"  RSS feed: {len(rss_videos)} videos total")

        # Filter to recent videos
        recent = filter_recent_videos(rss_videos, hours=hours)
        logger.info(f"  Recent (last {hours}h): {len(recent)} videos")

        if not recent:
            continue

        # Per-video ingestion: skip known videos, fetch transcript,
        # persist all artifacts, then throttle before the next call.
        for video in recent:
            vid = video["video_id"]
            title = video.get("title", "")
            published = video.get("published", "")
            source_url = video.get("link", f"https://www.youtube.com/watch?v={vid}")

            # Skip already ingested
            if vid in ingested_ids:
                logger.info(f"  SKIP (already ingested): {vid} - {title[:60]}")
                results["already_ingested"] += 1
                continue

            results["new_videos_found"] += 1
            logger.info(f"  NEW: {vid} - {title[:80]}")

            if dry_run:
                results["videos"].append({
                    "video_id": vid,
                    "title": title,
                    "channel": ch_name,
                    "published": published,
                    "status": "dry_run",
                })
                continue

            # Fetch transcript via Supadata
            transcript_data = fetch_transcript_supadata(vid, api_key)

            if not transcript_data:
                logger.warning(f"  FAILED: No transcript for {vid}")
                results["transcripts_failed"] += 1
                results["videos"].append({
                    "video_id": vid,
                    "title": title,
                    "channel": ch_name,
                    "status": "no_transcript",
                })
                # Brief pause before next request
                time.sleep(0.5)
                continue

            # Extract plain text
            plain_text = extract_plain_text(transcript_data)
            if not plain_text.strip():
                logger.warning(f"  EMPTY: Transcript text is empty for {vid}")
                results["transcripts_failed"] += 1
                results["videos"].append({
                    "video_id": vid,
                    "title": title,
                    "channel": ch_name,
                    "status": "empty_transcript",
                })
                time.sleep(0.5)
                continue

            word_count = len(plain_text.split())
            logger.info(f"  TRANSCRIPT: {word_count} words")
            results["transcripts_fetched"] += 1
            results["total_words"] += word_count

            # Get title from transcript data if not in RSS
            if not title:
                title = transcript_data.get("title", vid)

            # Inject extra metadata into the raw data so the saved JSON is
            # self-describing without needing the RSS feed again.
            transcript_data["channel_name"] = ch_name
            transcript_data["channel_id"] = ch_id
            transcript_data["published"] = published
            transcript_data["priority"] = ch_priority
            transcript_data["source_url"] = source_url
            transcript_data["title"] = title

            # Save raw JSON
            raw_path = save_raw_json(vid, transcript_data)
            logger.info(f"  SAVED raw: {raw_path.name}")

            # Save processed markdown
            md_path = save_processed_markdown(
                vid, title, ch_name, plain_text, source_url, published
            )
            logger.info(f"  SAVED processed: {md_path.name}")

            # Infer topics
            topics = infer_topics(title, plain_text)

            # Update index
            update_index(vid, title, ch_name, word_count, topics)

            # Write KG entity
            write_kg_entity(vid, title, ch_name, word_count, topics, source_url)

            results["videos"].append({
                "video_id": vid,
                "title": title,
                "channel": ch_name,
                "published": published,
                "word_count": word_count,
                "topics": topics,
                "status": "success",
            })

            # Mark as ingested for this session
            ingested_ids.add(vid)

            # Rate limiting between API calls
            time.sleep(1.0)

    # Final summary
    results["end_time"] = datetime.now(timezone.utc).isoformat()
    results["dry_run"] = dry_run

    logger.info("\n" + "=" * 70)
    logger.info("PIPELINE SUMMARY")
    logger.info(f"  Channels checked:    {results['channels_checked']}")
    logger.info(f"  New videos found:    {results['new_videos_found']}")
    logger.info(f"  Already ingested:    {results['already_ingested']}")
    logger.info(f"  Transcripts fetched: {results['transcripts_fetched']}")
    logger.info(f"  Transcripts failed:  {results['transcripts_failed']}")
    logger.info(f"  Total words:         {results['total_words']}")
    logger.info(f"  Start: {results['start_time']}")
    logger.info(f"  End:   {results['end_time']}")
    logger.info("=" * 70)

    return results


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    """CLI entry point: parse arguments, run the pipeline, print a JSON summary."""
    parser = argparse.ArgumentParser(
        description="Nightly YouTube transcript ingestion pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Normal nightly run
    python3 nightly_youtube_ingest.py

    # Catch up on last 3 days
    python3 nightly_youtube_ingest.py --hours 72

    # Dry run (check feeds only)
    python3 nightly_youtube_ingest.py --dry-run

    # Process specific channel
    python3 nightly_youtube_ingest.py --channel "IndyDevDan"
        """,
    )
    parser.add_argument(
        "--hours", type=int, default=48,
        help="Look back this many hours for new videos (default: 48)",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Check feeds but don't fetch transcripts",
    )
    parser.add_argument(
        "--channel", type=str, default=None,
        help="Only process channels matching this name (case-insensitive)",
    )
    parser.add_argument(
        "--force", action="store_true",
        help="Re-process even if video already ingested",
    )
    args = parser.parse_args()

    results = run_pipeline(
        hours=args.hours,
        dry_run=args.dry_run,
        channel_filter=args.channel,
        force=args.force,
    )

    # Machine-readable summary: the (possibly large) per-video list is
    # collapsed to a count.
    print("\n=== PIPELINE RESULTS (JSON) ===")
    summary = {k: v for k, v in results.items() if k != "videos"}
    summary["video_count"] = len(results.get("videos", []))
    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()
