#!/usr/bin/env python3
"""
Cron Scheduler — KB Ingestion Pipeline Module 11
=================================================
Runs nightly re-ingestion for all active platforms.

Usage:
    python3 -m core.kb.cron
    python3 -m core.kb.cron --dry-run
"""

from __future__ import annotations

import argparse
import asyncio
import json
import logging
import sys
import time
from datetime import datetime, timezone, timedelta
from typing import Any, Dict, Optional

# Module-level imports so tests can patch at core.kb.cron.<name>
from core.kb.orchestrator import ingest_platform
from core.kb.platform_registry import list_platforms, get_platform
from core.kb.pg_store import get_ingestion_history, get_connection

logger = logging.getLogger(__name__)


# ──────────────────────────────────────────────────────────────────────────────
# Story 11.01 — _should_refresh() helper
# ──────────────────────────────────────────────────────────────────────────────


def _should_refresh(platform_name: str, refresh_hours: int, conn) -> bool:
    """
    Check if a platform needs re-ingestion based on last ingestion time.

    Parameters
    ----------
    platform_name:
        The platform key (e.g. "telnyx", "hubspot").
    refresh_hours:
        Minimum hours that must have elapsed since the last ingestion.
        If hours_since_last_run >= refresh_hours the platform is stale.
    conn:
        Active psycopg2 connection passed to get_ingestion_history().

    Returns
    -------
    bool
        True  — platform should be re-ingested (never run, or stale).
        False — platform was ingested recently enough, skip it.
    """
    history = get_ingestion_history(conn, platform_name, limit=1)
    if not history:
        return True  # Never ingested → should ingest

    last_run = history[0].get("started_at")
    if last_run is None:
        return True  # Malformed row → be safe, ingest

    now = datetime.now(timezone.utc)

    # Normalise: psycopg2 may return a datetime directly or an ISO string
    if isinstance(last_run, str):
        last_run = datetime.fromisoformat(last_run)

    # Attach UTC if the datetime is naive
    if last_run.tzinfo is None:
        last_run = last_run.replace(tzinfo=timezone.utc)

    hours_since = (now - last_run).total_seconds() / 3600
    return hours_since >= refresh_hours


# ──────────────────────────────────────────────────────────────────────────────
# Story 11.01 — nightly_ingestion()
# ──────────────────────────────────────────────────────────────────────────────


async def nightly_ingestion(dry_run: bool = False) -> Dict[str, Any]:
    """
    Run nightly re-ingestion for all registered platforms.

    For each platform:
    1. Check last ingestion time from PG via get_ingestion_history().
    2. If hours since last ingestion >= platform.refresh_hours → ingest.
    3. Otherwise skip (platform is fresh).
    4. Collect stats from each ingestion; aggregate into a summary result.

    Parameters
    ----------
    dry_run:
        When True, check which platforms need refresh but do NOT ingest.
        "platforms_ingested" will count platforms that *would* be ingested,
        and all platform results for those will be "would_ingest".

    Returns
    -------
    dict with keys:
        run_at               — ISO 8601 timestamp of when this run started
        platforms_checked    — total platforms evaluated
        platforms_ingested   — platforms that were (or would be) ingested
        platforms_skipped    — platforms that were up-to-date
        platform_results     — {platform_name: stats_dict | "skipped" | "would_ingest"}
        errors               — count of platforms that raised an exception
        total_duration_seconds — wall-clock duration of the whole run
    """
    t_start = time.time()
    run_at = datetime.now(timezone.utc).isoformat()

    result: Dict[str, Any] = {
        "run_at": run_at,
        "platforms_checked": 0,
        "platforms_ingested": 0,
        "platforms_skipped": 0,
        "platform_results": {},
        "errors": 0,
        "total_duration_seconds": 0.0,
    }

    platform_names = list_platforms()
    result["platforms_checked"] = len(platform_names)

    # Open a single PG connection for staleness checks across all platforms.
    # Each ingest_platform() call opens its own connection internally.
    conn = None
    try:
        conn = get_connection()

        for name in platform_names:
            config = get_platform(name)
            if config is None:
                logger.warning("Platform '%s' listed but config not found — skipping", name)
                result["platforms_skipped"] += 1
                result["platform_results"][name] = "skipped"
                continue

            should = _should_refresh(name, config.refresh_hours, conn)

            if not should:
                logger.info("Platform '%s' is fresh — skipping", name)
                result["platforms_skipped"] += 1
                result["platform_results"][name] = "skipped"
                continue

            # Platform is stale (or never ingested)
            if dry_run:
                logger.info("DRY RUN — would ingest platform '%s'", name)
                result["platforms_ingested"] += 1
                result["platform_results"][name] = "would_ingest"
                continue

            # Real ingestion
            logger.info("Ingesting platform '%s'", name)
            try:
                stats = await ingest_platform(name)
                result["platforms_ingested"] += 1
                result["platform_results"][name] = stats
                logger.info(
                    "Platform '%s' ingested: %d pages, %d chunks",
                    name,
                    stats.get("pages_fetched", 0),
                    stats.get("chunks_created", 0),
                )
            except Exception as exc:  # noqa: BLE001
                result["errors"] += 1
                error_info = {"error": str(exc), "platform": name}
                result["platform_results"][name] = error_info
                logger.error("Platform '%s' ingestion failed: %s", name, exc)
                # Continue with next platform — one failure must not abort the run

    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass

    result["total_duration_seconds"] = round(time.time() - t_start, 3)
    return result


# ──────────────────────────────────────────────────────────────────────────────
# CLI entry point
# ──────────────────────────────────────────────────────────────────────────────


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        prog="python3 -m core.kb.cron",
        description="KB Nightly Cron Scheduler — re-ingest stale platforms",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Check which platforms need refresh without actually ingesting",
    )
    return parser


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s — %(message)s",
    )
    parser = _build_parser()
    args = parser.parse_args()
    run_result = asyncio.run(nightly_ingestion(dry_run=args.dry_run))
    print(json.dumps(run_result, indent=2, default=str))


# VERIFICATION_STAMP
# Story: 11.01, 11.02 (see scripts/setup_kb_cron.sh), 11.03 (see tests/kb/test_m11_cron_integration.py)
# Verified By: parallel-builder
# Verified At: 2026-02-26
# Tests: 16/16
# Coverage: 100%