#!/usr/bin/env python3
"""
Genesis Dashboard Curator
=========================
Reads latest agent progress files, extracts key metrics, and updates
dashboard HTML files with fresh data. Designed to run as cron or n8n trigger.

Usage:
    python /mnt/e/genesis-system/scripts/dashboard_curator.py
    python /mnt/e/genesis-system/scripts/dashboard_curator.py --dry-run
    python /mnt/e/genesis-system/scripts/dashboard_curator.py --redis-only

Cron setup (every 5 minutes):
    */5 * * * * /usr/bin/python3 /mnt/e/genesis-system/scripts/dashboard_curator.py >> /mnt/e/genesis-system/dashboard/curator.log 2>&1

Storage: Redis (Elestio) — NO SQLite.
"""

import argparse
import glob
import json
import logging
import os
import re
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional

# ── Paths ──────────────────────────────────────────────────────────────────────
GENESIS_ROOT = Path("/mnt/e/genesis-system")
PROGRESS_DIR = GENESIS_ROOT / "hive" / "progress"
SWARM_RESULTS_DIR = GENESIS_ROOT / "hive" / "swarm_results"
DASHBOARD_DIR = GENESIS_ROOT / "dashboard"
GOLD_DASHBOARD = DASHBOARD_DIR / "GOLD_MASTER_DASHBOARD.html"
SWARM_METRICS = GENESIS_ROOT / "hive" / "SWARM_METRICS_REPORT.md"
OBSERVABILITY_EVENTS = GENESIS_ROOT / "data" / "observability" / "events.jsonl"

# ── Elestio config path ────────────────────────────────────────────────────────
ELESTIO_CONFIG = GENESIS_ROOT / "data" / "genesis-memory" / "elestio_config.py"

# ── Logging ────────────────────────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [CURATOR] %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
log = logging.getLogger("curator")


# ══════════════════════════════════════════════════════════════════════════════
# METRIC EXTRACTION
# ══════════════════════════════════════════════════════════════════════════════

def extract_session_metrics() -> dict:
    """
    Scan hive/progress/ for session_*_handoff.md files and pull headline
    numbers from the one with the highest session number.

    Returns a dict with: session_number, session_cost_usd, compaction_events,
    context_used_pct, session_id, last_handoff_time. Any field that cannot
    be found keeps its zero/"unknown" default.
    """
    metrics = {
        "session_number": 0,
        "session_cost_usd": 0.0,
        "compaction_events": 0,
        "context_used_pct": 0,
        "session_id": "unknown",
        "last_handoff_time": None,
    }

    candidates = glob.glob(str(PROGRESS_DIR / "session_*_handoff.md"))
    if not candidates:
        log.warning("No handoff files found in %s", PROGRESS_DIR)
        return metrics

    # Pick the file with the largest numeric session id (filenames sort
    # lexicographically, so the number embedded in the name is what counts).
    latest_num, latest_file = 0, None
    for path in candidates:
        match = re.search(r"session_(\d+)_handoff", path)
        if match and int(match.group(1)) > latest_num:
            latest_num, latest_file = int(match.group(1)), path

    if latest_file is None:
        return metrics

    metrics["session_number"] = latest_num
    log.info("Reading latest handoff: session_%d_handoff.md", latest_num)

    try:
        content = Path(latest_file).read_text(encoding="utf-8")

        # Each field lives on a "**Label**: value" markdown line; a missing
        # line simply leaves the default in place.
        extractors = (
            ("session_cost_usd", r"\*\*Session Cost\*\*:\s*\$?([\d.]+)", float),
            ("compaction_events", r"\*\*Compaction Events\*\*:\s*(\d+)", int),
            ("context_used_pct", r"\*\*Context Usage\*\*:\s*([\d.]+)%", float),
            ("session_id", r"\*\*Session ID\*\*:\s*([a-f0-9-]+)", str),
            ("last_handoff_time", r"\*\*Generated\*\*:\s*([\d\-T:\.+]+)", str),
        )
        for key, pattern, cast in extractors:
            found = re.search(pattern, content)
            if found:
                metrics[key] = cast(found.group(1))

    except Exception as e:
        log.error("Failed to parse handoff file %s: %s", latest_file, e)

    return metrics


def extract_swarm_metrics() -> dict:
    """
    Parse hive/SWARM_METRICS_REPORT.md to get swarm execution stats.
    Falls back to JSONL line counts in hive/swarm_results/ if the report
    is missing.

    Returns a dict with story totals/success/failure counts, success rate,
    token and cost totals, last-run timestamp, and per-team story counts.
    Fields that cannot be parsed keep their zero/None defaults.
    """
    metrics = {
        "swarm_stories_total": 0,
        "swarm_stories_success": 0,
        "swarm_stories_failed": 0,
        "swarm_success_rate_pct": 0.0,
        "swarm_tokens_total": 0,
        "swarm_cost_usd": 0.0,
        "swarm_last_run": None,
        "minimax_stories": 0,
        "kimi_stories": 0,
    }

    if not SWARM_METRICS.exists():
        log.warning("SWARM_METRICS_REPORT.md not found, falling back to JSONL counts")
        # One JSONL line ≈ one story result; team attribution is by filename.
        for jf in SWARM_RESULTS_DIR.glob("*.jsonl"):
            # `with` ensures the handle is closed promptly (the previous
            # bare open() leaked it until garbage collection).
            with jf.open(encoding="utf-8", errors="ignore") as fh:
                lines = sum(1 for _ in fh)
            if "minimax" in jf.name:
                metrics["minimax_stories"] += lines
            elif "kimi" in jf.name:
                metrics["kimi_stories"] += lines
        metrics["swarm_stories_total"] = metrics["minimax_stories"] + metrics["kimi_stories"]
        return metrics

    try:
        content = SWARM_METRICS.read_text(encoding="utf-8")

        # The report is a markdown table: "**Label** | value" cells.
        # Total stories
        total_m = re.search(r"\*\*Total Stories Executed\*\*\s*\|\s*(\d+)", content)
        if total_m:
            metrics["swarm_stories_total"] = int(total_m.group(1))

        # Successes
        succ_m = re.search(r"\*\*Successful Executions\*\*\s*\|\s*(\d+)", content)
        if succ_m:
            metrics["swarm_stories_success"] = int(succ_m.group(1))

        # Failures
        fail_m = re.search(r"\*\*Failed Executions\*\*\s*\|\s*(\d+)", content)
        if fail_m:
            metrics["swarm_stories_failed"] = int(fail_m.group(1))

        # Tokens (thousands separators stripped before int conversion)
        tok_m = re.search(r"\*\*Total Tokens Used\*\*\s*\|\s*([\d,]+)", content)
        if tok_m:
            metrics["swarm_tokens_total"] = int(tok_m.group(1).replace(",", ""))

        # Cost
        cost_m = re.search(r"\*\*Total Cost\*\*\s*\|\s*\$([\d.]+)", content)
        if cost_m:
            metrics["swarm_cost_usd"] = float(cost_m.group(1))

        # Success rate — derived, not parsed, so it always matches the counts.
        if metrics["swarm_stories_total"] > 0:
            metrics["swarm_success_rate_pct"] = round(
                metrics["swarm_stories_success"] / metrics["swarm_stories_total"] * 100, 1
            )

        # Per-team story counts live in separate "Team ..." sections, hence
        # DOTALL so the pattern can span lines.
        mini_m = re.search(r"Team MiniMax.*?\*\*Total Stories\*\*\s*\|\s*(\d+)", content, re.DOTALL)
        if mini_m:
            metrics["minimax_stories"] = int(mini_m.group(1))

        kimi_m = re.search(r"Team Kimi.*?\*\*Total Stories\*\*\s*\|\s*(\d+)", content, re.DOTALL)
        if kimi_m:
            metrics["kimi_stories"] = int(kimi_m.group(1))

        # Report generation timestamp
        gen_m = re.search(r"\*\*Generated:\*\*\s*([\d\-T:\.]+)", content)
        if gen_m:
            metrics["swarm_last_run"] = gen_m.group(1)

    except Exception as e:
        log.error("Failed to parse SWARM_METRICS_REPORT.md: %s", e)

    return metrics


def extract_agent_activity() -> dict:
    """
    Read the three most recent session handoff files to determine active
    agent count, spawn/stop totals, and recent tool activity.

    Returns a dict with: active_agents, total_spawns, total_stops, and
    recent_tools (up to the last 10 tool names from the newest handoff
    that lists any).
    """
    activity = {
        "active_agents": 0,
        "total_spawns": 0,
        "total_stops": 0,
        "recent_tools": [],
    }

    def _session_num(path: Path) -> int:
        # Sort numerically: plain lexicographic order puts session_99 after
        # session_100, which would select stale handoffs as "latest".
        m = re.search(r"session_(\d+)_handoff", path.name)
        return int(m.group(1)) if m else -1

    latest_handoffs = sorted(
        PROGRESS_DIR.glob("session_*_handoff.md"), key=_session_num
    )[-3:]

    # Walk newest-first; stop at the first handoff that lists tool activity.
    for hf in reversed(latest_handoffs):
        try:
            content = hf.read_text(encoding="utf-8")
            active_m = re.search(r"Active agents:\s*(\d+)", content)
            if active_m:
                activity["active_agents"] = int(active_m.group(1))

            spawns_m = re.search(r"Total spawns this session:\s*(\d+)", content)
            if spawns_m:
                activity["total_spawns"] = int(spawns_m.group(1))

            stops_m = re.search(r"Total stops this session:\s*(\d+)", content)
            if stops_m:
                activity["total_stops"] = int(stops_m.group(1))

            # Extract recent tool list — bullet lines like "- ToolName @ ...".
            tools = re.findall(r"^- (\w+) @", content, re.MULTILINE)
            if tools:
                activity["recent_tools"] = tools[-10:]
                break
        except Exception:
            # Unreadable handoff: fall through to the next-newest file.
            continue

    return activity


def extract_knowledge_graph_stats() -> dict:
    """
    Count JSONL lines under KNOWLEDGE_GRAPH/entities and KNOWLEDGE_GRAPH/axioms,
    and track the highest alpha-evolve cycle number seen in axiom filenames.

    Returns: {"entities": int, "axioms": int, "evolve_cycles": int}.
    """
    stats = {"entities": 0, "axioms": 0, "evolve_cycles": 0}
    kg_root = GENESIS_ROOT / "KNOWLEDGE_GRAPH"

    for jf in (kg_root / "entities").glob("*.jsonl"):
        try:
            # `with` closes the handle promptly (the previous bare open()
            # leaked it until garbage collection).
            with jf.open(encoding="utf-8", errors="ignore") as fh:
                stats["entities"] += sum(1 for _ in fh)
        except Exception:
            pass

    for jf in (kg_root / "axioms").glob("*.jsonl"):
        try:
            with jf.open(encoding="utf-8", errors="ignore") as fh:
                stats["axioms"] += sum(1 for _ in fh)
            # Filenames containing "alpha_evolve_cycle" carry a cycle number
            # like "cycle_7"; keep the maximum seen.
            if "alpha_evolve_cycle" in jf.name:
                m = re.search(r"cycle_(\d+)", jf.name)
                if m:
                    stats["evolve_cycles"] = max(stats["evolve_cycles"], int(m.group(1)))
        except Exception:
            pass

    return stats


def compute_burn_rate() -> dict:
    """
    Report the current monthly burn rate, parsed from data/MONTHLY_COSTS.md
    when available, otherwise hardcoded Feb 2026 values.

    Returns a dict with monthly_usd, monthly_aud, break_even_customers, and
    a "note" describing the source.
    """
    burn = {
        "monthly_usd": 1500,
        "monthly_aud": 2400,
        "break_even_customers": 11,
        "note": "Source: data/MONTHLY_COSTS.md (Feb 2026)"
    }
    costs_file = GENESIS_ROOT / "data" / "MONTHLY_COSTS.md"

    if not costs_file.exists():
        log.warning("MONTHLY_COSTS.md not found, using hardcoded Feb 2026 values")
        return burn

    try:
        text = costs_file.read_text(encoding="utf-8")
        # Only the USD total is parsed; the AUD figure and break-even count
        # stay at their defaults.
        total_match = re.search(r"TOTAL.*?\$([\d,]+)", text)
        if total_match is not None:
            burn["monthly_usd"] = int(total_match.group(1).replace(",", ""))
    except Exception as exc:
        log.warning("Could not parse MONTHLY_COSTS.md: %s", exc)

    return burn


# ══════════════════════════════════════════════════════════════════════════════
# AGGREGATE + ENRICH
# ══════════════════════════════════════════════════════════════════════════════

def build_full_metrics() -> dict:
    """
    Aggregate all metric sources into a single flat payload.

    Pulls session, swarm, agent, knowledge-graph, and burn-rate metrics,
    then adds capture timestamps and derived values (cost per story,
    utilisation percentage).
    """
    log.info("Extracting session metrics...")
    session = extract_session_metrics()

    log.info("Extracting swarm metrics...")
    swarm = extract_swarm_metrics()

    log.info("Extracting agent activity...")
    agents = extract_agent_activity()

    log.info("Extracting knowledge graph stats...")
    kg = extract_knowledge_graph_stats()

    log.info("Computing burn rate...")
    burn = compute_burn_rate()

    now_utc = datetime.now(timezone.utc).isoformat()
    # AEST is a fixed UTC+10 offset (the label says AEST, not AEDT, so no
    # DST shift is applied). Previously this stamped the raw UTC time with
    # an "AEST" suffix, which was off by ten hours.
    aest = timezone(timedelta(hours=10), "AEST")
    now_aest = datetime.now(aest).strftime("%Y-%m-%d %H:%M AEST")

    metrics = {
        # Meta
        "curator_version": "1.0.0",
        "captured_at_utc": now_utc,
        "captured_at_aest": now_aest,

        # Session
        "session_number": session["session_number"],
        "session_cost_usd": session["session_cost_usd"],
        "compaction_events": session["compaction_events"],
        "context_used_pct": session["context_used_pct"],
        "session_id": session["session_id"],
        "last_handoff_time": session["last_handoff_time"],

        # Swarm
        "swarm_stories_total": swarm["swarm_stories_total"],
        "swarm_stories_success": swarm["swarm_stories_success"],
        "swarm_stories_failed": swarm["swarm_stories_failed"],
        "swarm_success_rate_pct": swarm["swarm_success_rate_pct"],
        "swarm_tokens_total": swarm["swarm_tokens_total"],
        "swarm_cost_usd": swarm["swarm_cost_usd"],
        "swarm_last_run": swarm["swarm_last_run"],
        "minimax_stories": swarm["minimax_stories"],
        "kimi_stories": swarm["kimi_stories"],

        # Agents
        "active_agents": agents["active_agents"],
        "total_spawns": agents["total_spawns"],
        "total_stops": agents["total_stops"],
        "recent_tools": agents["recent_tools"],

        # Knowledge Graph
        "kg_entities": kg["entities"],
        "kg_axioms": kg["axioms"],
        "kg_evolve_cycle": kg["evolve_cycles"],

        # Burn Rate
        "monthly_burn_usd": burn["monthly_usd"],
        "monthly_burn_aud": burn["monthly_aud"],
        "break_even_customers": burn["break_even_customers"],

        # Derived — max(..., 1) guards against division by zero.
        "cost_per_story": round(swarm["swarm_cost_usd"] / max(swarm["swarm_stories_total"], 1), 4),
        "utilisation_pct": round(
            # Using 4,650 daily capacity from compute audit; 4.0 is the
            # fallback floor when no spawns were recorded.
            (agents["total_spawns"] / 4650 * 100) if agents["total_spawns"] > 0 else 4.0, 1
        ),
    }

    return metrics


# ══════════════════════════════════════════════════════════════════════════════
# REDIS WRITER
# ══════════════════════════════════════════════════════════════════════════════

def write_to_redis(metrics: dict, dry_run: bool = False) -> bool:
    """
    Write current metrics to Redis (Elestio).

    Keys:
      genesis:dashboard:live    — flat hash of scalar metrics, 1h TTL
      genesis:dashboard:history — sorted set of full JSON snapshots,
                                  scored by UTC timestamp, capped at 288
                                  entries (24h of 5-minute runs)

    Returns True on success (or dry run); False if the config module or
    redis client is unavailable, or the write fails.
    """
    if dry_run:
        log.info("[DRY RUN] Would write to Redis: %s", json.dumps(metrics, indent=2))
        return True

    try:
        # Make elestio_config importable; guard the insert so repeated calls
        # in one process don't keep prepending duplicate sys.path entries.
        config_dir = str(GENESIS_ROOT / "data" / "genesis-memory")
        if config_dir not in sys.path:
            sys.path.insert(0, config_dir)
        from elestio_config import RedisConfig  # type: ignore

        import redis as redis_lib

        config = RedisConfig.get_connection_params()
        r = redis_lib.Redis(**config, decode_responses=True)
        r.ping()

        # Write current state as hash (fast lookup for dashboard).
        # Lists/dicts are skipped — hash fields must be flat strings.
        flat_metrics = {k: str(v) for k, v in metrics.items() if not isinstance(v, (list, dict))}
        r.hset("genesis:dashboard:live", mapping=flat_metrics)
        r.expire("genesis:dashboard:live", 3600)  # 1hr TTL safety

        # Write full JSON as sorted set member (history)
        score = datetime.now(timezone.utc).timestamp()
        r.zadd(
            "genesis:dashboard:history",
            {json.dumps(metrics): score}
        )
        # Keep only last 288 entries (5min interval × 24h)
        r.zremrangebyrank("genesis:dashboard:history", 0, -289)

        log.info("Redis write SUCCESS — session=%d cost=$%.2f stories=%d",
                 metrics["session_number"],
                 metrics["session_cost_usd"],
                 metrics["swarm_stories_total"])
        return True

    except ImportError as e:
        # Either elestio_config or the redis client may be the missing
        # module; log which one instead of always blaming redis.
        log.warning("Redis write skipped — missing dependency: %s (try: pip install redis)", e)
        return False
    except Exception as e:
        log.error("Redis write FAILED: %s", e)
        return False


# ══════════════════════════════════════════════════════════════════════════════
# DASHBOARD HTML PATCHER
# ══════════════════════════════════════════════════════════════════════════════

def patch_gold_dashboard(metrics: dict, dry_run: bool = False) -> bool:
    """
    Update key data values in GOLD_MASTER_DASHBOARD.html using targeted
    string replacement on known data markers.

    Every pattern must stay matchable after its own replacement so the
    curator is idempotent across repeated cron runs.

    Returns True on success (including no-op and dry run); False when the
    dashboard file is missing.
    """
    if not GOLD_DASHBOARD.exists():
        log.error("GOLD_MASTER_DASHBOARD.html not found at %s", GOLD_DASHBOARD)
        return False

    content = GOLD_DASHBOARD.read_text(encoding="utf-8")
    original = content

    session_num = metrics["session_number"]
    session_cost = metrics["session_cost_usd"]
    compact = metrics["compaction_events"]
    ctx_pct = int(metrics["context_used_pct"])
    kg_total = metrics["kg_entities"] + metrics["kg_axioms"]
    aest_time = metrics["captured_at_aest"]

    # Patch: Session number in header cost display
    # Pattern: System Cost (Session XX)
    content = re.sub(
        r"System Cost \(Session \d+\)",
        f"System Cost (Session {session_num})",
        content
    )

    # Patch: Session cost dollar value
    # Pattern: <div class="schedule-value" style="color:var(--yellow);">$XX.XX</div>
    content = re.sub(
        r'(<div class="schedule-value" style="color:var\(--yellow\);">\$)[\d.]+(<)',
        f'\\g<1>{session_cost:.2f}\\2',
        content
    )

    # Patch: Compaction events count (in subtitle of schedule bar)
    content = re.sub(
        r'(\d+) compaction events',
        f'{compact} compaction events',
        content
    )

    # Patch: Session Investment util card (whole dollars)
    content = re.sub(
        r'(<div class="util-label">Session Investment</div>\s*<div class="util-value">)\$[\d.]+(<)',
        f'\\g<1>${int(session_cost)}\\2',
        content
    )

    # Patch: Session Investment sub text. The first run rewrites the
    # original " at flush point" wording to " peak"; accept both suffixes
    # so later runs can still find and refresh the value (the old pattern
    # only matched " at flush point" and went stale after one patch).
    content = re.sub(
        r'Session \d+ &bull; \$[\d.]+ (?:at flush point|peak)',
        f'Session {session_num} &bull; ${session_cost:.2f} peak',
        content
    )

    # Patch: Compaction events util value
    content = re.sub(
        r'(<div class="util-label">Compaction Events</div>\s*<div class="util-value">)\d+(<)',
        f'\\g<1>{compact}\\2',
        content
    )

    # Patch: Compaction events util sub
    content = re.sub(
        r'(Session \d+ total &bull; Triggered at ~78K)',
        f'Session {session_num} total &bull; Triggered at ~78K',
        content
    )

    # Patch: Context utilisation value
    content = re.sub(
        r'(<div class="util-label">Context Utilisation</div>\s*<div class="util-value">)[\d]+%(<)',
        f'\\g<1>{ctx_pct}%\\2',
        content
    )

    # Patch: Knowledge Graph count. NOTE(review): [\d+]+ is a character
    # class matching digits AND plus signs — presumably intentional so the
    # "{N}+" form written below still matches on the next run.
    content = re.sub(
        r'(<div class="util-label">Knowledge Graph</div>\s*<div class="util-value">)[\d+]+(<)',
        f'\\g<1>{kg_total}+\\2',
        content
    )

    # Patch: KG sub text with evolve cycle
    content = re.sub(
        r'(Entities &bull; Axioms &bull; Evolve cycles) [\d-]+',
        f"Entities &bull; Axioms &bull; Evolve cycle {metrics['kg_evolve_cycle']}",
        content
    )

    # Patch: Footer timestamp. Consume any previous " | Curated ..." suffix
    # so repeated runs replace it — the old pattern matched only the
    # "Built YYYY-MM-DD" prefix and appended a fresh "| Curated" fragment
    # on every run. aest_time never contains '<' or '|'.
    content = re.sub(
        r'Built \d{4}-\d{2}-\d{2}(?: \| Curated [^<|]*)?',
        f'Built 2026-02-20 | Curated {aest_time}',
        content
    )

    if content == original:
        log.info("GOLD_MASTER_DASHBOARD — no changes needed (data already current)")
        return True

    if dry_run:
        # Rough change estimate: per-position char diff over the common prefix length.
        log.info("[DRY RUN] Would patch GOLD_MASTER_DASHBOARD.html with %d char changes",
                 sum(a != b for a, b in zip(content, original)))
        return True

    GOLD_DASHBOARD.write_text(content, encoding="utf-8")
    log.info("GOLD_MASTER_DASHBOARD.html patched — session=%d cost=$%.2f compaction=%d",
             session_num, session_cost, compact)
    return True


# ══════════════════════════════════════════════════════════════════════════════
# JSON METRICS FILE (for browser fetch)
# ══════════════════════════════════════════════════════════════════════════════

def write_metrics_json(metrics: dict, dry_run: bool = False) -> bool:
    """
    Dump the metrics payload to dashboard/genesis-metrics.json so dashboard
    JavaScript can fetch fresh data without a running server — the
    lowest-overhead update path.

    Returns True on success or dry run, False if the write fails.
    """
    target = DASHBOARD_DIR / "genesis-metrics.json"

    if dry_run:
        log.info("[DRY RUN] Would write: %s", target)
        return True

    try:
        target.write_text(json.dumps(metrics, indent=2), encoding="utf-8")
    except Exception as exc:
        log.error("Failed to write metrics JSON: %s", exc)
        return False

    log.info("Metrics JSON written: %s", target)
    return True


# ══════════════════════════════════════════════════════════════════════════════
# STATUS REPORT
# ══════════════════════════════════════════════════════════════════════════════

def format_status_report(metrics: dict) -> str:
    """
    Render the metrics payload as a human-readable, fixed-layout status
    report. Extra keys in *metrics* are ignored; the keys referenced in the
    template must be present.
    """
    template = """\
=== GENESIS DASHBOARD CURATOR STATUS ===
Generated: {captured_at_aest}

SESSION
  Number:          {session_number}
  Cost:            ${session_cost_usd:.2f} USD
  Compactions:     {compaction_events}
  Context used:    {context_used_pct}%

SWARM EXECUTION
  Total stories:   {swarm_stories_total}
  Successful:      {swarm_stories_success} ({swarm_success_rate_pct}%)
  Failed:          {swarm_stories_failed}
  Tokens used:     {swarm_tokens_total:,}
  Swarm cost:      ${swarm_cost_usd:.4f} USD
  Cost/story:      ${cost_per_story:.4f} USD
  MiniMax stories: {minimax_stories}
  Kimi stories:    {kimi_stories}

AGENTS
  Active:          {active_agents}
  Total spawns:    {total_spawns}
  Utilisation:     {utilisation_pct}% of 4,650 capacity/day

KNOWLEDGE GRAPH
  Entities:        {kg_entities}
  Axioms:          {kg_axioms}
  Evolve cycle:    {kg_evolve_cycle}

BURN RATE
  Monthly:         ${monthly_burn_usd:,} USD / ~${monthly_burn_aud:,} AUD
  Break-even:      {break_even_customers} customers
=========================================
"""
    # str.format with the payload keys produces the same bytes as the
    # original inline f-string; strip drops the trailing newline.
    return template.format(**metrics).strip()


# ══════════════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════════════

def main():
    """CLI entry point: gather metrics, then fan out to the enabled sinks."""
    parser = argparse.ArgumentParser(description="Genesis Dashboard Curator")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print what would happen, write nothing")
    parser.add_argument("--redis-only", action="store_true",
                        help="Only write to Redis, skip HTML patching")
    parser.add_argument("--json-only", action="store_true",
                        help="Only write genesis-metrics.json, skip Redis and HTML")
    parser.add_argument("--report", action="store_true",
                        help="Print status report to stdout and exit")
    args = parser.parse_args()

    log.info("Genesis Dashboard Curator v1.0.0 starting...")

    metrics = build_full_metrics()

    # --report short-circuits every sink.
    if args.report:
        print(format_status_report(metrics))
        return 0

    log.info("Metrics extracted: session=%d cost=$%.2f stories=%d",
             metrics["session_number"],
             metrics["session_cost_usd"],
             metrics["swarm_stories_total"])

    outcomes = []

    # JSON sink: always on unless --redis-only (zero-dependency browser fetch).
    if not args.redis_only:
        outcomes.append(("JSON metrics",
                         write_metrics_json(metrics, dry_run=args.dry_run)))

    # HTML sink: suppressed by either exclusive flag.
    if not (args.redis_only or args.json_only):
        outcomes.append(("GOLD_MASTER_DASHBOARD patch",
                         patch_gold_dashboard(metrics, dry_run=args.dry_run)))

    # Redis sink: best-effort, suppressed by --json-only.
    if not args.json_only:
        outcomes.append(("Redis write",
                         write_to_redis(metrics, dry_run=args.dry_run)))

    succeeded = sum(ok for _, ok in outcomes)
    log.info("Curator complete: %d/%d tasks succeeded", succeeded, len(outcomes))
    for name, ok in outcomes:
        log.info("  [%s] %s", "OK" if ok else "FAILED", name)

    if args.dry_run:
        print("\n" + format_status_report(metrics))

    # Non-zero exit when any sink failed, so cron/n8n can alert on it.
    return 0 if succeeded == len(outcomes) else 1


if __name__ == "__main__":
    sys.exit(main())
