#!/usr/bin/env python3
"""
SPRINT WATCHDOG
===============
Monitors the Queen Elevation Sprint and ensures continuity.
Auto-recovers from failures, resumes from checkpoints.

Run: python3 sprint_watchdog.py
"""

import json
import os
import signal
import subprocess
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

# Filesystem layout for the sprint run (everything lives under BASE_DIR).
BASE_DIR = Path("/mnt/e/genesis-system/AIVA")
CHECKPOINT_DIR = BASE_DIR / "sprint-checkpoints"  # JSON checkpoint dumps written by the sprint
STATUS_LOG = BASE_DIR / "sprint_status.log"       # sprint stdout/stderr (appended on restart)
TOKEN_LOG = BASE_DIR / "token_usage.jsonl"        # JSONL records with "timestamp" / "cumulative_cost"
WATCHDOG_LOG = BASE_DIR / "watchdog.log"          # this watchdog's own log
PID_FILE = BASE_DIR / "sprint.pid"                # PID of the sprint process we launched
EMERGENCY_STOP_FILE = BASE_DIR / "EMERGENCY_STOP" # touch this file to make the watchdog exit

# Configuration
CHECK_INTERVAL = 300  # seconds between watchdog checks (5 minutes)
MAX_STALL_TIME = 1800  # seconds without observable progress before we call the sprint stalled (30 min)
BUDGET_LIMIT = 10.00  # dollars; logged at startup only — EMERGENCY_THRESHOLD is the enforced cutoff
EMERGENCY_THRESHOLD = 9.50  # dollars; at/above this cumulative spend the watchdog stops everything


def log(message: str):
    """Write a timestamped watchdog message to stdout and the watchdog log."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{stamp}] WATCHDOG: {message}"
    print(line)
    with open(WATCHDOG_LOG, "a") as handle:
        handle.write(f"{line}\n")


def get_sprint_pid() -> Optional[int]:
    """Return the PID of the running sprint process, or None.

    Reads the PID file, then probes the process with signal 0 — a
    no-op signal that only performs existence/permission checks.
    Returns None when the PID file is missing, unparsable, or the
    process is no longer running.

    Fix: the original annotated the return type as ``int`` but falls
    through to ``return None`` on every failure path.
    """
    if PID_FILE.exists():
        try:
            pid = int(PID_FILE.read_text().strip())
            os.kill(pid, 0)  # Signal 0 doesn't kill, just checks
            return pid
        except (ValueError, ProcessLookupError, PermissionError):
            # Stale or garbled PID file, or the process has exited.
            pass
    return None


def is_sprint_running() -> bool:
    """Report whether the sprint process appears to be alive."""
    # Fast path: a live PID recorded in the PID file settles it.
    if get_sprint_pid():
        return True

    # Fallback: search the process table for the script by name.
    try:
        probe = subprocess.run(
            ["pgrep", "-f", "queen_elevation_sprint.py"],
            capture_output=True, text=True
        )
    except Exception:
        return False
    return bool(probe.stdout.strip())


def get_last_activity_time() -> Optional[datetime]:
    """Return the timestamp of the most recent observable sprint activity.

    Takes the maximum of: the latest "timestamp" field in the token
    log, the status log's mtime, and the mtime of any checkpoint file.
    Returns None when none of those sources exist.

    Fix: the original annotated the return type as ``datetime`` even
    though it returns None when no activity source is available.
    """
    latest = None

    # Token log: every entry may carry an ISO-format "timestamp".
    if TOKEN_LOG.exists():
        try:
            with open(TOKEN_LOG, "r") as f:
                for line in f:
                    try:
                        entry = json.loads(line.strip())
                        ts = datetime.fromisoformat(entry.get("timestamp", ""))
                        if latest is None or ts > latest:
                            latest = ts
                    except (json.JSONDecodeError, ValueError):
                        # Skip malformed lines / missing timestamps.
                        continue
        except Exception:
            # Best effort: an unreadable log just contributes nothing.
            pass

    # Status log: the sprint appends its stdout here, so mtime is a proxy.
    if STATUS_LOG.exists():
        mtime = datetime.fromtimestamp(STATUS_LOG.stat().st_mtime)
        if latest is None or mtime > latest:
            latest = mtime

    # Checkpoint files: each write marks forward progress.
    if CHECKPOINT_DIR.exists():
        for cp in CHECKPOINT_DIR.glob("*.json"):
            mtime = datetime.fromtimestamp(cp.stat().st_mtime)
            if latest is None or mtime > latest:
                latest = mtime

    return latest


def get_current_spend() -> float:
    """Return the latest cumulative cost recorded in the token log.

    Scans the JSONL token log and keeps the last "cumulative_cost"
    value seen; returns 0.0 when the log is absent or unreadable.
    """
    spend = 0.0

    if TOKEN_LOG.exists():
        try:
            with open(TOKEN_LOG, "r") as fh:
                for raw in fh:
                    try:
                        record = json.loads(raw.strip())
                    except json.JSONDecodeError:
                        continue  # skip malformed lines
                    spend = record.get("cumulative_cost", spend)
        except Exception:
            pass  # best effort: unreadable log reports 0.0

    return spend


def get_last_checkpoint() -> Optional[dict]:
    """Return the most recently modified parseable checkpoint, or None.

    The returned dict gains a "_checkpoint_file" key holding the file
    path it was loaded from.

    Fixes: the original annotated the return type as ``dict`` despite
    returning None, and it advanced ``latest_time`` even when a
    checkpoint file failed to parse — so a corrupt newest file could
    mask an older but valid checkpoint.
    """
    if not CHECKPOINT_DIR.exists():
        return None

    latest_cp = None
    latest_time = None

    for cp_file in CHECKPOINT_DIR.glob("*.json"):
        mtime = cp_file.stat().st_mtime
        if latest_time is not None and mtime <= latest_time:
            continue  # not newer than the best parseable checkpoint so far
        try:
            with open(cp_file, "r") as f:
                data = json.load(f)
        except Exception:
            # Corrupt/unreadable checkpoint: skip WITHOUT advancing
            # latest_time, so an older valid checkpoint can still win.
            continue
        data["_checkpoint_file"] = str(cp_file)
        latest_cp = data
        latest_time = mtime

    return latest_cp


def restart_sprint(from_checkpoint: bool = False):
    """Restart the sprint, optionally from checkpoint.

    Sends SIGTERM to any existing sprint process, then launches a
    fresh one in the background with output appended to STATUS_LOG,
    and records its PID in PID_FILE.

    Fix: the original leaked the STATUS_LOG file handle passed to
    Popen on every restart; it is now closed once the child has
    inherited the descriptor.
    """
    log("Initiating sprint restart...")

    # Kill any existing process (best effort — it may already be gone).
    pid = get_sprint_pid()
    if pid:
        try:
            os.kill(pid, signal.SIGTERM)
            time.sleep(2)  # give it a moment to shut down
        except ProcessLookupError:
            pass

    cmd = ["python3", str(BASE_DIR / "queen_elevation_sprint.py")]

    if from_checkpoint:
        checkpoint = get_last_checkpoint()
        if checkpoint:
            log(f"Resuming from checkpoint: {checkpoint.get('_checkpoint_file')}")
            # The sprint executor will auto-detect checkpoints

    # Start in background. Popen duplicates the fd into the child, so
    # our copy of the log handle can (and must) be closed afterwards.
    status_out = open(STATUS_LOG, "a")
    try:
        process = subprocess.Popen(
            cmd,
            stdout=status_out,
            stderr=subprocess.STDOUT,
            cwd=str(BASE_DIR)
        )
    finally:
        status_out.close()

    # Save PID
    PID_FILE.write_text(str(process.pid))
    log(f"Sprint started with PID: {process.pid}")


def should_stop() -> bool:
    """Decide whether the watchdog should stop the sprint.

    True when the emergency-stop file exists or cumulative spend has
    reached EMERGENCY_THRESHOLD; False otherwise.
    """
    # Operator kill switch: presence of the file is the signal.
    if EMERGENCY_STOP_FILE.exists():
        log("Emergency stop file detected")
        return True

    # Budget guard.
    spend = get_current_spend()
    if spend >= EMERGENCY_THRESHOLD:
        log(f"Budget threshold reached: ${spend:.2f} >= ${EMERGENCY_THRESHOLD:.2f}")
        return True

    return False


def main():
    """Main watchdog loop.

    Every CHECK_INTERVAL seconds: honor emergency stops, restart the
    sprint if it has died or stalled (up to max_restarts attempts),
    and decay the restart counter while recent activity is visible.
    Exits on emergency stop, sprint completion, restart exhaustion,
    or Ctrl-C.
    """
    log("="*60)
    log("SPRINT WATCHDOG INITIALIZED")
    log(f"Check interval: {CHECK_INTERVAL}s")
    log(f"Max stall time: {MAX_STALL_TIME}s")
    log(f"Budget limit: ${BUDGET_LIMIT}")
    log("="*60)

    restart_count = 0
    max_restarts = 5

    while True:
        try:
            # Check for emergency stop
            if should_stop():
                log("Emergency stop triggered - exiting watchdog")
                break

            # Check if sprint is running
            if not is_sprint_running():
                log("Sprint not running!")

                # Check if we have a checkpoint to resume from
                checkpoint = get_last_checkpoint()
                if checkpoint:
                    completed_phases = checkpoint.get("checkpoints_completed", [])
                    # NOTE(review): phase 5 is treated as the final phase —
                    # confirm against the sprint executor's phase numbering.
                    if 5 in completed_phases:
                        log("Sprint completed successfully (all phases done)")
                        break

                    if restart_count < max_restarts:
                        log(f"Restarting from checkpoint (attempt {restart_count + 1}/{max_restarts})")
                        restart_sprint(from_checkpoint=True)
                        restart_count += 1
                    else:
                        log(f"Max restarts ({max_restarts}) exceeded - stopping watchdog")
                        break
                else:
                    # No checkpoint at all: start from scratch.
                    if restart_count < max_restarts:
                        log(f"Starting fresh sprint (attempt {restart_count + 1}/{max_restarts})")
                        restart_sprint(from_checkpoint=False)
                        restart_count += 1
                    else:
                        log(f"Max restarts ({max_restarts}) exceeded - stopping watchdog")
                        break

            else:
                # Sprint is running - check for stall (no log/checkpoint
                # activity for longer than MAX_STALL_TIME).
                last_activity = get_last_activity_time()
                if last_activity:
                    stall_duration = (datetime.now() - last_activity).total_seconds()
                    if stall_duration > MAX_STALL_TIME:
                        log(f"Sprint stalled for {stall_duration/60:.1f} minutes")
                        if restart_count < max_restarts:
                            log("Forcing restart from checkpoint...")
                            restart_sprint(from_checkpoint=True)
                            restart_count += 1

                # Log current status
                spend = get_current_spend()
                log(f"Sprint active | Spend: ${spend:.2f} | Restarts: {restart_count}")

            # Reset restart counter on successful activity: activity seen
            # within the last check interval slowly earns back an attempt.
            if is_sprint_running():
                last_activity = get_last_activity_time()
                if last_activity:
                    time_since = (datetime.now() - last_activity).total_seconds()
                    if time_since < CHECK_INTERVAL:
                        restart_count = max(0, restart_count - 1)

            time.sleep(CHECK_INTERVAL)

        except KeyboardInterrupt:
            log("Watchdog interrupted by user")
            break
        except Exception as e:
            # Top-level boundary: log and keep the watchdog alive.
            log(f"Watchdog error: {e}")
            time.sleep(60)  # Wait a bit before retrying

    log("Watchdog exiting")


# Run the watchdog loop only when executed as a script.
if __name__ == "__main__":
    main()
