#!/usr/bin/env python3
"""
GCCWatchdog — Heartbeat Monitor and Auto-Respawn for All 5 Command Centres
===========================================================================
Runs as a separate daemon (in its own tmux session or background process).
Polls heartbeat files every 30 seconds. Respawns any centre whose heartbeat
is stale (older than STALE_THRESHOLD_SECONDS = 120 s).

Author: Genesis Parallel Builder
Created: 2026-02-26
"""

import json
import logging
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

from core.gemini_command_centres.launcher import (
    CENTRES_CONFIG,
    GCC_HEARTBEATS_DIR,
    STALE_THRESHOLD_SECONDS,
    launch_one,
)

logger = logging.getLogger(__name__)

# ─── Watchdog constants ───────────────────────────────────────────────────────
# How often (seconds) the watchdog sweeps all centres' heartbeat files.
WATCHDOG_POLL_INTERVAL = 30  # seconds
# Name of the tmux session the watchdog itself runs in (see module docstring;
# not referenced elsewhere in this file — presumably used by an external launcher).
WATCHDOG_TMUX_SESSION = "gcc-watchdog"
# Hard cap on respawns per centre; once reached, the centre is left down.
MAX_RESPAWN_ATTEMPTS = 10  # per centre per watchdog lifetime
# Minimum gap between consecutive respawn attempts of the same centre
# (anti-thrash: see GCCWatchdog._maybe_respawn).
RESPAWN_COOLDOWN = 60  # seconds between respawn attempts for same centre


class GCCWatchdog:
    """
    Monitors all 5 command centres. Respawns if heartbeat stale > 2 min.

    Tracks per-centre respawn counts and enforces a cooldown between
    successive respawns to avoid thrashing.
    """

    def __init__(
        self,
        poll_interval: int = WATCHDOG_POLL_INTERVAL,
        stale_threshold: int = STALE_THRESHOLD_SECONDS,
    ) -> None:
        """
        Args:
            poll_interval: Seconds between heartbeat sweeps.
            stale_threshold: Heartbeat age (seconds) past which a centre
                counts as dead and becomes eligible for respawn.
        """
        self.poll_interval = poll_interval
        self.stale_threshold = stale_threshold
        self.running = True

        # Per-centre bookkeeping: how many times each centre was respawned,
        # and (on the monotonic clock) when the last respawn happened.
        self._respawn_counts: dict[str, int] = {c["name"]: 0 for c in CENTRES_CONFIG}
        self._last_respawn: dict[str, Optional[float]] = {
            c["name"]: None for c in CENTRES_CONFIG
        }

        # Graceful shutdown: both signals just flip self.running so the main
        # loop exits cleanly on its next check. (Only valid in the main thread.)
        signal.signal(signal.SIGTERM, self._signal_handler)
        signal.signal(signal.SIGINT, self._signal_handler)

    def _signal_handler(self, signum: int, frame: Any) -> None:
        """Request loop shutdown; registered for SIGTERM and SIGINT."""
        logger.info("GCCWatchdog received signal %d — stopping", signum)
        self.running = False

    # ─── Main Loop ────────────────────────────────────────────────────────────

    def run(self) -> None:
        """
        Continuous monitoring loop.

        Every poll_interval seconds:
        1. Check each centre's heartbeat file.
        2. If missing or stale → respawn (subject to cooldown and max attempts).
        3. Log summary.
        """
        logger.info(
            "GCCWatchdog started — monitoring %d centres, poll=%ds, stale=%ds",
            len(CENTRES_CONFIG),
            self.poll_interval,
            self.stale_threshold,
        )

        while self.running:
            for centre in CENTRES_CONFIG:
                name = centre["name"]
                hb_file = GCC_HEARTBEATS_DIR / f"{name}.json"

                # Missing heartbeat file — centre never started or its
                # state dir was wiped; treat the same as stale.
                if not hb_file.exists():
                    logger.warning(
                        "GCCWatchdog: no heartbeat for %r — respawning", name
                    )
                    self._maybe_respawn(centre)
                    continue

                if self._is_stale(hb_file):
                    age = self._heartbeat_age(hb_file)
                    logger.warning(
                        "GCCWatchdog: %r heartbeat stale (age=%.0fs) — respawning",
                        name,
                        age or -1,  # -1 when the age could not be determined
                    )
                    self._maybe_respawn(centre)
                else:
                    logger.debug("GCCWatchdog: %r OK", name)

            # Sleep in ≤1 s slices instead of one long time.sleep():
            # per PEP 475 a plain sleep() resumes after a signal handler
            # returns, which would delay shutdown by up to poll_interval.
            deadline = time.monotonic() + self.poll_interval
            while self.running and time.monotonic() < deadline:
                time.sleep(min(1.0, max(0.0, deadline - time.monotonic())))

        logger.info("GCCWatchdog stopped")

    # ─── Heartbeat Checks ─────────────────────────────────────────────────────

    def _is_stale(self, hb_file: Path) -> bool:
        """
        Return True if the heartbeat file's last-written timestamp is older
        than stale_threshold seconds.

        Args:
            hb_file: Path to the heartbeat JSON file.

        Returns:
            True if stale (or unreadable), False if fresh.
        """
        age = self._heartbeat_age(hb_file)
        if age is None:
            return True  # Can't read = stale
        return age > self.stale_threshold

    def _heartbeat_age(self, hb_file: Path) -> Optional[float]:
        """
        Return the age in seconds of the timestamp recorded inside the heartbeat file.

        Falls back to file mtime if the JSON payload cannot be used.

        Args:
            hb_file: Path to heartbeat JSON.

        Returns:
            Age in seconds, or None on error.
        """
        try:
            with open(hb_file, "r", encoding="utf-8") as fh:
                data = json.load(fh)
            ts_str = data.get("timestamp", "")
            if ts_str:
                last_ts = datetime.fromisoformat(ts_str)
                # BUG FIX: fromisoformat() yields a *naive* datetime when the
                # string carries no UTC offset; subtracting naive from aware
                # raises TypeError, which the except clause below does not
                # catch and which would crash the watchdog loop. Treat naive
                # timestamps as UTC.
                if last_ts.tzinfo is None:
                    last_ts = last_ts.replace(tzinfo=timezone.utc)
                return (datetime.now(timezone.utc) - last_ts).total_seconds()
        except (json.JSONDecodeError, ValueError, OSError):
            pass  # fall through to the mtime-based estimate

        # Fallback: file mtime
        try:
            mtime = hb_file.stat().st_mtime
            return time.time() - mtime
        except OSError:
            return None

    # ─── Respawn ──────────────────────────────────────────────────────────────

    def _maybe_respawn(self, centre: dict) -> None:
        """
        Respawn a centre if not in cooldown and under max attempts.

        Args:
            centre: Centre config dict from CENTRES_CONFIG.
        """
        name = centre["name"]
        now = time.monotonic()

        # Cooldown check — avoid thrashing a centre that keeps dying fast.
        last = self._last_respawn.get(name)
        if last is not None and (now - last) < RESPAWN_COOLDOWN:
            logger.info(
                "GCCWatchdog: %r in respawn cooldown (%.0fs remaining)",
                name,
                RESPAWN_COOLDOWN - (now - last),
            )
            return

        # Max attempts check — permanently give up on a hopeless centre.
        count = self._respawn_counts.get(name, 0)
        if count >= MAX_RESPAWN_ATTEMPTS:
            logger.error(
                "GCCWatchdog: %r has exceeded max respawn attempts (%d) — NOT respawning",
                name,
                MAX_RESPAWN_ATTEMPTS,
            )
            return

        self._respawn(centre)

    def _respawn(self, centre: dict) -> None:
        """
        Relaunch a centre's daemon via launch_one().

        NOTE(review): launch_one() is presumed to tear down any stale tmux
        session before relaunching — confirm in the launcher module.

        Args:
            centre: Centre config dict.
        """
        name = centre["name"]
        success = launch_one(name=name, verbose=False)

        # Count the attempt and start the cooldown even on failure, so a
        # persistently failing launch cannot retry on every poll cycle.
        self._respawn_counts[name] = self._respawn_counts.get(name, 0) + 1
        self._last_respawn[name] = time.monotonic()

        if success:
            logger.info(
                "GCCWatchdog: respawned %r (attempt %d/%d)",
                name,
                self._respawn_counts[name],
                MAX_RESPAWN_ATTEMPTS,
            )
        else:
            logger.error(
                "GCCWatchdog: failed to respawn %r", name
            )

    # ─── Status ───────────────────────────────────────────────────────────────

    def get_status(self) -> dict:
        """
        Return current watchdog status including per-centre respawn counts.

        Returns:
            Status dict. Note that "last_respawn" values are time.monotonic()
            readings (or None if never respawned), not wall-clock timestamps.
        """
        return {
            "watchdog": "running" if self.running else "stopped",
            "poll_interval": self.poll_interval,
            "stale_threshold": self.stale_threshold,
            "centres": {
                c["name"]: {
                    "respawn_count": self._respawn_counts.get(c["name"], 0),
                    "last_respawn": self._last_respawn.get(c["name"]),
                }
                for c in CENTRES_CONFIG
            },
        }


def run_watchdog() -> None:
    """Entry point to start the GCCWatchdog process."""
    log_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format, stream=sys.stdout)
    GCCWatchdog().run()
