#!/usr/bin/env python3
"""
AIVA Watchdog - System Hardening Component
==========================================
Monitors AIVA processes and auto-restarts on failure.
Implements health checks and alerting.

Part of Genesis System Hardening (Post-Mortem 2026-01-14)
"""

import os
import sys
import time
import json
import signal
import subprocess
import logging
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict

# Make the repository root importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Redis is optional: when absent, health reporting degrades to a no-op.
try:
    import redis
    HAS_REDIS = True
except ImportError:
    HAS_REDIS = False

# Configuration — absolute paths into the Genesis checkout.
GENESIS_DIR = Path("/mnt/e/genesis-system")
VENV_PYTHON = GENESIS_DIR / ".venv/bin/python"  # interpreter used to spawn children
LOG_DIR = GENESIS_DIR / "data"  # destination for child + watchdog logs
CONFIG_PATH = GENESIS_DIR / "config/secrets.env"  # KEY=VALUE secrets (Redis creds)

# Process definitions — per-process supervision policy:
#   script         - path relative to GENESIS_DIR
#   args           - extra argv appended after the script path
#   log            - filename under LOG_DIR (stdout+stderr appended)
#   critical       - whether the watchdog auto-restarts it when down
#   restart_delay  - seconds to wait before each restart attempt
#   max_restarts   - restarts allowed within restart_window seconds
#   restart_window - sliding window (seconds) for the restart limit
PROCESSES = {
    "aiva_orchestrator": {
        "script": "core/aiva_orchestrator.py",
        "args": ["start"],
        "log": "aiva_orchestrator.log",
        "critical": True,
        "restart_delay": 10,
        "max_restarts": 5,
        "restart_window": 300,  # 5 minutes
    },
    "aiva_daemon": {
        "script": "AIVA/aiva_daemon.py",
        "args": [],
        "log": "aiva_daemon.log",
        "critical": True,
        "restart_delay": 5,
        "max_restarts": 3,
        "restart_window": 300,
    },
}

# Logging setup: mirror everything to a file and to the console.
# NOTE(review): FileHandler raises at import time if LOG_DIR does not exist —
# confirm the data/ directory is created before first run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [WATCHDOG] %(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler(LOG_DIR / "watchdog.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("watchdog")


class RedisHealth:
    """Redis-based health reporting and alerting.

    Degrades to a no-op when the ``redis`` package is missing or the server
    is unreachable (``self.redis`` stays ``None``), so callers may invoke
    ``report_health`` / ``send_alert`` unconditionally.
    """

    def __init__(self):
        self.redis = None  # None => reporting disabled (no-op mode)
        if HAS_REDIS:
            self._load_config()
            self._connect()

    def _load_config(self, config_path: Optional[Path] = None):
        """Load Redis connection settings into ``self.config``.

        Starts from built-in defaults, then overrides them from the secrets
        file (simple ``KEY=VALUE`` lines) when it exists.

        Args:
            config_path: Alternate secrets file location; defaults to
                CONFIG_PATH. Mainly useful for testing.
        """
        # SECURITY(review): hard-coded fallback credentials belong in the
        # secrets file / environment, not in source control — rotate & remove.
        self.config = {
            "host": "redis-genesis-u50607.vm.elestio.app",
            "port": 26379,
            "password": "e2ZyYYr4oWRdASI2CaLc-",
            "decode_responses": True,
        }

        path = CONFIG_PATH if config_path is None else config_path
        if not path.exists():
            return

        with open(path) as f:
            for raw in f:
                line = raw.strip()
                # Skip blanks and commented-out entries (the previous
                # whole-line substring match also picked up '# REDIS_HOST=...'
                # lines and silently overrode live settings).
                if not line or line.startswith("#") or "=" not in line:
                    continue
                # Split only on the FIRST '=' so values containing '='
                # (common in passwords) are preserved intact.
                key, value = line.split("=", 1)
                value = value.strip().strip('"')
                if "REDIS_HOST" in key:
                    self.config["host"] = value
                elif "REDIS_PORT" in key:
                    self.config["port"] = int(value)
                elif "REDIS_PASSWORD" in key:
                    self.config["password"] = value

    def _connect(self):
        """Open the Redis connection; on any failure fall back to no-op mode."""
        try:
            self.redis = redis.Redis(**self.config)
            self.redis.ping()  # fail fast so we degrade immediately
            logger.info("Redis health reporting connected")
        except Exception as e:
            logger.warning(f"Redis connection failed: {e}")
            self.redis = None

    def report_health(self, process_name: str, status: str, details: Optional[Dict] = None):
        """Publish one process's health snapshot to Redis.

        Stores the JSON payload in the ``genesis:health`` hash and refreshes a
        short-lived ``...:alive`` key whose expiry lets consumers detect
        staleness. Silently no-ops when Redis is unavailable.
        """
        if not self.redis:
            return

        try:
            health_data = {
                "process": process_name,
                "status": status,
                "timestamp": datetime.now().isoformat(),
                "details": details or {},
            }

            # Latest snapshot per process, keyed by process name.
            self.redis.hset(
                "genesis:health",
                process_name,
                json.dumps(health_data)
            )

            # Freshness marker: consumers treat a missing key as "stale".
            self.redis.setex(
                f"genesis:health:{process_name}:alive",
                60,  # 1 minute TTL
                "1"
            )

        except Exception as e:
            logger.warning(f"Failed to report health: {e}")

    def send_alert(self, message: str, level: str = "warning"):
        """Broadcast an alert on the Redis pub/sub channels; no-op without Redis."""
        if not self.redis:
            return

        try:
            alert = {
                "type": "watchdog_alert",
                "level": level,
                "message": message,
                "timestamp": datetime.now().isoformat(),
            }
            self.redis.publish("genesis:alerts", json.dumps(alert))
            self.redis.publish("genesis:nervous_system", json.dumps(alert))

        except Exception as e:
            logger.warning(f"Failed to send alert: {e}")


class ProcessManager:
    """Supervises one child process: liveness checks and rate-limited restarts."""

    def __init__(self, name: str, config: Dict, health: RedisHealth):
        self.name = name
        self.config = config
        self.health = health
        self.process: Optional[subprocess.Popen] = None  # child we launched, if any
        self.restart_times = []   # timestamps of recent restart attempts
        self.total_restarts = 0   # lifetime restart counter

    def _pgrep_pids(self):
        """Return PID strings of system-wide matches for our script, or []."""
        try:
            probe = subprocess.run(
                ["pgrep", "-f", self.config["script"]],
                capture_output=True,
                text=True
            )
        except Exception:
            return []
        if probe.returncode != 0:
            return []
        return probe.stdout.strip().split('\n')

    def is_running(self) -> bool:
        """True when our tracked child is alive or pgrep finds the script running."""
        if self.process is not None and self.process.poll() is None:
            return True
        return bool(self._pgrep_pids())

    def get_pid(self) -> Optional[int]:
        """PID of the tracked child, else the first pgrep match, else None."""
        if self.process is not None and self.process.poll() is None:
            return self.process.pid
        pids = self._pgrep_pids()
        return int(pids[0]) if pids else None

    def can_restart(self) -> bool:
        """True while restarts within the sliding window stay under the limit.

        Also prunes restart timestamps that fell out of the window.
        """
        cutoff = time.time() - self.config["restart_window"]
        self.restart_times = [stamp for stamp in self.restart_times if stamp > cutoff]
        return len(self.restart_times) < self.config["max_restarts"]

    def start(self) -> bool:
        """Launch the process (detached), appending output to its log file.

        Returns True when it is running after a 2s grace period, or was
        already running; False on launch failure or immediate exit.
        """
        if self.is_running():
            logger.info(f"{self.name} already running (PID: {self.get_pid()})")
            return True

        target = GENESIS_DIR / self.config["script"]
        if not target.exists():
            logger.error(f"Script not found: {target}")
            return False

        try:
            command = [str(VENV_PYTHON), str(target), *self.config["args"]]
            # stdout/stderr both appended to the per-process log file.
            with open(LOG_DIR / self.config["log"], "a") as sink:
                self.process = subprocess.Popen(
                    command,
                    stdout=sink,
                    stderr=subprocess.STDOUT,
                    cwd=str(GENESIS_DIR),
                    start_new_session=True,  # detach so watchdog signals don't hit it
                )

            # Grace period: catch children that crash right away.
            time.sleep(2)

            if self.process.poll() is not None:
                logger.error(f"Failed to start {self.name} - process exited immediately")
                return False

            logger.info(f"Started {self.name} (PID: {self.process.pid})")
            self.health.report_health(self.name, "running", {"pid": self.process.pid})
            self.health.send_alert(f"{self.name} started (PID: {self.process.pid})", "info")
            return True

        except Exception as e:
            logger.error(f"Error starting {self.name}: {e}")
            return False

    def restart(self) -> bool:
        """Restart with rate limiting; alerts and refuses when the limit is hit."""
        limit = self.config["max_restarts"]
        if not self.can_restart():
            logger.error(f"{self.name} exceeded restart limit ({limit} in {self.config['restart_window']}s)")
            self.health.send_alert(
                f"CRITICAL: {self.name} exceeded restart limit - manual intervention required",
                "critical"
            )
            return False

        attempt = len(self.restart_times) + 1
        self.restart_times.append(time.time())
        self.total_restarts += 1

        logger.warning(f"Restarting {self.name} (attempt {attempt}/{limit})")
        self.health.send_alert(f"{self.name} restarting (attempt {attempt})", "warning")

        # Back off briefly before relaunching (blocks the watchdog loop).
        time.sleep(self.config["restart_delay"])
        return self.start()

    def check_and_recover(self) -> bool:
        """One supervision pass: report health, restart critical processes that died."""
        if self.is_running():
            self.health.report_health(self.name, "running", {"pid": self.get_pid()})
            return True

        logger.warning(f"{self.name} is not running - attempting recovery")
        self.health.report_health(self.name, "down")

        if not self.config["critical"]:
            logger.info(f"{self.name} is not critical, skipping restart")
            return False
        return self.restart()


class AIVAWatchdog:
    """Top-level supervisor: owns one ProcessManager per configured process."""

    def __init__(self):
        self.health = RedisHealth()
        self.running = True
        self.check_interval = 30  # seconds between health sweeps
        # One manager per PROCESSES entry, all sharing the health reporter.
        self.managers = {
            name: ProcessManager(name, cfg, self.health)
            for name, cfg in PROCESSES.items()
        }

        # Graceful shutdown on Ctrl-C / service stop.
        for sig in (signal.SIGINT, signal.SIGTERM):
            signal.signal(sig, self._signal_handler)

        logger.info("AIVA Watchdog initialized")

    def _signal_handler(self, signum, frame):
        """Flip the run flag so the main loop exits after the current sweep."""
        logger.info(f"Received signal {signum}, shutting down watchdog...")
        self.running = False

    def start_all(self):
        """Launch every managed process that is not already up."""
        logger.info("Starting all managed processes...")

        for name, manager in self.managers.items():
            if manager.is_running():
                logger.info(f"{name} already running")
            else:
                manager.start()

    def check_all(self):
        """Run one recovery sweep; a failure on one process never blocks the rest."""
        for name, manager in self.managers.items():
            try:
                manager.check_and_recover()
            except Exception as e:
                logger.error(f"Error checking {name}: {e}")

    def get_status(self) -> Dict:
        """Return a JSON-serializable snapshot of watchdog and process state."""
        per_process = {
            name: {
                "running": mgr.is_running(),
                "pid": mgr.get_pid(),
                "total_restarts": mgr.total_restarts,
                "recent_restarts": len(mgr.restart_times),
            }
            for name, mgr in self.managers.items()
        }
        return {
            "timestamp": datetime.now().isoformat(),
            "watchdog": "running" if self.running else "stopped",
            "processes": per_process,
        }

    def run(self):
        """Blocking main loop: start everything, then sweep every check_interval s."""
        logger.info("=" * 60)
        logger.info("AIVA WATCHDOG STARTING")
        logger.info("=" * 60)

        self.health.send_alert("AIVA Watchdog started - monitoring processes", "info")

        # Bring everything up before entering the monitoring loop.
        self.start_all()

        while self.running:
            try:
                self.check_all()

                # Push an aggregate snapshot for external consumers.
                snapshot = self.get_status()
                self.health.report_health("watchdog", "running", snapshot)

                alive = sum(1 for info in snapshot["processes"].values() if info["running"])
                logger.debug(f"Health check: {alive}/{len(snapshot['processes'])} processes running")

            except Exception as e:
                logger.error(f"Watchdog loop error: {e}")

            # Sleep in one-second slices so shutdown signals are honored promptly.
            for _ in range(self.check_interval):
                if not self.running:
                    break
                time.sleep(1)

        logger.info("AIVA Watchdog stopped")
        self.health.send_alert("AIVA Watchdog stopped", "warning")


def main():
    """CLI entry point: run the watchdog loop, or print status / check output."""
    import argparse

    parser = argparse.ArgumentParser(description="AIVA Watchdog - Process Monitor")
    parser.add_argument("command", choices=["start", "status", "check"],
                       help="Command to run")
    parser.add_argument("--interval", type=int, default=30,
                       help="Check interval in seconds")
    args = parser.parse_args()

    # Every command needs a watchdog instance; exactly one branch runs.
    watchdog = AIVAWatchdog()

    if args.command == "start":
        watchdog.check_interval = args.interval
        watchdog.run()

    elif args.command == "status":
        print(json.dumps(watchdog.get_status(), indent=2))

    else:  # "check" — one-shot recovery sweep plus a human-readable report
        watchdog.check_all()
        report = watchdog.get_status()

        print("Process Status:")
        for name, info in report["processes"].items():
            marker = "✅" if info["running"] else "❌"
            detail = f"PID: {info['pid']}" if info["pid"] else "not running"
            print(f"  {marker} {name}: {detail} (restarts: {info['total_restarts']})")


if __name__ == "__main__":
    main()
