#!/usr/bin/env python3
"""
GENESIS SELF-HEALING WATCHDOG
=============================
Monitors Genesis components and automatically restarts failed services.

Features:
- Component health monitoring
- Automatic restart on failure
- Exponential backoff for repeated failures
- Alert notifications
- Recovery logging

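Usage:
    python self_healing_watchdog.py                # Run watchdog continuously (checks every 60s)
    python self_healing_watchdog.py --interval 30  # Run continuously, checking every 30 seconds
    python self_healing_watchdog.py --once         # Check once and exit
    python self_healing_watchdog.py --status       # Show component status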
"""

import json
import os
import subprocess
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Callable, Any


class ComponentStatus(Enum):
    """Component health status.

    The string values are what run_checks() stores for each component in the
    persisted watchdog state.
    """
    # Health check passed, or a restart function reported success.
    HEALTHY = "healthy"
    # Not assigned by any check in this file; only mapped to an icon for display.
    DEGRADED = "degraded"
    # Health check returned a falsy value or raised, or a recovery attempt failed.
    FAILED = "failed"
    # Set by attempt_recovery() while the restart function is running.
    RESTARTING = "restarting"
    # Default for a component that has not been checked yet.
    UNKNOWN = "unknown"


@dataclass
class Component:
    """A monitored component.

    Bundles the health probe, the optional restart hook, and the mutable
    retry/backoff bookkeeping the watchdog keeps for one component.
    """
    # Human-readable name; also the key used in results and persisted state.
    name: str
    # Zero-argument health probe; a truthy result means healthy.
    check_func: Callable[[], bool]
    # Zero-argument restart hook returning True on success; None means the
    # component can be monitored but not recovered automatically.
    restart_func: Optional[Callable[[], bool]] = None
    # Recovery attempts allowed before attempt_recovery() gives up.
    max_retries: int = 3
    # Recovery attempts made so far; reset to 0 on a healthy check or a
    # successful restart.
    retry_count: int = 0
    # Set by check_component() when a check fails; cleared on a healthy check.
    last_failure: Optional[datetime] = None
    # Base delay in seconds for the exponential backoff in attempt_recovery().
    backoff_seconds: int = 10
    # Most recent status assigned by the watchdog.
    status: ComponentStatus = ComponentStatus.UNKNOWN
    # Disabled components are skipped by run_checks().
    enabled: bool = True


@dataclass
class WatchdogState:
    """Watchdog state for persistence.

    Serialized to JSON by SelfHealingWatchdog._save_state() and restored by
    SelfHealingWatchdog._load_state().
    """
    # Component name -> {"status", "retry_count", "last_failure"} snapshot
    # written by run_checks().
    components: Dict[str, Dict] = field(default_factory=dict)
    # ISO-8601 timestamp set each time the state is saved; None until then.
    last_check: Optional[str] = None
    # Incremented once per recovery attempt that actually runs.
    total_restarts: int = 0
    # Incremented when a recovery attempt returns a falsy result or raises.
    total_failures: int = 0


class SelfHealingWatchdog:
    """
    Self-healing watchdog for Genesis components.

    Monitors component health and automatically attempts recovery
    when failures are detected.
    """

    def __init__(self, state_path: str = None):
        self.genesis_root = Path(__file__).parent.parent
        self.state_path = Path(state_path) if state_path else (
            self.genesis_root / "data" / "watchdog_state.json"
        )
        self.state = self._load_state()
        self.components: List[Component] = []
        self._register_components()

    def _load_state(self) -> WatchdogState:
        """Load watchdog state from file."""
        if self.state_path.exists():
            try:
                with open(self.state_path) as f:
                    data = json.load(f)
                return WatchdogState(**data)
            except Exception:
                pass
        return WatchdogState()

    def _save_state(self):
        """Save watchdog state to file."""
        self.state_path.parent.mkdir(parents=True, exist_ok=True)
        self.state.last_check = datetime.now().isoformat()

        data = {
            "components": self.state.components,
            "last_check": self.state.last_check,
            "total_restarts": self.state.total_restarts,
            "total_failures": self.state.total_failures
        }

        with open(self.state_path, "w") as f:
            json.dump(data, f, indent=2)

    def _register_components(self):
        """Register all monitored components."""
        self.components = [
            Component(
                name="Genesis Kernel",
                check_func=self._check_kernel,
                restart_func=self._restart_kernel
            ),
            Component(
                name="Hyperdrive Controller",
                check_func=self._check_hyperdrive,
            ),
            Component(
                name="Gemini Executor",
                check_func=self._check_gemini,
            ),
            Component(
                name="Task Queue",
                check_func=self._check_tasks,
            ),
            Component(
                name="Memory System",
                check_func=self._check_memory,
            ),
            Component(
                name="Configuration",
                check_func=self._check_config,
            ),
        ]

    # ============== Health Check Functions ==============

    def _check_kernel(self) -> bool:
        """Check if Genesis kernel is operational."""
        try:
            import genesis_kernel
            return True
        except ImportError:
            return False

    def _check_hyperdrive(self) -> bool:
        """Check Hyperdrive controller."""
        try:
            from hyperdrive_controller import HyperdriveController
            return True
        except ImportError:
            return False

    def _check_gemini(self) -> bool:
        """Check Gemini executor."""
        try:
            os.environ['GEMINI_API_KEY'] = 'AIzaSyALfbAdHfJ6aRnqNyiTRmKmGVoena1JsdU'
            from gemini_executor import GeminiExecutor
            executor = GeminiExecutor()
            return executor.api_key is not None
        except Exception:
            return False

    def _check_tasks(self) -> bool:
        """Check task queue file."""
        tasks_path = self.genesis_root / "loop" / "tasks.json"
        if not tasks_path.exists():
            return False

        try:
            with open(tasks_path) as f:
                data = json.load(f)
            return "stories" in data
        except Exception:
            return False

    def _check_memory(self) -> bool:
        """Check memory system files."""
        data_dir = self.genesis_root / "data"
        required = ["heartbeat_state.json", "kernel_state.json"]

        for filename in required:
            if (data_dir / filename).exists():
                return True
        return False

    def _check_config(self) -> bool:
        """Check configuration files."""
        config_path = self.genesis_root / "config" / "genesis_config.json"
        return config_path.exists()

    # ============== Restart Functions ==============

    def _restart_kernel(self) -> bool:
        """Attempt to restart Genesis kernel."""
        try:
            # Reset kernel state
            state_path = self.genesis_root / "data" / "kernel_state.json"
            if state_path.exists():
                with open(state_path) as f:
                    state = json.load(f)
                state["status"] = "restarting"
                state["restart_time"] = datetime.now().isoformat()
                with open(state_path, "w") as f:
                    json.dump(state, f)

            # Verify kernel can import
            import importlib
            import genesis_kernel
            importlib.reload(genesis_kernel)

            return True
        except Exception as e:
            print(f"[Watchdog] Kernel restart failed: {e}")
            return False

    # ============== Watchdog Core ==============

    def check_component(self, component: Component) -> ComponentStatus:
        """Check a single component's health."""
        try:
            healthy = component.check_func()

            if healthy:
                component.status = ComponentStatus.HEALTHY
                component.retry_count = 0
                component.last_failure = None
            else:
                component.status = ComponentStatus.FAILED
                component.last_failure = datetime.now()

            return component.status

        except Exception as e:
            print(f"[Watchdog] Error checking {component.name}: {e}")
            component.status = ComponentStatus.FAILED
            component.last_failure = datetime.now()
            return ComponentStatus.FAILED

    def attempt_recovery(self, component: Component) -> bool:
        """Attempt to recover a failed component."""
        if not component.restart_func:
            print(f"[Watchdog] No restart function for {component.name}")
            return False

        if component.retry_count >= component.max_retries:
            print(f"[Watchdog] Max retries exceeded for {component.name}")
            return False

        # Exponential backoff
        if component.last_failure:
            backoff = component.backoff_seconds * (2 ** component.retry_count)
            elapsed = (datetime.now() - component.last_failure).total_seconds()
            if elapsed < backoff:
                print(f"[Watchdog] Waiting {backoff - elapsed:.0f}s before retry")
                return False

        print(f"[Watchdog] Attempting recovery for {component.name} "
              f"(attempt {component.retry_count + 1}/{component.max_retries})")

        component.status = ComponentStatus.RESTARTING
        component.retry_count += 1
        self.state.total_restarts += 1

        try:
            success = component.restart_func()

            if success:
                print(f"[Watchdog] Recovery successful for {component.name}")
                component.status = ComponentStatus.HEALTHY
                component.retry_count = 0
                return True
            else:
                print(f"[Watchdog] Recovery failed for {component.name}")
                component.status = ComponentStatus.FAILED
                self.state.total_failures += 1
                return False

        except Exception as e:
            print(f"[Watchdog] Recovery error for {component.name}: {e}")
            component.status = ComponentStatus.FAILED
            self.state.total_failures += 1
            return False

    def run_checks(self) -> Dict[str, ComponentStatus]:
        """Run all health checks."""
        results = {}

        for component in self.components:
            if not component.enabled:
                continue

            status = self.check_component(component)
            results[component.name] = status

            # Update state
            self.state.components[component.name] = {
                "status": status.value,
                "retry_count": component.retry_count,
                "last_failure": component.last_failure.isoformat() if component.last_failure else None
            }

            # Attempt recovery if failed
            if status == ComponentStatus.FAILED:
                self.attempt_recovery(component)

        self._save_state()
        return results

    def run_continuous(self, interval: int = 60):
        """Run watchdog continuously."""
        print("\n" + "=" * 50)
        print("GENESIS SELF-HEALING WATCHDOG")
        print("=" * 50)
        print(f"Monitoring {len(self.components)} components")
        print(f"Check interval: {interval}s")
        print("Press Ctrl+C to stop")
        print("=" * 50 + "\n")

        try:
            while True:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(f"\n[{timestamp}] Running health checks...")

                results = self.run_checks()

                # Summary
                healthy = sum(1 for s in results.values() if s == ComponentStatus.HEALTHY)
                total = len(results)

                print(f"Status: {healthy}/{total} components healthy")

                for name, status in results.items():
                    icon = {"healthy": "[OK]", "failed": "[XX]", "restarting": "[..]"}.get(status.value, "[??]")
                    print(f"  {icon} {name}")

                time.sleep(interval)

        except KeyboardInterrupt:
            print("\n[Watchdog] Shutting down...")

    def display_status(self):
        """Display current component status."""
        results = self.run_checks()

        print("\n" + "=" * 50)
        print("GENESIS WATCHDOG STATUS")
        print("=" * 50)
        print(f"Last check: {self.state.last_check or 'Never'}")
        print(f"Total restarts: {self.state.total_restarts}")
        print(f"Total failures: {self.state.total_failures}")
        print("-" * 50)

        for component in self.components:
            status_icon = {
                ComponentStatus.HEALTHY: "[OK]",
                ComponentStatus.DEGRADED: "[!!]",
                ComponentStatus.FAILED: "[XX]",
                ComponentStatus.RESTARTING: "[..]",
                ComponentStatus.UNKNOWN: "[??]"
            }.get(component.status, "[??]")

            print(f"\n{status_icon} {component.name}")
            print(f"    Status: {component.status.value}")
            print(f"    Retries: {component.retry_count}/{component.max_retries}")
            if component.last_failure:
                print(f"    Last failure: {component.last_failure}")

        print("\n" + "=" * 50)


def main():
    """Parse command-line options and run the watchdog in the chosen mode.

    ``--status`` and ``--once`` both perform a single check pass and print the
    status report; without either flag the watchdog loops until interrupted.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Genesis Self-Healing Watchdog")
    parser.add_argument("--once", action="store_true", help="Run once and exit")
    parser.add_argument("--status", action="store_true", help="Show status")
    parser.add_argument("--interval", type=int, default=60, help="Check interval (seconds)")
    args = parser.parse_args()

    watchdog = SelfHealingWatchdog()

    if args.status or args.once:
        # display_status() runs a full check pass itself. Calling run_checks()
        # beforehand would run every check (and any recovery) twice.
        watchdog.display_status()
    else:
        if args.interval <= 0:
            # A zero interval busy-loops and a negative one makes
            # time.sleep() raise.
            parser.error("--interval must be a positive number of seconds")
        watchdog.run_continuous(interval=args.interval)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
