# health_monitor.py
"""
Continuous health checking of all bridge components.
"""

import asyncio
import logging
import psutil
import socket
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from enum import Enum
from dataclasses import dataclass, field

import psycopg2
from psycopg2 import pool
import aiohttp

logger = logging.getLogger(__name__)


class HealthStatus(Enum):
    """Health levels that a component check (or the overall roll-up) can report.

    Each member's value is its own name as an upper-case string.
    """
    # The check completed and found nothing wrong.
    HEALTHY = "HEALTHY"
    # The component answered, but a threshold was exceeded or the reply was
    # not the expected one (e.g. a non-200 HTTP status).
    DEGRADED = "DEGRADED"
    # The component failed, timed out or raised during the check.
    UNHEALTHY = "UNHEALTHY"
    # No verdict is available (the check itself could not run, or there are
    # no results yet).
    UNKNOWN = "UNKNOWN"


@dataclass
class HealthCheckResult:
    """Outcome of one health check run against a single component."""
    # Name of the checked component, e.g. "postgresql", "telnyx", "system".
    component: str
    # Verdict of the check.
    status: HealthStatus
    # Human-readable summary of the verdict or of the error encountered.
    message: str
    # Check-specific extra data; defaults to a fresh empty dict per instance.
    details: Dict[str, Any] = field(default_factory=dict)
    # Creation time of the result; datetime.utcnow yields a naive UTC value.
    checked_at: datetime = field(default_factory=datetime.utcnow)
    # Duration of the check in milliseconds (0.0 when not measured).
    response_time_ms: float = 0.0


class HealthMonitor:
    """Continuous health monitoring for bridge components."""
    
    def __init__(
        self,
        db_pool: pool.ThreadedConnectionPool,
        telnyx_api_key: str,
        claude_code_url: str = "http://localhost:8080/health",
        check_interval: int = 30
    ):
        self.db_pool = db_pool
        self.telnyx_api_key = telnyx_api_key
        self.claude_code_url = claude_code_url
        self.check_interval = check_interval
        
        self._health_history: List[HealthCheckResult] = []
        self._max_history = 1000
        self._running = False
        self._last_checks: Dict[str, HealthCheckResult] = {}
        
        self._setup_health_table()
    
    def _setup_health_table(self):
        """Create health check history table."""
        try:
            with self.db_pool.getconn() as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        CREATE TABLE IF NOT EXISTS genesis_bridge.health_check_history (
                            id SERIAL PRIMARY KEY,
                            component VARCHAR(255) NOT NULL,
                            status VARCHAR(50) NOT NULL,
                            message TEXT,
                            details JSONB,
                            response_time_ms FLOAT,
                            checked_at TIMESTAMP DEFAULT NOW()
                        );
                        
                        CREATE INDEX IF NOT EXISTS idx_health_history_component 
                            ON genesis_bridge.health_check_history(component, checked_at);
                    """)
                    conn.commit()
        except Exception as e:
            logger.error(f"Failed to setup health table: {e}")
    
    async def check_postgresql(self) -> HealthCheckResult:
        """Check PostgreSQL connectivity."""
        start = datetime.utcnow()
        
        try:
            conn = self.db_pool.getconn()
            try:
                with conn.cursor() as cur:
                    cur.execute("SELECT 1, version()")
                    row = cur.fetchone()
                    
                    # Check connection pool stats
                    pool_stats = {
                        "min_connections": self.db_pool.minconn,
                        "max_connections": self.db_pool.maxconn
                    }
                    
                    return HealthCheckResult(
                        component="postgresql",
                        status=HealthStatus.HEALTHY,
                        message="PostgreSQL is responsive",
                        details={"version": row[1], "pool_stats": pool_stats},
                        response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
                    )
            finally:
                self.db_pool.putconn(conn)
                
        except Exception as e:
            return HealthCheckResult(
                component="postgresql",
                status=HealthStatus.UNHEALTHY,
                message=f"PostgreSQL connection failed: {str(e)}",
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
    
    async def check_telnyx(self) -> HealthCheckResult:
        """Check Telnyx API connectivity."""
        start = datetime.utcnow()
        
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    "https://api.telnyx.com/v2/voice_assistants",
                    headers={"Authorization": f"Bearer {self.telnyx_api_key}"},
                    timeout=aiohttp.ClientTimeout(total=10)
                ) as response:
                    response_time = (datetime.utcnow() - start).total_seconds() * 1000
                    
                    if response.status == 200:
                        return HealthCheckResult(
                            component="telnyx",
                            status=HealthStatus.HEALTHY,
                            message="Telnyx API is responsive",
                            details={"status_code": response.status},
                            response_time_ms=response_time
                        )
                    else:
                        return HealthCheckResult(
                            component="telnyx",
                            status=HealthStatus.DEGRADED,
                            message=f"Telnyx API returned {response.status}",
                            details={"status_code": response.status},
                            response_time_ms=response_time
                        )
                        
        except asyncio.TimeoutError:
            return HealthCheckResult(
                component="telnyx",
                status=HealthStatus.UNHEALTHY,
                message="Telnyx API timeout",
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
        except Exception as e:
            return HealthCheckResult(
                component="telnyx",
                status=HealthStatus.UNHEALTHY,
                message=f"Telnyx API error: {str(e)}",
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
    
    async def check_claude_code(self) -> HealthCheckResult:
        """Check Claude Code service health."""
        start = datetime.utcnow()
        
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    self.claude_code_url,
                    timeout=aiohttp.ClientTimeout(total=5)
                ) as response:
                    response_time = (datetime.utcnow() - start).total_seconds() * 1000
                    
                    if response.status == 200:
                        data = await response.json()
                        return HealthCheckResult(
                            component="claude_code",
                            status=HealthStatus.HEALTHY,
                            message="Claude Code is responsive",
                            details=data,
                            response_time_ms=response_time
                        )
                    else:
                        return HealthCheckResult(
                            component="claude_code",
                            status=HealthStatus.DEGRADED,
                            message=f"Claude Code returned {response.status}",
                            details={"status_code": response.status},
                            response_time_ms=response_time
                        )
                        
        except asyncio.TimeoutError:
            return HealthCheckResult(
                component="claude_code",
                status=HealthStatus.UNHEALTHY,
                message="Claude Code timeout",
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
        except Exception as e:
            return HealthCheckResult(
                component="claude_code",
                status=HealthStatus.UNHEALTHY,
                message=f"Claude Code error: {str(e)}",
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
    
    async def check_system_resources(self) -> HealthCheckResult:
        """Check system resources."""
        start = datetime.utcnow()
        
        try:
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')
            
            # Determine status based on thresholds
            status = HealthStatus.HEALTHY
            issues = []
            
            if cpu_percent > 90:
                status = HealthStatus.DEGRADED
                issues.append(f"High CPU: {cpu_percent}%")
            elif cpu_percent > 80:
                status = HealthStatus.DEGRADED
            
            if memory.percent > 90:
                status = HealthStatus.DEGRADED
                issues.append(f"High memory: {memory.percent}%")
            elif memory.percent > 80:
                status = HealthStatus.DEGRADED
            
            if disk.percent > 90:
                status = HealthStatus.DEGRADED
                issues.append(f"Low disk: {100 - disk.percent}% free")
            
            details = {
                "cpu_percent": cpu_percent,
                "memory_percent": memory.percent,
                "memory_available_mb": memory.available / (1024 * 1024),
                "disk_percent": disk.percent,
                "disk_free_gb": disk.free / (1024 * 1024 * 1024)
            }
            
            return HealthCheckResult(
                component="system",
                status=status,
                message="; ".join(issues) if issues else "System resources OK",
                details=details,
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
            
        except Exception as e:
            return HealthCheckResult(
                component="system",
                status=HealthStatus.UNKNOWN,
                message=f"Failed to check system resources: {str(e)}",
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
    
    async def check_network(self) -> HealthCheckResult:
        """Check network connectivity."""
        start = datetime.utcnow()
        
        try:
            # Try to resolve DNS
            socket.setdefaulttimeout(5)
            ip = socket.gethostbyname("api.telnyx.com")
            
            return HealthCheckResult(
                component="network",
                status=HealthStatus.HEALTHY,
                message="Network connectivity OK",
                details={"resolved_ip": ip},
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
        except Exception as e:
            return HealthCheckResult(
                component="network",
                status=HealthStatus.UNHEALTHY,
                message=f"Network error: {str(e)}",
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
    
    async def check_queue_size(self) -> HealthCheckResult:
        """Check queue sizes."""
        start = datetime.utcnow()
        
        try:
            with self.db_pool.getconn() as conn:
                with conn.cursor() as cur:
                    # Check pending directives
                    cur.execute("""
                        SELECT COUNT(*) 
                        FROM genesis_bridge.directives 
                        WHERE status = 'PENDING'
                    """)
                    pending_directives = cur.fetchone()[0]
                    
                    # Check retry tracking
                    cur.execute("""
                        SELECT COUNT(*) 
                        FROM genesis_bridge.retry_tracking 
                        WHERE status = 'IN_PROGRESS'
                    """)
                    active_retries = cur.fetchone()[0]
                    
                    # Check DLQ
                    cur.execute("""
                        SELECT COUNT(*) 
                        FROM genesis_bridge.dead_letter_queue 
                        WHERE status IN ('PENDING', 'RETRYING')
                    """)
                    dlq_pending = cur.fetchone()[0]
                    
                    # Determine status
                    status = HealthStatus.HEALTHY
                    issues = []
                    
                    if pending_directives > 1000:
                        status = HealthStatus.UNHEALTHY
                        issues.append(f"High pending directives: {pending_directives}")
                    elif pending_directives > 500:
                        status = HealthStatus.DEGRADED
                    
                    if dlq_pending > 100:
                        status = HealthStatus.DEGRADED
                        issues.append(f"High DLQ: {dlq_pending}")
                    
                    return HealthCheckResult(
                        component="queue",
                        status=status,
                        message="; ".join(issues) if issues else "Queue sizes OK",
                        details={
                            "pending_directives": pending_directives,
                            "active_retries": active_retries,
                            "dlq_pending": dlq_pending
                        },
                        response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
                    )
        except Exception as e:
            return HealthCheckResult(
                component="queue",
                status=HealthStatus.UNKNOWN,
                message=f"Failed to check queue: {str(e)}",
                response_time_ms=(datetime.utcnow() - start).total_seconds() * 1000
            )
    
    async def run_all_checks(self) -> Dict[str, HealthCheckResult]:
        """Run all health checks."""
        checks = [
            self.check_postgresql(),
            self.check_telnyx(),
            self.check_claude_code(),
            self.check_system_resources(),
            self.check_network(),
            self.check_queue_size()
        ]
        
        results = {}
        for check in asyncio.as_completed(checks):
            result = await check
            results[result.component] = result
            self._last_checks[result.component] = result
            
            # Store in history
            self._health_history.append(result)
            if len(self._health_history) > self._max_history:
                self._health_history = self._health_history[-self._max_history:]
            
            # Save to database
            self._save_health_check(result)
        
        return results
    
    def _save_health_check(self, result: HealthCheckResult):
        """Save health check result to database."""
        try:
            with self.db_pool.getconn() as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        INSERT INTO genesis_bridge.health_check_history 
                        (component, status, message, details, response_time_ms)
                        VALUES (%s, %s, %s, %s, %s)
                    """, (
                        result.component,
                        result.status.value,
                        result.message,
                        psycopg2.extras.Json(result.details),
                        result.response_time_ms
                    ))
                    conn.commit()
        except Exception as e:
            logger.error(f"Failed to save health check: {e}")
    
    async def start_monitoring(self):
        """Start continuous health monitoring."""
        self._running = True
        
        while self._running:
            try:
                logger.info("Running health checks...")
                results = await self.run_all_checks()
                
                overall_status = self.get_overall_status(results)
                logger.info(
                    f"Health check complete: {overall_status.value} - "
                    f"{sum(1 for r in results.values() if r.status == HealthStatus.HEALTHY)}/{len(results)} healthy"
                )
                
            except Exception as e:
                logger.error(f"Health check cycle failed: {e}")
            
            await asyncio.sleep(self.check_interval)
    
    def stop_monitoring(self):
        """Stop health monitoring."""
        self._running = False
    
    def get_overall_status(self, results: Optional[Dict[str, HealthCheckResult]] = None) -> HealthStatus:
        """Get overall system health status."""
        if results is None:
            results = self._last_checks
        
        if not results:
            return HealthStatus.UNKNOWN
        
        statuses = [r.status for r in results.values()]
        
        if any(s == HealthStatus.UNHEALTHY for s in statuses):
            return HealthStatus.UNHEALTHY
        elif any(s == HealthStatus.DEGRADED for s in statuses):
            return HealthStatus.DEGRADED
        elif all(s == HealthStatus.HEALTHY for s in statuses):
            return HealthStatus.HEALTHY
        else:
            return HealthStatus.UNKNOWN
    
    def get_last_checks(self) -> Dict[str, HealthCheckResult]:
        """Get last health check results."""
        return self._last_checks.copy()
    
    def get_health_history(
        self,
        component: Optional[str] = None,
        since: Optional[datetime] = None,
        limit: int = 100
    ) -> List[Dict]:
        """Get health check history."""
        try:
            with self.db_pool.getconn() as conn:
                with conn.cursor() as cur:
                    query = """
                        SELECT component, status, message, details, 
                               response_time_ms, checked_at
                        FROM genesis_bridge.health_check_history
                        WHERE 1=1
                    """
                    params = []
                    
                    if component:
                        query += " AND component = %s"
                        params.append(component)
                    
                    if since:
                        query += " AND checked_at >= %s"
                        params.append(since)
                    
                    query += " ORDER BY checked_at DESC LIMIT %s"
                    params.append(limit)
                    
                    cur.execute(query, params)
                    
                    return [
                        {
                            "component": row[0],
                            "status": row[1],
                            "message": row[2],
                            "details": row[3],
                            "response_time_ms": row[4],
                            "checked_at": row[5].isoformat()
                        }
                        for row in cur.fetchall()
                    ]
        except Exception as e:
            logger.error(f"Failed to get health history: {e}")
            return []
    
    def clear_old_history(self, older_than_days: int = 7):
        """Clear old health check history."""
        try:
            with self.db_pool.getconn() as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        DELETE FROM genesis_bridge.health_check_history
                        WHERE checked_at < NOW() - INTERVAL '%s days'
                    """, (older_than_days,))
                    deleted = cur.rowcount
                    conn.commit()
                    
                    logger.info(f"Cleared {deleted} old health check entries")
                    return deleted
        except Exception as e:
            logger.error(f"Failed to clear old history: {e}")
            return 0