#!/usr/bin/env python3
"""
AIVA Queen Chaos Engineering Test Suite
========================================

Comprehensive chaos engineering framework for testing AIVA Queen resilience
under adverse conditions. Implements industry-standard chaos testing patterns
including network failures, latency injection, resource exhaustion, and
dependency failures.

Components:
    - ChaosOrchestrator: Central coordinator for chaos experiments
    - NetworkFaultInjector: Simulates network partitions and failures
    - LatencyInjector: Adds artificial delays to test timeout handling
    - ResourceExhauster: Tests behavior under resource constraints
    - DependencyKiller: Simulates dependency service failures
    - RecoveryValidator: Validates system recovery after chaos

Based on principles from:
    - Netflix Chaos Monkey
    - Gremlin Fault Injection
    - AWS Fault Injection Simulator

Author: Genesis System - AIVA Queen
Version: 1.0.0
"""

from __future__ import annotations

import asyncio
import contextlib
import gc
import hashlib
import json
import logging
import os
import random
import signal
import socket
import sys
import threading
import time
import traceback
import uuid
import weakref
from abc import ABC, abstractmethod
from collections import defaultdict, deque
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
from contextlib import asynccontextmanager, contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum, auto
from functools import wraps
from pathlib import Path
from typing import (
    Any,
    Callable,
    Coroutine,
    Deque,
    Dict,
    Generic,
    Iterator,
    List,
    Optional,
    Set,
    Tuple,
    Type,
    TypeVar,
    Union,
)
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("aiva.chaos_engineering")

# Type variables
T = TypeVar("T")
R = TypeVar("R")


# =============================================================================
# ENUMS AND CONSTANTS
# =============================================================================


class ChaosType(Enum):
    """Types of chaos that can be injected.

    Network-related members are consumed by NetworkFaultInjector, the
    resource members by ResourceExhauster, and dependency failures by
    DependencyKiller. Some members (e.g. CLOCK_SKEW, CERTIFICATE_EXPIRY,
    RATE_LIMIT_EXCEEDED) have no injector code path in this chunk —
    presumably handled elsewhere or reserved for future injectors.
    """
    NETWORK_PARTITION = "network_partition"
    NETWORK_LATENCY = "network_latency"
    PACKET_LOSS = "packet_loss"
    DEPENDENCY_FAILURE = "dependency_failure"
    RESOURCE_EXHAUSTION = "resource_exhaustion"
    CPU_STRESS = "cpu_stress"
    MEMORY_PRESSURE = "memory_pressure"
    DISK_FULL = "disk_full"
    CLOCK_SKEW = "clock_skew"
    PROCESS_KILL = "process_kill"
    DNS_FAILURE = "dns_failure"
    CERTIFICATE_EXPIRY = "certificate_expiry"
    RATE_LIMIT_EXCEEDED = "rate_limit_exceeded"
    CASCADING_FAILURE = "cascading_failure"


class ChaosState(Enum):
    """State of a chaos experiment (lifecycle of a ChaosResult).

    COMPLETED and ROLLED_BACK are the two "clean finish" states — see
    ChaosResult.was_successful.
    """
    PENDING = "pending"
    RUNNING = "running"
    PAUSED = "paused"
    COMPLETED = "completed"
    FAILED = "failed"
    ROLLED_BACK = "rolled_back"


class RecoveryState(Enum):
    """State of system recovery, as tracked by RecoveryValidator.

    PARTIAL means some but not all expected components returned to
    healthy within the recovery timeout.
    """
    NOT_STARTED = "not_started"
    IN_PROGRESS = "in_progress"
    PARTIAL = "partial"
    COMPLETE = "complete"
    FAILED = "failed"


class SeverityLevel(Enum):
    """Severity level of chaos injection.

    Injectors scale intensity from the numeric value, typically as
    `severity.value / 4.0` (0.25 .. 1.0) or `severity.value / 2`.
    """
    LOW = 1
    MEDIUM = 2
    HIGH = 3
    CRITICAL = 4


# Default configurations
DEFAULT_CHAOS_DURATION = 30.0  # seconds a chaos experiment runs by default (ChaosConfig)
DEFAULT_RECOVERY_TIMEOUT = 60.0  # seconds RecoveryValidator waits for components to recover
DEFAULT_HEALTH_CHECK_INTERVAL = 5.0  # seconds between recovery health-check rounds
MAX_CONCURRENT_CHAOS = 3  # cap on simultaneous experiments — not enforced in this chunk; TODO confirm usage


# =============================================================================
# DATA CLASSES
# =============================================================================


@dataclass
class ChaosConfig:
    """Configuration for a chaos experiment.

    Passed to a ChaosInjector's inject(); `parameters` carries
    injector-specific knobs (e.g. "base_latency_ms", "loss_rate",
    "memory_mb", "cpu_threads", "cascade", "pattern").
    """
    chaos_type: ChaosType  # which failure mode to inject
    severity: SeverityLevel = SeverityLevel.MEDIUM  # scales injected intensity
    duration_seconds: float = DEFAULT_CHAOS_DURATION  # intended chaos duration
    target_components: List[str] = field(default_factory=list)  # empty => injector picks defaults / all components
    parameters: Dict[str, Any] = field(default_factory=dict)  # injector-specific options
    rollback_on_failure: bool = True  # NOTE(review): not consumed in this chunk; injectors roll back unconditionally on error
    collect_metrics: bool = True  # NOTE(review): not consumed in this chunk — verify against orchestrator
    random_seed: Optional[int] = None  # seed for reproducible chaos — not consumed in this chunk; TODO confirm


@dataclass
class ChaosResult:
    """Result of a chaos experiment."""
    experiment_id: str  # unique identifier for this experiment run
    chaos_type: ChaosType  # which failure mode was injected
    state: ChaosState  # current/terminal state of the experiment
    start_time: float  # when injection began (presumably time.time(); orchestrator not in this chunk)
    end_time: Optional[float] = None  # when the experiment finished, None while running
    duration_seconds: float = 0.0  # wall-clock length of the experiment
    affected_components: List[str] = field(default_factory=list)  # component names chaos touched
    errors: List[str] = field(default_factory=list)  # errors observed during the run
    metrics: Dict[str, Any] = field(default_factory=dict)  # injector-collected metrics
    recovery_result: Optional["RecoveryResult"] = None  # post-chaos recovery outcome, if validated

    @property
    def was_successful(self) -> bool:
        """Check if chaos was successfully injected and recovered.

        True only when the experiment ended cleanly (COMPLETED or
        ROLLED_BACK) and no errors were recorded.
        """
        return (
            self.state in (ChaosState.COMPLETED, ChaosState.ROLLED_BACK) and
            not self.errors
        )


@dataclass
class RecoveryResult:
    """Result of system recovery after chaos (built by RecoveryValidator)."""
    state: RecoveryState  # overall recovery outcome
    start_time: float  # time.time() when validation began
    end_time: Optional[float] = None  # time.time() when validation finished
    duration_seconds: float = 0.0  # wall-clock validation duration
    components_recovered: List[str] = field(default_factory=list)  # names that passed a health check
    components_failed: List[str] = field(default_factory=list)  # names that never recovered
    health_checks_passed: int = 0  # count of successful per-component checks
    health_checks_failed: int = 0  # count of failed per-component checks
    metrics: Dict[str, Any] = field(default_factory=dict)  # extra validation metrics

    @property
    def recovery_percentage(self) -> float:
        """Calculate percentage of components recovered.

        Returns 100.0 when no components were tracked at all (vacuously
        recovered).
        """
        total = len(self.components_recovered) + len(self.components_failed)
        if total == 0:
            return 100.0
        return (len(self.components_recovered) / total) * 100


@dataclass
class HealthCheckResult:
    """Result of a component health check (see MockComponent.health_check)."""
    component_name: str  # name of the checked component
    is_healthy: bool  # healthy AND not degraded at check time
    timestamp: float  # time.time() when the check ran
    latency_ms: float = 0.0  # time the check itself took, in milliseconds
    error: Optional[str] = None  # error message if the check raised
    metadata: Dict[str, Any] = field(default_factory=dict)  # optional extra details


@dataclass
class ResourceMetrics:
    """Metrics for resource usage during chaos.

    Populated from psutil when available; otherwise ResourceExhauster
    fills only memory_bytes and thread_count from its own bookkeeping.
    """
    timestamp: float  # time.time() of the sample
    cpu_percent: float = 0.0  # process CPU utilization (psutil)
    memory_percent: float = 0.0  # process memory as % of system (psutil)
    memory_bytes: int = 0  # RSS, or allocated-chunk total in the fallback path
    disk_io_read_bytes: int = 0  # not populated in this chunk
    disk_io_write_bytes: int = 0  # not populated in this chunk
    network_bytes_sent: int = 0  # not populated in this chunk
    network_bytes_recv: int = 0  # not populated in this chunk
    open_file_descriptors: int = 0  # not populated in this chunk
    thread_count: int = 0  # process thread count, or stress-thread count in fallback


# =============================================================================
# MOCK SYSTEM COMPONENTS
# =============================================================================


class MockComponent:
    """Mock AIVA component used as a chaos-injection target.

    Models a single service whose behavior the injectors manipulate:
    calls can be delayed (latency_ms), made to fail with a given
    probability (failure_rate), or fail outright (is_healthy False).
    Call/error counters are updated under a lock so the component can be
    driven from several threads at once.
    """

    def __init__(self, name: str, dependencies: Optional[List[str]] = None):
        self.name = name
        self.dependencies = dependencies if dependencies is not None else []
        self.is_healthy = True
        self.is_degraded = False
        self.latency_ms = 0.0
        self.failure_rate = 0.0
        self.call_count = 0
        self.error_count = 0
        self._lock = threading.Lock()

    async def call(self, *args, **kwargs) -> Dict[str, Any]:
        """Simulate one call; raises RuntimeError on an (injected) failure."""
        with self._lock:
            self.call_count += 1

        # Apply any injected delay first.
        if self.latency_ms > 0:
            await asyncio.sleep(self.latency_ms / 1000)

        # Fail outright when unhealthy; otherwise fail probabilistically
        # when a failure rate has been injected (short-circuit keeps the
        # RNG untouched for unhealthy components).
        if not self.is_healthy or random.random() < self.failure_rate:
            with self._lock:
                self.error_count += 1
            raise RuntimeError(f"Component {self.name} is unhealthy")

        return {"status": "success", "component": self.name}

    def health_check(self) -> HealthCheckResult:
        """Report current health as a HealthCheckResult."""
        started_at = time.time()
        try:
            healthy_now = self.is_healthy and not self.is_degraded
            elapsed_ms = (time.time() - started_at) * 1000
            return HealthCheckResult(
                component_name=self.name,
                is_healthy=healthy_now,
                timestamp=time.time(),
                latency_ms=elapsed_ms,
            )
        except Exception as e:
            # Defensive: report the failure instead of propagating it.
            return HealthCheckResult(
                component_name=self.name,
                is_healthy=False,
                timestamp=time.time(),
                error=str(e),
            )

    def inject_failure(self) -> None:
        """Make every subsequent call fail."""
        self.is_healthy = False

    def inject_degradation(self, latency_ms: float = 0.0, failure_rate: float = 0.0) -> None:
        """Slow down and/or probabilistically fail subsequent calls."""
        self.is_degraded = True
        self.latency_ms = latency_ms
        self.failure_rate = failure_rate

    def recover(self) -> None:
        """Reset all injected faults back to the healthy baseline."""
        self.is_healthy = True
        self.is_degraded = False
        self.latency_ms = 0.0
        self.failure_rate = 0.0

    def get_metrics(self) -> Dict[str, Any]:
        """Snapshot of component state and call statistics."""
        with self._lock:
            calls = self.call_count
            errors = self.error_count
            return {
                "name": self.name,
                "is_healthy": self.is_healthy,
                "is_degraded": self.is_degraded,
                "latency_ms": self.latency_ms,
                "failure_rate": self.failure_rate,
                "call_count": calls,
                "error_count": errors,
                "error_rate": errors / calls if calls > 0 else 0,
            }


class MockAIVAQueen:
    """Mock AIVA Queen orchestrator for chaos testing.

    Owns a fixed topology of MockComponent instances and routes simulated
    requests through them; exposes the health-check and metrics
    aggregation used by the chaos injectors and the recovery validator.
    """

    def __init__(self):
        self.components: Dict[str, MockComponent] = {}
        self.is_running = False
        self._lock = threading.Lock()
        self._initialize_components()

    def _initialize_components(self) -> None:
        """Build the mock component topology (name -> dependency names)."""
        # Core services, API/communication layer, external dependencies,
        # and the validation layer, with their dependency wiring.
        topology = (
            ("kernel", []),
            ("memory_cortex", ["kernel"]),
            ("skill_registry", ["kernel"]),
            ("learning_loop", ["memory_cortex"]),
            ("api_server", ["kernel", "skill_registry"]),
            ("event_bus", ["kernel"]),
            ("database", []),
            ("redis", []),
            ("vector_store", ["database"]),
            ("validator", ["kernel"]),
        )
        for comp_name, deps in topology:
            self.components[comp_name] = MockComponent(comp_name, deps)

    async def start(self) -> None:
        """Mark the mock system as running."""
        with self._lock:
            self.is_running = True
        logger.info("MockAIVAQueen started")

    async def stop(self) -> None:
        """Mark the mock system as stopped."""
        with self._lock:
            self.is_running = False
        logger.info("MockAIVAQueen stopped")

    def get_component(self, name: str) -> Optional[MockComponent]:
        """Look up a component by name; None if unknown."""
        return self.components.get(name)

    def get_all_components(self) -> List[MockComponent]:
        """All components, in registration order."""
        return [comp for comp in self.components.values()]

    def get_healthy_components(self) -> List[MockComponent]:
        """Components whose is_healthy flag is set."""
        return [comp for comp in self.components.values() if comp.is_healthy]

    def get_unhealthy_components(self) -> List[MockComponent]:
        """Components whose is_healthy flag is cleared."""
        return [comp for comp in self.components.values() if not comp.is_healthy]

    async def process_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """Route a request through the core call path.

        Raises:
            RuntimeError: if the system has not been started.
        """
        if not self.is_running:
            raise RuntimeError("AIVA Queen is not running")

        # Simulate the request flowing kernel-first, then skill registry.
        for comp_name in ("kernel", "skill_registry"):
            component = self.get_component(comp_name)
            if component is not None:
                await component.call()

        return {"status": "processed", "request_id": request.get("id", "unknown")}

    def full_health_check(self) -> Dict[str, HealthCheckResult]:
        """Run a health check on every component and collect the results."""
        return {name: comp.health_check() for name, comp in self.components.items()}

    def get_system_metrics(self) -> Dict[str, Any]:
        """Aggregate per-component metrics plus an overall health summary."""
        component_metrics = {}
        for name, comp in self.components.items():
            component_metrics[name] = comp.get_metrics()

        total_count = len(self.components)
        healthy_count = len(self.get_healthy_components())
        health_pct = (healthy_count / total_count * 100) if total_count > 0 else 0

        return {
            "is_running": self.is_running,
            "healthy_components": healthy_count,
            "total_components": total_count,
            "health_percentage": health_pct,
            "components": component_metrics,
        }


# =============================================================================
# CHAOS INJECTORS
# =============================================================================


class ChaosInjector(ABC):
    """Abstract base class for chaos injectors.

    Subclasses implement inject()/rollback(); this base provides the
    active-flag bookkeeping and a thread-safe metric sink. Every metric
    is stamped with the capture time and the injector's name.
    """

    def __init__(self, name: str):
        self.name = name
        self._is_active = False
        self._lock = threading.Lock()
        self._metrics: List[Dict[str, Any]] = []

    @property
    def is_active(self) -> bool:
        """Whether chaos is currently being applied by this injector."""
        with self._lock:
            active = self._is_active
        return active

    @abstractmethod
    async def inject(self, config: ChaosConfig, target: Any) -> None:
        """Inject chaos into the target."""

    @abstractmethod
    async def rollback(self, target: Any) -> None:
        """Rollback chaos injection."""

    def record_metric(self, metric: Dict[str, Any]) -> None:
        """Record a metric, stamped with timestamp and injector name."""
        entry = dict(metric)
        entry["timestamp"] = time.time()
        entry["injector"] = self.name
        with self._lock:
            self._metrics.append(entry)

    def get_metrics(self) -> List[Dict[str, Any]]:
        """Return a snapshot copy of the recorded metrics."""
        with self._lock:
            return self._metrics.copy()

    def clear_metrics(self) -> None:
        """Discard all recorded metrics."""
        with self._lock:
            del self._metrics[:]


class NetworkFaultInjector(ChaosInjector):
    """
    Injects network-related faults including:
    - Network partitions
    - Connection timeouts
    - Packet loss
    - Bandwidth throttling
    - DNS resolution failures

    rollback() restores each component's full pre-chaos state, including
    the `is_degraded` flag (previously the flag was unconditionally
    cleared, silently "repairing" components that were already degraded
    before the experiment — DependencyKiller already snapshots it).
    """

    def __init__(self):
        super().__init__("network_fault")
        # comp_name -> snapshot of pre-chaos state, restored by rollback().
        self._original_states: Dict[str, Dict[str, Any]] = {}
        self._affected_components: List[str] = []

    async def inject(self, config: ChaosConfig, target: MockAIVAQueen) -> None:
        """Inject a network fault into target components.

        Args:
            config: Chaos configuration; `parameters` may carry
                "base_latency_ms"/"jitter_ms" (NETWORK_LATENCY) or
                "loss_rate" (PACKET_LOSS). Empty target_components means
                all components.
            target: The mock system to disrupt.

        Raises:
            RuntimeError: if a network fault injection is already active.
        """
        with self._lock:
            if self._is_active:
                raise RuntimeError("Network fault injection already active")
            self._is_active = True

        logger.info(f"Injecting network fault: {config.chaos_type.value}")

        try:
            target_components = config.target_components or list(target.components.keys())
            severity_factor = config.severity.value / 4.0  # 0.25 to 1.0

            for comp_name in target_components:
                component = target.get_component(comp_name)
                if not component:
                    continue

                # Save original state — including the degraded flag — so
                # rollback() restores the component exactly as found.
                self._original_states[comp_name] = {
                    "is_healthy": component.is_healthy,
                    "is_degraded": component.is_degraded,
                    "latency_ms": component.latency_ms,
                    "failure_rate": component.failure_rate,
                }

                # Apply network chaos based on type
                if config.chaos_type == ChaosType.NETWORK_PARTITION:
                    # Complete network partition - component unreachable
                    component.inject_failure()

                elif config.chaos_type == ChaosType.NETWORK_LATENCY:
                    # Add latency based on severity
                    base_latency = config.parameters.get("base_latency_ms", 500)
                    jitter = config.parameters.get("jitter_ms", 100)
                    latency = base_latency * severity_factor + random.uniform(0, jitter)
                    component.inject_degradation(latency_ms=latency)

                elif config.chaos_type == ChaosType.PACKET_LOSS:
                    # Inject packet loss as failure rate
                    loss_rate = config.parameters.get("loss_rate", 0.1) * severity_factor
                    component.inject_degradation(failure_rate=loss_rate)

                elif config.chaos_type == ChaosType.DNS_FAILURE:
                    # DNS failure - only components with dependencies to
                    # resolve are affected
                    if component.dependencies:
                        component.inject_failure()

                self._affected_components.append(comp_name)
                self.record_metric({
                    "event": "fault_injected",
                    "component": comp_name,
                    "chaos_type": config.chaos_type.value,
                    "severity": config.severity.value,
                })

            logger.info(f"Network fault injected into {len(self._affected_components)} components")

        except Exception as e:
            # Best-effort cleanup of partially applied chaos, then re-raise.
            logger.error(f"Failed to inject network fault: {e}")
            await self.rollback(target)
            raise

    async def rollback(self, target: MockAIVAQueen) -> None:
        """Restore every affected component to its snapshotted state."""
        logger.info("Rolling back network fault injection")

        try:
            for comp_name, original_state in self._original_states.items():
                component = target.get_component(comp_name)
                if component:
                    component.is_healthy = original_state["is_healthy"]
                    component.latency_ms = original_state["latency_ms"]
                    component.failure_rate = original_state["failure_rate"]
                    # Restore (rather than clear) so a component that was
                    # degraded before this experiment stays degraded.
                    component.is_degraded = original_state["is_degraded"]

                    self.record_metric({
                        "event": "fault_rolled_back",
                        "component": comp_name,
                    })

        finally:
            # Always release the active flag and drop snapshots, even if a
            # component lookup failed mid-restore.
            with self._lock:
                self._is_active = False
                self._original_states.clear()
                self._affected_components.clear()

    def get_affected_components(self) -> List[str]:
        """Names of components currently affected by this injector."""
        with self._lock:
            return list(self._affected_components)


class LatencyInjector(ChaosInjector):
    """
    Injects latency into system operations including:
    - Fixed delay injection
    - Variable/jittery delays
    - Gradual latency increase (brownout)
    - Spike latency patterns

    rollback() restores each component's pre-chaos latency AND its
    `is_degraded` flag (previously the flag was unconditionally cleared,
    silently "repairing" components that were already degraded before
    the experiment).
    """

    def __init__(self):
        super().__init__("latency")
        # comp_name -> {"latency_ms": ..., "is_degraded": ...} snapshot
        # taken at inject time and restored by rollback().
        self._original_states: Dict[str, Dict[str, Any]] = {}
        # Reserved for time-varying patterns; cancelled on rollback if set.
        self._latency_task: Optional[asyncio.Task] = None

    async def inject(self, config: ChaosConfig, target: MockAIVAQueen) -> None:
        """Inject latency into target components.

        Args:
            config: `parameters` may carry "pattern" (fixed | jitter |
                spike | brownout), "base_latency_ms", "max_latency_ms";
                severity scales the fixed/jitter base. Empty
                target_components means all components.
            target: The mock system to slow down.

        Raises:
            RuntimeError: if latency injection is already active.
        """
        with self._lock:
            if self._is_active:
                raise RuntimeError("Latency injection already active")
            self._is_active = True

        logger.info(f"Injecting latency: severity={config.severity.name}")

        try:
            target_components = config.target_components or list(target.components.keys())

            latency_pattern = config.parameters.get("pattern", "fixed")
            base_latency_ms = config.parameters.get("base_latency_ms", 200)
            max_latency_ms = config.parameters.get("max_latency_ms", 2000)

            for comp_name in target_components:
                component = target.get_component(comp_name)
                if not component:
                    continue

                # Snapshot pre-chaos state so rollback restores it exactly.
                self._original_states[comp_name] = {
                    "latency_ms": component.latency_ms,
                    "is_degraded": component.is_degraded,
                }

                # Calculate latency based on pattern and severity
                if latency_pattern == "fixed":
                    latency = base_latency_ms * (config.severity.value / 2)

                elif latency_pattern == "jitter":
                    base = base_latency_ms * (config.severity.value / 2)
                    jitter = base * 0.5
                    latency = base + random.uniform(-jitter, jitter)

                elif latency_pattern == "spike":
                    # Random spikes: ~20% of components get max latency
                    if random.random() < 0.2:
                        latency = max_latency_ms
                    else:
                        latency = base_latency_ms

                elif latency_pattern == "brownout":
                    # Gradual increase (simulated; a real brownout would
                    # grow over the experiment's duration)
                    latency = base_latency_ms + (max_latency_ms - base_latency_ms) * 0.5
                else:
                    # Unknown pattern name falls back to the base latency
                    latency = base_latency_ms

                component.latency_ms = latency
                component.is_degraded = True

                self.record_metric({
                    "event": "latency_injected",
                    "component": comp_name,
                    "latency_ms": latency,
                    "pattern": latency_pattern,
                })

            logger.info(f"Latency injected into {len(target_components)} components")

        except Exception as e:
            # Best-effort cleanup of partially applied chaos, then re-raise.
            logger.error(f"Failed to inject latency: {e}")
            await self.rollback(target)
            raise

    async def rollback(self, target: MockAIVAQueen) -> None:
        """Restore each component's saved latency and degraded flag."""
        logger.info("Rolling back latency injection")

        try:
            if self._latency_task and not self._latency_task.done():
                self._latency_task.cancel()

            for comp_name, original_state in self._original_states.items():
                component = target.get_component(comp_name)
                if component:
                    component.latency_ms = original_state["latency_ms"]
                    # Restore (rather than clear) so a component that was
                    # degraded before this experiment stays degraded.
                    component.is_degraded = original_state["is_degraded"]

                    self.record_metric({
                        "event": "latency_rolled_back",
                        "component": comp_name,
                    })

        finally:
            # Always release the active flag and drop snapshots.
            with self._lock:
                self._is_active = False
                self._original_states.clear()
                self._latency_task = None


class ResourceExhauster(ChaosInjector):
    """
    Simulates resource exhaustion scenarios including:
    - Memory pressure
    - CPU stress
    - File descriptor exhaustion
    - Thread pool exhaustion
    - Connection pool exhaustion

    NOTE(review): only memory pressure and CPU stress are implemented in
    inject(); the other scenarios listed above have no code path in this
    chunk.
    """

    def __init__(self):
        super().__init__("resource_exhaustion")
        # Live bytearray chunks held to keep memory allocated until rollback.
        self._allocated_memory: List[bytearray] = []
        # Daemon threads spinning in cpu_stress_worker until _stop_event is set.
        self._stress_threads: List[threading.Thread] = []
        self._stop_event = threading.Event()
        # Rolling window of the 100 most recent resource samples.
        self._resource_metrics: Deque[ResourceMetrics] = deque(maxlen=100)

    async def inject(self, config: ChaosConfig, target: MockAIVAQueen) -> None:
        """Inject resource exhaustion.

        Dispatches on config.chaos_type: MEMORY_PRESSURE allocates real
        memory, CPU_STRESS spawns busy threads, RESOURCE_EXHAUSTION does
        both. Rolls back and re-raises on any injection error.

        Raises:
            RuntimeError: if resource exhaustion is already active.
        """
        with self._lock:
            if self._is_active:
                raise RuntimeError("Resource exhaustion already active")
            self._is_active = True
            # Cleared under the lock so freshly spawned workers don't see a
            # stale stop signal from a previous run.
            self._stop_event.clear()

        logger.info(f"Injecting resource exhaustion: {config.chaos_type.value}")

        try:
            if config.chaos_type == ChaosType.MEMORY_PRESSURE:
                await self._inject_memory_pressure(config)

            elif config.chaos_type == ChaosType.CPU_STRESS:
                await self._inject_cpu_stress(config)

            elif config.chaos_type == ChaosType.RESOURCE_EXHAUSTION:
                # Combined resource exhaustion
                await self._inject_memory_pressure(config)
                await self._inject_cpu_stress(config)

            # Record baseline metrics
            self._record_resource_metrics()

        except Exception as e:
            logger.error(f"Failed to inject resource exhaustion: {e}")
            await self.rollback(target)
            raise

    async def _inject_memory_pressure(self, config: ChaosConfig) -> None:
        """Inject memory pressure by allocating memory.

        Allocates `memory_mb * severity_factor` MB in 10MB chunks, touching
        each page so the OS actually commits it. Stops early if the stop
        event fires or allocation hits a MemoryError.
        """
        memory_mb = config.parameters.get("memory_mb", 100)
        severity_factor = config.severity.value / 4.0
        target_mb = int(memory_mb * severity_factor)

        logger.info(f"Allocating {target_mb}MB of memory")

        # Allocate memory in chunks
        chunk_size = 10 * 1024 * 1024  # 10MB chunks
        chunks_needed = target_mb // 10 + 1

        for i in range(chunks_needed):
            if self._stop_event.is_set():
                break
            try:
                # Remaining-bytes math assumes every previously appended
                # chunk was full-size; holds while _allocated_memory starts
                # empty (the is_active guard prevents concurrent injects).
                chunk = bytearray(min(chunk_size, (target_mb * 1024 * 1024) - len(self._allocated_memory) * chunk_size))
                # Touch the memory to ensure it's actually allocated
                for j in range(0, len(chunk), 4096):
                    chunk[j] = 0xFF
                self._allocated_memory.append(chunk)
                self.record_metric({
                    "event": "memory_allocated",
                    "chunk_mb": len(chunk) / (1024 * 1024),
                    "total_allocated_mb": len(self._allocated_memory) * chunk_size / (1024 * 1024),
                })
            except MemoryError:
                # Best-effort: keep what was allocated so far.
                logger.warning("Memory allocation limit reached")
                break

    async def _inject_cpu_stress(self, config: ChaosConfig) -> None:
        """Inject CPU stress by spawning worker threads.

        Spawns `cpu_threads * severity_factor` (minimum one) daemon threads
        that busy-loop until the stop event is set by rollback().
        """
        num_threads = config.parameters.get("cpu_threads", 2)
        severity_factor = config.severity.value / 4.0
        target_threads = max(1, int(num_threads * severity_factor))

        logger.info(f"Starting {target_threads} CPU stress threads")

        def cpu_stress_worker():
            """Worker function that burns CPU."""
            while not self._stop_event.is_set():
                # Busy loop to consume CPU
                _ = sum(i * i for i in range(10000))
                # Brief sleep to allow other threads
                time.sleep(0.001)

        for i in range(target_threads):
            # daemon=True so stuck workers can't block interpreter shutdown.
            thread = threading.Thread(target=cpu_stress_worker, daemon=True)
            thread.start()
            self._stress_threads.append(thread)

            self.record_metric({
                "event": "cpu_stress_thread_started",
                "thread_id": i,
                "total_threads": len(self._stress_threads),
            })

    def _record_resource_metrics(self) -> None:
        """Record current resource usage metrics.

        Uses psutil when installed; otherwise falls back to this injector's
        own bookkeeping (allocated bytes and stress-thread count).
        """
        try:
            import psutil
            process = psutil.Process()

            metrics = ResourceMetrics(
                timestamp=time.time(),
                cpu_percent=process.cpu_percent(),
                memory_percent=process.memory_percent(),
                memory_bytes=process.memory_info().rss,
                thread_count=process.num_threads(),
            )
            self._resource_metrics.append(metrics)
        except ImportError:
            # psutil not available, use mock metrics
            metrics = ResourceMetrics(
                timestamp=time.time(),
                memory_bytes=sum(len(chunk) for chunk in self._allocated_memory),
                thread_count=len(self._stress_threads),
            )
            self._resource_metrics.append(metrics)

    async def rollback(self, target: MockAIVAQueen) -> None:
        """Rollback resource exhaustion.

        Signals workers to stop, joins them (5s timeout each), then drops
        all allocated chunks and forces a GC pass to release the memory.
        """
        logger.info("Rolling back resource exhaustion")

        try:
            # Stop stress threads — set the event BEFORE joining so every
            # worker's loop condition turns false.
            self._stop_event.set()
            for thread in self._stress_threads:
                thread.join(timeout=5.0)

            # Free allocated memory
            self._allocated_memory.clear()
            gc.collect()

            self.record_metric({
                "event": "resources_freed",
                "threads_stopped": len(self._stress_threads),
            })

        finally:
            # Always release the active flag, even if a join timed out.
            with self._lock:
                self._is_active = False
                self._stress_threads.clear()

    def get_resource_metrics(self) -> List[ResourceMetrics]:
        """Get recorded resource metrics.

        NOTE(review): reads the deque without taking _lock — relies on
        CPython's thread-safe deque append/iteration; confirm if porting.
        """
        return list(self._resource_metrics)


class DependencyKiller(ChaosInjector):
    """
    Simulates dependency service failures including:
    - Complete dependency outage
    - Partial dependency degradation
    - Dependency timeout
    - Cascading failures through dependency chain

    NOTE(review): the cascade propagates one hop only — direct dependents
    of a killed dependency are failed/degraded, but their own dependents
    are not revisited (no recursion in _cascade_failure).
    """

    def __init__(self):
        super().__init__("dependency_killer")
        # comp_name -> snapshot of pre-kill state, restored by rollback().
        self._killed_dependencies: Dict[str, Dict[str, Any]] = {}
        # Invocation counter guarding against runaway cascades.
        self._cascade_depth = 0

    async def inject(self, config: ChaosConfig, target: MockAIVAQueen) -> None:
        """Kill specified dependencies.

        Targets config.target_components, defaulting to the external
        dependencies ("database", "redis", "vector_store"). When the
        "cascade" parameter is truthy, direct dependents of each killed
        dependency are failed or degraded as well.

        Raises:
            RuntimeError: if dependency killing is already active.
        """
        with self._lock:
            if self._is_active:
                raise RuntimeError("Dependency killing already active")
            self._is_active = True

        logger.info(f"Killing dependencies: {config.target_components}")

        try:
            target_deps = config.target_components or ["database", "redis", "vector_store"]
            cascade = config.parameters.get("cascade", False)

            for dep_name in target_deps:
                component = target.get_component(dep_name)
                if not component:
                    continue

                # Save original state
                self._killed_dependencies[dep_name] = {
                    "is_healthy": component.is_healthy,
                    "is_degraded": component.is_degraded,
                    "latency_ms": component.latency_ms,
                    "failure_rate": component.failure_rate,
                }

                # Kill the dependency
                component.inject_failure()

                self.record_metric({
                    "event": "dependency_killed",
                    "dependency": dep_name,
                })

                # Handle cascading failures
                if cascade:
                    await self._cascade_failure(target, dep_name)

            logger.info(f"Killed {len(self._killed_dependencies)} dependencies")

        except Exception as e:
            # Best-effort cleanup of partially applied chaos, then re-raise.
            logger.error(f"Failed to kill dependencies: {e}")
            await self.rollback(target)
            raise

    async def _cascade_failure(self, target: MockAIVAQueen, failed_dep: str) -> None:
        """Propagate failure to direct dependents of `failed_dep`.

        Each affected dependent is snapshotted into _killed_dependencies so
        rollback() restores it too. The depth counter increments once per
        call (i.e. per killed dependency) and caps total cascade work.
        """
        self._cascade_depth += 1

        if self._cascade_depth > 5:  # Prevent infinite cascade
            return

        for comp in target.get_all_components():
            # Skip components already snapshotted so their ORIGINAL state
            # (not a mid-chaos state) is what rollback restores.
            if failed_dep in comp.dependencies and comp.name not in self._killed_dependencies:
                # Component depends on failed dependency
                self._killed_dependencies[comp.name] = {
                    "is_healthy": comp.is_healthy,
                    "is_degraded": comp.is_degraded,
                    "latency_ms": comp.latency_ms,
                    "failure_rate": comp.failure_rate,
                }

                # Degrade or kill based on criticality — 50/50 random choice
                if random.random() < 0.5:
                    comp.inject_failure()
                else:
                    comp.inject_degradation(failure_rate=0.5)

                self.record_metric({
                    "event": "cascade_failure",
                    "component": comp.name,
                    "caused_by": failed_dep,
                    "depth": self._cascade_depth,
                })

    async def rollback(self, target: MockAIVAQueen) -> None:
        """Restore all killed/cascaded components to their snapshots."""
        logger.info("Rolling back dependency kills")

        try:
            for dep_name, original_state in self._killed_dependencies.items():
                component = target.get_component(dep_name)
                if component:
                    component.is_healthy = original_state["is_healthy"]
                    component.is_degraded = original_state["is_degraded"]
                    component.latency_ms = original_state["latency_ms"]
                    component.failure_rate = original_state["failure_rate"]

                    self.record_metric({
                        "event": "dependency_restored",
                        "dependency": dep_name,
                    })

        finally:
            # Always release the active flag and reset cascade bookkeeping.
            with self._lock:
                self._is_active = False
                self._killed_dependencies.clear()
                self._cascade_depth = 0

    def get_killed_dependencies(self) -> List[str]:
        """Names of dependencies (and cascaded components) currently killed."""
        with self._lock:
            return list(self._killed_dependencies.keys())


# =============================================================================
# RECOVERY VALIDATOR
# =============================================================================


class RecoveryValidator:
    """
    Validates system recovery after chaos injection.

    Performs comprehensive health checks, monitors recovery progress,
    and validates that all components return to healthy state.
    """

    def __init__(
        self,
        health_check_interval: float = DEFAULT_HEALTH_CHECK_INTERVAL,
        recovery_timeout: float = DEFAULT_RECOVERY_TIMEOUT,
    ):
        """
        Args:
            health_check_interval: Seconds between health-check polling rounds.
            recovery_timeout: Maximum seconds to wait for components to recover.
        """
        self._health_check_interval = health_check_interval
        self._recovery_timeout = recovery_timeout
        # Bounded rolling window of health snapshots; maxlen keeps memory
        # flat during long monitoring sessions.
        self._health_history: Deque[Dict[str, HealthCheckResult]] = deque(maxlen=100)

    async def validate_recovery(
        self,
        target: MockAIVAQueen,
        expected_components: Optional[List[str]] = None,
    ) -> RecoveryResult:
        """
        Validate that system recovers to healthy state.

        Polls ``target.full_health_check()`` every ``health_check_interval``
        seconds until all expected components report healthy or
        ``recovery_timeout`` elapses.  A component counts as recovered the
        first time it passes a check and is not re-examined afterwards.

        Args:
            target: The AIVA Queen instance to validate
            expected_components: Components expected to recover

        Returns:
            RecoveryResult with detailed recovery information
        """
        result = RecoveryResult(
            state=RecoveryState.IN_PROGRESS,
            start_time=time.time(),
        )

        # NOTE(review): an explicitly empty `expected_components` list is
        # falsy and therefore falls back to *all* components — confirm that
        # is the intended behavior for callers passing [].
        expected = expected_components or list(target.components.keys())
        remaining = set(expected)

        logger.info(f"Starting recovery validation for {len(expected)} components")

        start_time = time.time()

        while remaining and (time.time() - start_time) < self._recovery_timeout:
            # Perform health checks
            health_results = target.full_health_check()
            self._health_history.append(health_results)

            # Iterate a copy so entries can be removed from the set in-place.
            for comp_name in list(remaining):
                check_result = health_results.get(comp_name)
                if check_result and check_result.is_healthy:
                    remaining.remove(comp_name)
                    result.components_recovered.append(comp_name)
                    result.health_checks_passed += 1
                    logger.info(f"Component {comp_name} recovered")
                else:
                    # Counted once per still-unhealthy component per round.
                    result.health_checks_failed += 1

            if not remaining:
                break

            await asyncio.sleep(self._health_check_interval)

        # Finalize result
        result.end_time = time.time()
        result.duration_seconds = result.end_time - result.start_time
        result.components_failed = list(remaining)

        # Classify the outcome: all recovered, some recovered, or none.
        if not remaining:
            result.state = RecoveryState.COMPLETE
            logger.info("Full recovery validated successfully")
        elif len(remaining) < len(expected):
            result.state = RecoveryState.PARTIAL
            logger.warning(f"Partial recovery: {len(remaining)} components still unhealthy")
        else:
            result.state = RecoveryState.FAILED
            logger.error(f"Recovery failed: {len(remaining)} components unhealthy")

        result.metrics = {
            "total_expected": len(expected),
            "recovered": len(result.components_recovered),
            "failed": len(result.components_failed),
            "recovery_percentage": result.recovery_percentage,
            "total_health_checks": result.health_checks_passed + result.health_checks_failed,
        }

        return result

    async def monitor_health_during_chaos(
        self,
        target: MockAIVAQueen,
        duration_seconds: float,
    ) -> List[Dict[str, HealthCheckResult]]:
        """Monitor system health during chaos injection.

        Polls every ``health_check_interval`` seconds for roughly
        ``duration_seconds`` and returns the collected snapshots.  Unlike
        ``validate_recovery``, snapshots are NOT appended to the shared
        health history.
        """
        results = []
        start_time = time.time()

        while (time.time() - start_time) < duration_seconds:
            health_results = target.full_health_check()
            results.append(health_results)
            await asyncio.sleep(self._health_check_interval)

        return results

    def get_health_history(self) -> List[Dict[str, HealthCheckResult]]:
        """Get recorded health check history (oldest first; returns a copy)."""
        return list(self._health_history)

    def calculate_availability(self) -> Dict[str, float]:
        """Calculate component availability from health history.

        Returns:
            Mapping of component name to the fraction (0.0-1.0) of recorded
            snapshots in which that component reported healthy; empty when
            no history has been recorded.
        """
        if not self._health_history:
            return {}

        # component name -> {"healthy": pass count, "total": snapshot count}
        component_stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {"healthy": 0, "total": 0})

        for health_snapshot in self._health_history:
            for comp_name, result in health_snapshot.items():
                component_stats[comp_name]["total"] += 1
                if result.is_healthy:
                    component_stats[comp_name]["healthy"] += 1

        return {
            comp_name: stats["healthy"] / stats["total"] if stats["total"] > 0 else 0
            for comp_name, stats in component_stats.items()
        }

# =============================================================================
# CHAOS ORCHESTRATOR
# =============================================================================


class ChaosOrchestrator:
    """
    Central orchestrator for chaos engineering experiments.

    Coordinates multiple chaos injectors, manages experiment lifecycle,
    and ensures proper cleanup and recovery validation.
    """

    def __init__(
        self,
        target: MockAIVAQueen,
        recovery_validator: Optional[RecoveryValidator] = None,
    ):
        """
        Args:
            target: The system under test.
            recovery_validator: Validator used after each experiment; a
                default-configured one is created when omitted.
        """
        self.target = target
        self.recovery_validator = recovery_validator or RecoveryValidator()

        # Map each chaos type to an injector.  Several types share an
        # injector class but get independent instances so their active-state
        # tracking never collides.
        self.injectors: Dict[ChaosType, ChaosInjector] = {
            ChaosType.NETWORK_PARTITION: NetworkFaultInjector(),
            ChaosType.NETWORK_LATENCY: LatencyInjector(),
            ChaosType.PACKET_LOSS: NetworkFaultInjector(),
            ChaosType.DNS_FAILURE: NetworkFaultInjector(),
            ChaosType.MEMORY_PRESSURE: ResourceExhauster(),
            ChaosType.CPU_STRESS: ResourceExhauster(),
            ChaosType.RESOURCE_EXHAUSTION: ResourceExhauster(),
            ChaosType.DEPENDENCY_FAILURE: DependencyKiller(),
            ChaosType.CASCADING_FAILURE: DependencyKiller(),
        }

        self._active_experiments: Dict[str, ChaosResult] = {}
        self._completed_experiments: List[ChaosResult] = []
        self._lock = threading.Lock()

    async def run_experiment(
        self,
        config: ChaosConfig,
        validate_recovery: bool = True,
    ) -> ChaosResult:
        """
        Run a single chaos experiment end to end.

        Lifecycle: register -> inject -> hold for the configured duration ->
        collect metrics -> rollback -> (optionally) validate recovery ->
        finalize and archive the result.

        Args:
            config: Configuration for the chaos experiment
            validate_recovery: Whether to validate recovery after chaos

        Returns:
            ChaosResult with experiment details

        Raises:
            RuntimeError: If MAX_CONCURRENT_CHAOS experiments are already
                active.
        """
        experiment_id = str(uuid.uuid4())[:8]

        result = ChaosResult(
            experiment_id=experiment_id,
            chaos_type=config.chaos_type,
            state=ChaosState.PENDING,
            start_time=time.time(),
        )

        with self._lock:
            if len(self._active_experiments) >= MAX_CONCURRENT_CHAOS:
                raise RuntimeError(f"Maximum concurrent chaos experiments ({MAX_CONCURRENT_CHAOS}) exceeded")
            self._active_experiments[experiment_id] = result

        logger.info(f"Starting chaos experiment {experiment_id}: {config.chaos_type.value}")

        injector = self.injectors.get(config.chaos_type)

        try:
            # BUG FIX: the unknown-chaos-type path previously returned before
            # any cleanup ran, leaking the entry in _active_experiments
            # (permanently consuming a concurrency slot) and never setting
            # end_time or archiving the result.  Handling it inside the
            # try/finally guarantees finalization on every exit path.
            if injector is None:
                result.state = ChaosState.FAILED
                result.errors.append(f"No injector for chaos type: {config.chaos_type}")
                return result

            # Inject chaos
            result.state = ChaosState.RUNNING
            await injector.inject(config, self.target)

            # Hold the fault for the configured duration
            await asyncio.sleep(config.duration_seconds)

            # Collect metrics during chaos
            if hasattr(injector, "get_affected_components"):
                result.affected_components = injector.get_affected_components()
            result.metrics["injector_metrics"] = injector.get_metrics()

            # Rollback chaos
            await injector.rollback(self.target)
            result.state = ChaosState.ROLLED_BACK

            # Validate recovery
            if validate_recovery:
                recovery_result = await self.recovery_validator.validate_recovery(
                    self.target,
                    result.affected_components or None,
                )
                result.recovery_result = recovery_result

                if recovery_result.state == RecoveryState.COMPLETE:
                    result.state = ChaosState.COMPLETED
                else:
                    result.errors.append(f"Recovery incomplete: {recovery_result.components_failed}")

        except Exception as e:
            logger.error(f"Chaos experiment failed: {e}")
            result.state = ChaosState.FAILED
            result.errors.append(str(e))
            traceback.print_exc()

            # Attempt rollback on failure (injector is non-None here: the
            # None case returns before any exception can be raised).
            if config.rollback_on_failure:
                try:
                    await injector.rollback(self.target)
                except Exception as rollback_error:
                    result.errors.append(f"Rollback failed: {rollback_error}")

        finally:
            # Finalize and archive on every exit path.
            result.end_time = time.time()
            result.duration_seconds = result.end_time - result.start_time

            with self._lock:
                del self._active_experiments[experiment_id]
                self._completed_experiments.append(result)

            logger.info(
                f"Chaos experiment {experiment_id} completed: "
                f"state={result.state.value}, duration={result.duration_seconds:.2f}s"
            )

        return result

    async def run_scenario(
        self,
        configs: List[ChaosConfig],
        sequential: bool = True,
    ) -> List[ChaosResult]:
        """
        Run a scenario of multiple chaos experiments.

        Args:
            configs: List of chaos configurations
            sequential: If True, run sequentially; otherwise run in parallel

        Returns:
            List of ChaosResult, one per config, in config order
        """
        results = []

        if sequential:
            for config in configs:
                result = await self.run_experiment(config)
                results.append(result)
        else:
            tasks = [self.run_experiment(config) for config in configs]
            raw = await asyncio.gather(*tasks, return_exceptions=True)
            # Convert exceptions to failed results so callers always get one
            # ChaosResult per config.  FIX: pair each outcome with its
            # originating config (gather preserves order) so failed entries
            # carry the real chaos_type instead of a hard-coded placeholder.
            results = [
                r if isinstance(r, ChaosResult) else ChaosResult(
                    experiment_id="error",
                    chaos_type=cfg.chaos_type,
                    state=ChaosState.FAILED,
                    start_time=time.time(),
                    errors=[str(r)],
                )
                for cfg, r in zip(configs, raw)
            ]

        return results

    def get_active_experiments(self) -> Dict[str, ChaosResult]:
        """Get a snapshot copy of currently active experiments."""
        with self._lock:
            return dict(self._active_experiments)

    def get_completed_experiments(self) -> List[ChaosResult]:
        """Get a snapshot copy of completed experiments."""
        with self._lock:
            return list(self._completed_experiments)

    def generate_report(self) -> Dict[str, Any]:
        """Generate a summary report of all completed experiments.

        Returns:
            Aggregate statistics (counts, success rate, durations, per-type
            breakdown, recovery outcomes), or a placeholder message when no
            experiments have completed.
        """
        with self._lock:
            experiments = list(self._completed_experiments)

        if not experiments:
            return {"message": "No experiments completed"}

        successful = [e for e in experiments if e.was_successful]
        failed = [e for e in experiments if not e.was_successful]

        total_duration = sum(e.duration_seconds for e in experiments)

        # Per-chaos-type tallies.
        chaos_type_stats = defaultdict(lambda: {"count": 0, "success": 0, "failed": 0})
        for exp in experiments:
            stats = chaos_type_stats[exp.chaos_type.value]
            stats["count"] += 1
            if exp.was_successful:
                stats["success"] += 1
            else:
                stats["failed"] += 1

        def _count_recoveries(state: RecoveryState) -> int:
            # Experiments run without recovery validation are excluded.
            return sum(
                1 for e in experiments
                if e.recovery_result and e.recovery_result.state == state
            )

        return {
            "total_experiments": len(experiments),
            "successful": len(successful),
            "failed": len(failed),
            "success_rate": len(successful) / len(experiments) * 100 if experiments else 0,
            "total_duration_seconds": total_duration,
            "average_duration_seconds": total_duration / len(experiments) if experiments else 0,
            "chaos_type_breakdown": dict(chaos_type_stats),
            "recovery_stats": {
                "full_recovery": _count_recoveries(RecoveryState.COMPLETE),
                "partial_recovery": _count_recoveries(RecoveryState.PARTIAL),
                "failed_recovery": _count_recoveries(RecoveryState.FAILED),
            },
        }


# =============================================================================
# PYTEST FIXTURES
# =============================================================================


@pytest.fixture
def mock_aiva_queen():
    """Fresh MockAIVAQueen instance per test."""
    yield MockAIVAQueen()


@pytest.fixture
async def running_aiva_queen(mock_aiva_queen):
    """Yield a started AIVA Queen and stop it again during teardown.

    NOTE(review): with pytest-asyncio in strict mode this would need
    ``@pytest_asyncio.fixture`` — confirm the project's asyncio_mode.
    """
    queen = mock_aiva_queen
    await queen.start()
    yield queen
    await queen.stop()


@pytest.fixture
def chaos_orchestrator(mock_aiva_queen):
    """Chaos orchestrator wired to the mock queen."""
    orchestrator = ChaosOrchestrator(mock_aiva_queen)
    return orchestrator


@pytest.fixture
def recovery_validator():
    """Recovery validator tuned for fast test runs."""
    return RecoveryValidator(health_check_interval=0.5, recovery_timeout=10.0)


@pytest.fixture
def network_fault_injector():
    """Fresh NetworkFaultInjector per test."""
    injector = NetworkFaultInjector()
    return injector


@pytest.fixture
def latency_injector():
    """Fresh LatencyInjector per test."""
    injector = LatencyInjector()
    return injector


@pytest.fixture
def resource_exhauster():
    """Fresh ResourceExhauster per test."""
    exhauster = ResourceExhauster()
    return exhauster


@pytest.fixture
def dependency_killer():
    """Fresh DependencyKiller per test."""
    killer = DependencyKiller()
    return killer


# =============================================================================
# TEST CASES
# =============================================================================


class TestChaosOrchestrator:
    """Tests for the ChaosOrchestrator."""

    @pytest.mark.asyncio
    async def test_orchestrator_initialization(self, mock_aiva_queen):
        """Test chaos orchestrator initializes correctly."""
        orchestrator = ChaosOrchestrator(mock_aiva_queen)

        assert orchestrator.target == mock_aiva_queen
        assert len(orchestrator.injectors) > 0
        assert orchestrator.recovery_validator is not None

    @pytest.mark.asyncio
    async def test_run_simple_experiment(self, chaos_orchestrator):
        """Test running a simple chaos experiment."""
        config = ChaosConfig(
            chaos_type=ChaosType.NETWORK_LATENCY,
            severity=SeverityLevel.LOW,
            duration_seconds=1.0,
            parameters={"base_latency_ms": 100},
        )

        result = await chaos_orchestrator.run_experiment(config)

        assert result.experiment_id is not None
        assert result.chaos_type == ChaosType.NETWORK_LATENCY
        # COMPLETED when recovery validation passed, ROLLED_BACK otherwise.
        assert result.state in (ChaosState.COMPLETED, ChaosState.ROLLED_BACK)
        # The experiment must at least span its configured hold time.
        assert result.duration_seconds >= 1.0

    @pytest.mark.asyncio
    async def test_run_experiment_with_recovery_validation(self, chaos_orchestrator):
        """Test experiment with recovery validation."""
        config = ChaosConfig(
            chaos_type=ChaosType.DEPENDENCY_FAILURE,
            severity=SeverityLevel.MEDIUM,
            duration_seconds=1.0,
            target_components=["database"],
        )

        result = await chaos_orchestrator.run_experiment(config, validate_recovery=True)

        assert result.recovery_result is not None
        assert result.recovery_result.state in (
            RecoveryState.COMPLETE,
            RecoveryState.PARTIAL,
            RecoveryState.FAILED,
        )

    @pytest.mark.asyncio
    async def test_run_scenario_sequential(self, chaos_orchestrator):
        """Test running a sequential chaos scenario."""
        configs = [
            ChaosConfig(
                chaos_type=ChaosType.NETWORK_LATENCY,
                duration_seconds=0.5,
            ),
            ChaosConfig(
                chaos_type=ChaosType.DEPENDENCY_FAILURE,
                duration_seconds=0.5,
                target_components=["redis"],
            ),
        ]

        results = await chaos_orchestrator.run_scenario(configs, sequential=True)

        assert len(results) == 2
        assert all(r.experiment_id is not None for r in results)

    @pytest.mark.asyncio
    async def test_experiment_rollback_on_failure(self, mock_aiva_queen):
        """Test that the system is healthy again after an experiment ends."""
        # FIX: the old comment claimed this used "a mock that will fail
        # during injection", but no failure is set up — the test actually
        # verifies the normal inject/rollback cycle restores full health.
        orchestrator = ChaosOrchestrator(mock_aiva_queen)

        config = ChaosConfig(
            chaos_type=ChaosType.NETWORK_PARTITION,
            duration_seconds=0.5,
            rollback_on_failure=True,
        )

        result = await orchestrator.run_experiment(config)

        # FIX: `result` was previously assigned but never asserted on; the
        # experiment must have left the RUNNING state by the time it returns.
        assert result.state != ChaosState.RUNNING

        # Verify system recovered
        health = mock_aiva_queen.full_health_check()
        healthy_count = sum(1 for h in health.values() if h.is_healthy)
        assert healthy_count == len(mock_aiva_queen.components)

    @pytest.mark.asyncio
    async def test_generate_report(self, chaos_orchestrator):
        """Test report generation after experiments."""
        configs = [
            ChaosConfig(chaos_type=ChaosType.NETWORK_LATENCY, duration_seconds=0.5),
            ChaosConfig(chaos_type=ChaosType.DEPENDENCY_FAILURE, duration_seconds=0.5),
        ]

        await chaos_orchestrator.run_scenario(configs, sequential=True)

        report = chaos_orchestrator.generate_report()

        assert report["total_experiments"] == 2
        assert "success_rate" in report
        assert "chaos_type_breakdown" in report


class TestNetworkFaultInjector:
    """Tests for NetworkFaultInjector."""

    @pytest.mark.asyncio
    async def test_inject_network_partition(self, mock_aiva_queen, network_fault_injector):
        """Partitioned components report unhealthy until rollback."""
        config = ChaosConfig(
            chaos_type=ChaosType.NETWORK_PARTITION,
            severity=SeverityLevel.HIGH,
            target_components=["database", "redis"],
        )

        await network_fault_injector.inject(config, mock_aiva_queen)
        assert network_fault_injector.is_active

        # Both targeted components should now fail their health flag.
        for name in ("database", "redis"):
            assert not mock_aiva_queen.get_component(name).is_healthy

        await network_fault_injector.rollback(mock_aiva_queen)
        assert not network_fault_injector.is_active

    @pytest.mark.asyncio
    async def test_inject_network_latency(self, mock_aiva_queen, network_fault_injector):
        """Latency injection degrades components; rollback clears it."""
        config = ChaosConfig(
            chaos_type=ChaosType.NETWORK_LATENCY,
            severity=SeverityLevel.MEDIUM,
            parameters={"base_latency_ms": 500},
        )

        await network_fault_injector.inject(config, mock_aiva_queen)

        kernel = mock_aiva_queen.get_component("kernel")
        assert kernel.is_degraded
        assert kernel.latency_ms > 0

        await network_fault_injector.rollback(mock_aiva_queen)
        assert kernel.latency_ms == 0

    @pytest.mark.asyncio
    async def test_inject_packet_loss(self, mock_aiva_queen, network_fault_injector):
        """Packet loss raises the failure rate; rollback resets it."""
        config = ChaosConfig(
            chaos_type=ChaosType.PACKET_LOSS,
            severity=SeverityLevel.MEDIUM,
            parameters={"loss_rate": 0.3},
            target_components=["api_server"],
        )

        await network_fault_injector.inject(config, mock_aiva_queen)

        api_server = mock_aiva_queen.get_component("api_server")
        assert api_server.failure_rate > 0

        await network_fault_injector.rollback(mock_aiva_queen)
        assert api_server.failure_rate == 0

    @pytest.mark.asyncio
    async def test_double_injection_fails(self, mock_aiva_queen, network_fault_injector):
        """Injecting while already active raises RuntimeError."""
        config = ChaosConfig(chaos_type=ChaosType.NETWORK_PARTITION)
        await network_fault_injector.inject(config, mock_aiva_queen)

        with pytest.raises(RuntimeError, match="already active"):
            await network_fault_injector.inject(config, mock_aiva_queen)

        # Clean up so later tests see a quiescent injector.
        await network_fault_injector.rollback(mock_aiva_queen)


class TestLatencyInjector:
    """Tests for LatencyInjector."""

    @pytest.mark.asyncio
    async def test_inject_fixed_latency(self, mock_aiva_queen, latency_injector):
        """Test fixed latency pattern injection."""
        config = ChaosConfig(
            chaos_type=ChaosType.NETWORK_LATENCY,
            severity=SeverityLevel.MEDIUM,
            parameters={"pattern": "fixed", "base_latency_ms": 300},
            target_components=["kernel", "memory_cortex"],
        )

        await latency_injector.inject(config, mock_aiva_queen)

        kernel = mock_aiva_queen.get_component("kernel")
        memory = mock_aiva_queen.get_component("memory_cortex")

        # Only asserts that *some* latency was applied; the injector may
        # scale base_latency_ms by severity, so the exact value is not pinned.
        assert kernel.latency_ms > 0
        assert memory.latency_ms > 0

        await latency_injector.rollback(mock_aiva_queen)

    @pytest.mark.asyncio
    async def test_inject_jitter_latency(self, mock_aiva_queen, latency_injector):
        """Test jitter latency pattern."""
        config = ChaosConfig(
            chaos_type=ChaosType.NETWORK_LATENCY,
            severity=SeverityLevel.HIGH,
            parameters={"pattern": "jitter", "base_latency_ms": 200},
        )

        await latency_injector.inject(config, mock_aiva_queen)

        # Collect latencies - they should vary with jitter
        latencies = [
            comp.latency_ms
            for comp in mock_aiva_queen.get_all_components()
        ]

        # At least some latency should be injected
        # (jitter is random, so individual components may legitimately be 0).
        assert any(l > 0 for l in latencies)

        await latency_injector.rollback(mock_aiva_queen)

    @pytest.mark.asyncio
    async def test_rollback_restores_original_latency(self, mock_aiva_queen, latency_injector):
        """Test that rollback restores original latency values."""
        # Set initial latency directly on the component so we can verify the
        # injector snapshots and restores pre-existing values, not just zero.
        kernel = mock_aiva_queen.get_component("kernel")
        original_latency = 50.0
        kernel.latency_ms = original_latency

        config = ChaosConfig(
            chaos_type=ChaosType.NETWORK_LATENCY,
            parameters={"base_latency_ms": 500},
            target_components=["kernel"],
        )

        await latency_injector.inject(config, mock_aiva_queen)
        assert kernel.latency_ms != original_latency

        await latency_injector.rollback(mock_aiva_queen)
        assert kernel.latency_ms == original_latency


class TestResourceExhauster:
    """Tests for ResourceExhauster."""

    @pytest.mark.asyncio
    async def test_inject_memory_pressure(self, mock_aiva_queen, resource_exhauster):
        """Test memory pressure injection."""
        config = ChaosConfig(
            chaos_type=ChaosType.MEMORY_PRESSURE,
            severity=SeverityLevel.LOW,
            parameters={"memory_mb": 10},  # Small allocation for testing
        )

        await resource_exhauster.inject(config, mock_aiva_queen)

        # White-box check: peeks at the injector's private allocation list
        # to confirm memory was actually reserved.
        assert resource_exhauster.is_active
        assert len(resource_exhauster._allocated_memory) > 0

        await resource_exhauster.rollback(mock_aiva_queen)

        # Rollback must release every allocation and deactivate the injector.
        assert not resource_exhauster.is_active
        assert len(resource_exhauster._allocated_memory) == 0

    @pytest.mark.asyncio
    async def test_inject_cpu_stress(self, mock_aiva_queen, resource_exhauster):
        """Test CPU stress injection."""
        config = ChaosConfig(
            chaos_type=ChaosType.CPU_STRESS,
            severity=SeverityLevel.LOW,
            parameters={"cpu_threads": 1},
        )

        await resource_exhauster.inject(config, mock_aiva_queen)

        # White-box check on the private worker-thread list.
        assert resource_exhauster.is_active
        assert len(resource_exhauster._stress_threads) > 0

        # Let it run briefly
        await asyncio.sleep(0.2)

        await resource_exhauster.rollback(mock_aiva_queen)

        # Rollback must stop and clear all stress threads.
        assert not resource_exhauster.is_active
        assert len(resource_exhauster._stress_threads) == 0

    @pytest.mark.asyncio
    async def test_resource_metrics_collection(self, mock_aiva_queen, resource_exhauster):
        """Test that resource metrics are collected."""
        config = ChaosConfig(
            chaos_type=ChaosType.MEMORY_PRESSURE,
            severity=SeverityLevel.LOW,
            parameters={"memory_mb": 5},
        )

        await resource_exhauster.inject(config, mock_aiva_queen)

        # Injection should record at least one timestamped metric sample.
        metrics = resource_exhauster.get_resource_metrics()
        assert len(metrics) > 0
        assert metrics[0].timestamp > 0

        await resource_exhauster.rollback(mock_aiva_queen)


class TestDependencyKiller:
    """Tests for DependencyKiller."""

    @pytest.mark.asyncio
    async def test_kill_single_dependency(self, mock_aiva_queen, dependency_killer):
        """Killing one dependency marks it unhealthy; rollback restores it."""
        config = ChaosConfig(
            chaos_type=ChaosType.DEPENDENCY_FAILURE,
            target_components=["database"],
        )

        await dependency_killer.inject(config, mock_aiva_queen)

        db = mock_aiva_queen.get_component("database")
        assert not db.is_healthy

        await dependency_killer.rollback(mock_aiva_queen)

        assert db.is_healthy

    @pytest.mark.asyncio
    async def test_kill_multiple_dependencies(self, mock_aiva_queen, dependency_killer):
        """All targeted dependencies are killed and later restored."""
        config = ChaosConfig(
            chaos_type=ChaosType.DEPENDENCY_FAILURE,
            target_components=["database", "redis", "vector_store"],
        )

        await dependency_killer.inject(config, mock_aiva_queen)

        killed = dependency_killer.get_killed_dependencies()
        assert len(killed) == 3
        assert "database" in killed
        assert "redis" in killed

        await dependency_killer.rollback(mock_aiva_queen)

        # All should be healthy again
        for comp_name in killed:
            comp = mock_aiva_queen.get_component(comp_name)
            assert comp.is_healthy

    @pytest.mark.asyncio
    async def test_cascading_failure(self, mock_aiva_queen, dependency_killer):
        """Cascading failure kills the target and may propagate further."""
        config = ChaosConfig(
            chaos_type=ChaosType.CASCADING_FAILURE,
            target_components=["kernel"],
            parameters={"cascade": True},
        )

        await dependency_killer.inject(config, mock_aiva_queen)

        # Should have killed at least the primary target.
        killed = dependency_killer.get_killed_dependencies()
        assert len(killed) >= 1

        # FIX: `cascade_events` was previously computed but never used (dead
        # binding).  Cascades depend on the dependency graph, so the list may
        # be empty, but every recorded event must name the component it hit.
        metrics = dependency_killer.get_metrics()
        cascade_events = [m for m in metrics if m.get("event") == "cascade_failure"]
        for event in cascade_events:
            assert event.get("component")

        await dependency_killer.rollback(mock_aiva_queen)


class TestRecoveryValidator:
    """Tests for RecoveryValidator."""

    @pytest.mark.asyncio
    async def test_validate_full_recovery(self, mock_aiva_queen, recovery_validator):
        """Test validation of full system recovery."""
        # All components start healthy, so validation should pass on the
        # first polling round.
        result = await recovery_validator.validate_recovery(mock_aiva_queen)

        assert result.state == RecoveryState.COMPLETE
        assert result.recovery_percentage == 100.0
        assert len(result.components_failed) == 0

    @pytest.mark.asyncio
    async def test_validate_partial_recovery(self, mock_aiva_queen, recovery_validator):
        """Test validation of partial recovery."""
        # Kill one component that won't recover
        db = mock_aiva_queen.get_component("database")
        db.inject_failure()

        # Short timeout to ensure quick test.  White-box: tweaks the
        # validator's private polling knobs directly.
        recovery_validator._recovery_timeout = 1.0
        recovery_validator._health_check_interval = 0.2

        result = await recovery_validator.validate_recovery(
            mock_aiva_queen,
            expected_components=["database", "kernel"],
        )

        # kernel recovers but database never does, so the outcome is
        # PARTIAL (or FAILED if kernel somehow didn't pass in time).
        assert result.state in (RecoveryState.PARTIAL, RecoveryState.FAILED)
        assert "database" in result.components_failed

        # Cleanup
        db.recover()

    @pytest.mark.asyncio
    async def test_health_history_tracking(self, mock_aiva_queen, recovery_validator):
        """Test that health history is tracked."""
        # validate_recovery appends one snapshot per polling round.
        await recovery_validator.validate_recovery(mock_aiva_queen)

        history = recovery_validator.get_health_history()
        assert len(history) > 0

    @pytest.mark.asyncio
    async def test_calculate_availability(self, mock_aiva_queen, recovery_validator):
        """Test availability calculation."""
        # Run a few health checks, feeding snapshots into the history
        # directly (white-box) rather than going through validate_recovery.
        for _ in range(3):
            health = mock_aiva_queen.full_health_check()
            recovery_validator._health_history.append(health)

        availability = recovery_validator.calculate_availability()

        # Availability is a per-component fraction in [0, 1].
        assert "kernel" in availability
        assert all(0.0 <= v <= 1.0 for v in availability.values())


class TestMockAIVAQueen:
    """Tests for the MockAIVAQueen class itself."""

    @pytest.mark.asyncio
    async def test_queen_lifecycle(self, mock_aiva_queen):
        """Start/stop must toggle the running flag."""
        queen = mock_aiva_queen
        assert not queen.is_running

        await queen.start()
        assert queen.is_running

        await queen.stop()
        assert not queen.is_running

    @pytest.mark.asyncio
    async def test_process_request_when_running(self, running_aiva_queen):
        """A running queen processes requests and echoes the request id."""
        payload = {"id": "test-123", "action": "test"}

        response = await running_aiva_queen.process_request(payload)

        assert response["status"] == "processed"
        assert response["request_id"] == "test-123"

    @pytest.mark.asyncio
    async def test_process_request_when_stopped(self, mock_aiva_queen):
        """A stopped queen rejects requests with RuntimeError."""
        with pytest.raises(RuntimeError, match="not running"):
            await mock_aiva_queen.process_request({"id": "test-123"})

    def test_component_access(self, mock_aiva_queen):
        """Known components resolve; unknown names return None."""
        kernel = mock_aiva_queen.get_component("kernel")
        assert kernel is not None
        assert kernel.name == "kernel"

        assert mock_aiva_queen.get_component("nonexistent") is None

    def test_health_check_all_healthy(self, mock_aiva_queen):
        """With no chaos injected, every component passes its check."""
        snapshot = mock_aiva_queen.full_health_check()

        assert len(snapshot) == len(mock_aiva_queen.components)
        assert all(check.is_healthy for check in snapshot.values())

    def test_health_check_with_unhealthy(self, mock_aiva_queen):
        """A failed component shows up without affecting the others."""
        kernel = mock_aiva_queen.get_component("kernel")
        kernel.inject_failure()

        snapshot = mock_aiva_queen.full_health_check()

        assert not snapshot["kernel"].is_healthy
        assert snapshot["memory_cortex"].is_healthy

        kernel.recover()

    def test_system_metrics(self, mock_aiva_queen):
        """Aggregate metrics expose all expected keys."""
        metrics = mock_aiva_queen.get_system_metrics()

        for key in (
            "is_running",
            "healthy_components",
            "total_components",
            "health_percentage",
            "components",
        ):
            assert key in metrics


class TestMockComponent:
    """Unit tests for MockComponent failure and degradation behavior."""

    @pytest.mark.asyncio
    async def test_component_call_success(self):
        """A healthy component call succeeds and increments call_count."""
        component = MockComponent("test")

        result = await component.call()

        assert result["status"] == "success"
        assert result["component"] == "test"
        assert component.call_count == 1
        assert component.error_count == 0

    @pytest.mark.asyncio
    async def test_component_call_with_failure(self):
        """An unhealthy component raises RuntimeError and counts the error."""
        component = MockComponent("test")
        component.inject_failure()

        with pytest.raises(RuntimeError, match="unhealthy"):
            await component.call()

        assert component.error_count == 1

    @pytest.mark.asyncio
    async def test_component_call_with_latency(self):
        """Injected latency delays the call by roughly the configured amount."""
        component = MockComponent("test")
        component.inject_degradation(latency_ms=100)

        # Measure with a monotonic clock: time.time() is wall-clock time and
        # can jump backwards/forwards (NTP adjustments), which made this
        # timing assertion flaky.  perf_counter() is monotonic by contract.
        start = time.perf_counter()
        await component.call()
        elapsed = (time.perf_counter() - start) * 1000

        assert elapsed >= 90  # Allow some scheduler variance

    @pytest.mark.asyncio
    async def test_component_failure_rate(self):
        """With a 50% failure rate, calls fail roughly half the time."""
        component = MockComponent("test")
        component.inject_degradation(failure_rate=0.5)

        successes = 0
        failures = 0

        for _ in range(100):
            try:
                await component.call()
                successes += 1
            except RuntimeError:
                failures += 1

        # Wide bounds keep this probabilistic test stable: for 100 Bernoulli
        # trials at p=0.5, landing outside 20..80 is vanishingly unlikely.
        assert 20 < failures < 80

    def test_component_recovery(self):
        """recover() restores health and clears all degradation parameters."""
        component = MockComponent("test")
        component.inject_failure()
        assert not component.is_healthy

        component.recover()
        assert component.is_healthy
        assert not component.is_degraded
        assert component.latency_ms == 0
        assert component.failure_rate == 0


class TestChaosScenarios:
    """Integration tests for complete chaos scenarios."""

    @pytest.mark.asyncio
    async def test_database_outage_scenario(self, chaos_orchestrator):
        """Test system behavior during a critical database outage."""
        config = ChaosConfig(
            chaos_type=ChaosType.DEPENDENCY_FAILURE,
            severity=SeverityLevel.CRITICAL,
            duration_seconds=2.0,
            target_components=["database"],
        )

        result = await chaos_orchestrator.run_experiment(config)

        assert result.was_successful
        # The original check ORed with `len(...) >= 0`, which is always true
        # and asserted nothing.  affected_components may legitimately be
        # empty, so verify it is at least an inspectable collection.
        assert isinstance(result.affected_components, (list, tuple, set))

    @pytest.mark.asyncio
    async def test_network_degradation_scenario(self, chaos_orchestrator):
        """Test system behavior under escalating network latency."""
        configs = [
            ChaosConfig(
                chaos_type=ChaosType.NETWORK_LATENCY,
                severity=SeverityLevel.LOW,
                duration_seconds=1.0,
                parameters={"base_latency_ms": 100},
            ),
            ChaosConfig(
                chaos_type=ChaosType.NETWORK_LATENCY,
                severity=SeverityLevel.MEDIUM,
                duration_seconds=1.0,
                parameters={"base_latency_ms": 500},
            ),
            ChaosConfig(
                chaos_type=ChaosType.NETWORK_LATENCY,
                severity=SeverityLevel.HIGH,
                duration_seconds=1.0,
                parameters={"base_latency_ms": 1000},
            ),
        ]

        results = await chaos_orchestrator.run_scenario(configs, sequential=True)

        assert len(results) == 3
        # Every experiment must end in a terminal state: completed cleanly
        # or safely rolled back — never stuck mid-injection.
        assert all(r.state in (ChaosState.COMPLETED, ChaosState.ROLLED_BACK) for r in results)

    @pytest.mark.asyncio
    async def test_multi_component_failure_scenario(self, chaos_orchestrator):
        """Test system behavior during multi-component failure."""
        config = ChaosConfig(
            chaos_type=ChaosType.DEPENDENCY_FAILURE,
            severity=SeverityLevel.HIGH,
            duration_seconds=1.5,
            target_components=["database", "redis", "event_bus"],
        )

        result = await chaos_orchestrator.run_experiment(config)

        # Even with three dependencies down, recovery must run and make
        # at least partial progress.
        assert result.recovery_result is not None
        assert result.recovery_result.recovery_percentage > 0

    @pytest.mark.asyncio
    async def test_gradual_degradation_scenario(self, chaos_orchestrator):
        """Test gradual system degradation and recovery."""
        # Ramp severity upward to mimic a slowly worsening brownout.
        for severity in [SeverityLevel.LOW, SeverityLevel.MEDIUM, SeverityLevel.HIGH]:
            config = ChaosConfig(
                chaos_type=ChaosType.NETWORK_LATENCY,
                severity=severity,
                duration_seconds=0.5,
                parameters={"pattern": "brownout"},
            )

            result = await chaos_orchestrator.run_experiment(config)
            assert result.was_successful

        # After the final rollback the system should report full health.
        health = chaos_orchestrator.target.get_system_metrics()
        assert health["health_percentage"] == 100


class TestChaosMetrics:
    """Tests for chaos metrics collection and reporting."""

    @pytest.mark.asyncio
    async def test_injector_metrics_collection(self, mock_aiva_queen, network_fault_injector):
        """An inject/rollback cycle records both lifecycle events."""
        fault_config = ChaosConfig(
            chaos_type=ChaosType.NETWORK_PARTITION,
            target_components=["kernel"],
        )

        await network_fault_injector.inject(fault_config, mock_aiva_queen)
        await network_fault_injector.rollback(mock_aiva_queen)

        recorded = network_fault_injector.get_metrics()

        # At minimum one event per phase of the cycle.
        assert len(recorded) >= 2
        event_names = {entry["event"] for entry in recorded}
        assert "fault_injected" in event_names
        assert "fault_rolled_back" in event_names

    @pytest.mark.asyncio
    async def test_experiment_result_metrics(self, chaos_orchestrator):
        """With collect_metrics on, results carry injector metrics."""
        latency_config = ChaosConfig(
            chaos_type=ChaosType.NETWORK_LATENCY,
            duration_seconds=0.5,
            collect_metrics=True,
        )

        outcome = await chaos_orchestrator.run_experiment(latency_config)

        assert "injector_metrics" in outcome.metrics

    @pytest.mark.asyncio
    async def test_report_generation_accuracy(self, chaos_orchestrator):
        """Report totals and per-type counts match the experiments run."""
        # Two latency experiments plus one dependency failure.
        scenario = [
            ChaosConfig(chaos_type=ChaosType.NETWORK_LATENCY, duration_seconds=0.3),
            ChaosConfig(chaos_type=ChaosType.NETWORK_LATENCY, duration_seconds=0.3),
            ChaosConfig(chaos_type=ChaosType.DEPENDENCY_FAILURE, duration_seconds=0.3),
        ]

        await chaos_orchestrator.run_scenario(scenario, sequential=True)

        summary = chaos_orchestrator.generate_report()

        assert summary["total_experiments"] == 3
        assert "network_latency" in summary["chaos_type_breakdown"]
        assert summary["chaos_type_breakdown"]["network_latency"]["count"] == 2


# =============================================================================
# CLI INTERFACE
# =============================================================================


def main():
    """Command-line interface for chaos engineering tests.

    Commands:
        test   -- run the pytest suite in this module.
        demo   -- run a single chaos experiment against a mock queen.
        report -- print pointers to report generation.
    """
    import argparse

    parser = argparse.ArgumentParser(description="AIVA Queen Chaos Engineering Test Suite")
    parser.add_argument(
        "command",
        choices=["test", "demo", "report"],
        help="Command to run",
    )
    parser.add_argument(
        "--chaos-type",
        type=str,
        default="network_latency",
        help="Type of chaos to inject",
    )
    parser.add_argument(
        "--severity",
        type=str,
        choices=["low", "medium", "high", "critical"],
        default="medium",
        help="Severity level",
    )
    parser.add_argument(
        "--duration",
        type=float,
        default=5.0,
        help="Chaos duration in seconds",
    )

    args = parser.parse_args()

    if args.command == "test":
        print("Running chaos engineering tests...")
        pytest.main([__file__, "-v", "-x"])

    elif args.command == "demo":
        print("=== Chaos Engineering Demo ===\n")

        async def run_demo():
            queen = MockAIVAQueen()
            await queen.start()

            # Guarantee the queen is stopped even if the experiment raises;
            # previously an exception skipped queen.stop() entirely.
            try:
                orchestrator = ChaosOrchestrator(queen)

                severity_map = {
                    "low": SeverityLevel.LOW,
                    "medium": SeverityLevel.MEDIUM,
                    "high": SeverityLevel.HIGH,
                    "critical": SeverityLevel.CRITICAL,
                }

                chaos_type_map = {
                    "network_latency": ChaosType.NETWORK_LATENCY,
                    "network_partition": ChaosType.NETWORK_PARTITION,
                    "dependency_failure": ChaosType.DEPENDENCY_FAILURE,
                    "memory_pressure": ChaosType.MEMORY_PRESSURE,
                }

                # Unknown CLI values fall back to sane defaults rather than KeyError.
                config = ChaosConfig(
                    chaos_type=chaos_type_map.get(args.chaos_type, ChaosType.NETWORK_LATENCY),
                    severity=severity_map.get(args.severity, SeverityLevel.MEDIUM),
                    duration_seconds=args.duration,
                )

                print(f"Chaos Type: {config.chaos_type.value}")
                print(f"Severity: {config.severity.name}")
                print(f"Duration: {config.duration_seconds}s\n")

                result = await orchestrator.run_experiment(config)

                print("\n=== Experiment Result ===")
                print(f"ID: {result.experiment_id}")
                print(f"State: {result.state.value}")
                print(f"Duration: {result.duration_seconds:.2f}s")
                print(f"Affected Components: {result.affected_components}")

                if result.recovery_result:
                    print("\n=== Recovery Result ===")
                    print(f"State: {result.recovery_result.state.value}")
                    print(f"Recovery %: {result.recovery_result.recovery_percentage:.1f}%")
                    print(f"Duration: {result.recovery_result.duration_seconds:.2f}s")
            finally:
                await queen.stop()

        asyncio.run(run_demo())

    elif args.command == "report":
        print("=== Chaos Engineering Report ===\n")
        print("Run 'test' command first to generate data.")
        print("Reports are available via chaos_orchestrator.generate_report()")


# Script entry point: dispatch to the CLI (test / demo / report commands).
if __name__ == "__main__":
    main()
