"""
AIVA Voice Command Bridge - Error Handling & Retry Logic Tests
Comprehensive pytest test suite for Story 12: Error handling + retry logic.
"""

import pytest
import time
import json
import logging
import threading
import multiprocessing
from datetime import datetime, timedelta
from unittest.mock import Mock, MagicMock, patch, PropertyMock
from contextlib import contextmanager
from typing import Any, Callable, Dict, List, Optional
from dataclasses import dataclass, field
from enum import Enum
import uuid
import hashlib
import os

# Import the modules under test
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from error_handler import (
    BridgeError,
    DatabaseError,
    TelnyxError,
    ClaudeCodeError,
    ValidationError,
    RateLimitError,
    CircuitBreakerOpenError,
    ErrorCategory,
    ErrorHandler,
    ErrorContext,
    create_error_context,
    log_error_with_context,
)


# ============================================================================
# FIXTURES
# ============================================================================

@pytest.fixture
def error_context():
    """Provide a fully populated ``ErrorContext`` for tests.

    Each request yields a context with a fresh random correlation id and a
    timestamp taken from ``datetime.utcnow()`` at fixture setup.
    """
    context_fields = {
        "correlation_id": str(uuid.uuid4()),
        "operation": "test_operation",
        "component": "test_component",
        "timestamp": datetime.utcnow(),
        "metadata": {"key": "value"},
    }
    return ErrorContext(**context_fields)


@pytest.fixture
def mock_logger():
    """Return a mock shaped like ``logging.Logger`` for call assertions."""
    fake_logger = Mock(spec=logging.Logger)
    # Replace every level method the tests inspect with a standalone Mock.
    for level_name in ("error", "warning", "info", "debug"):
        setattr(fake_logger, level_name, Mock())
    return fake_logger


@pytest.fixture
def sample_directive():
    """Return a dict describing a high-priority "turn on the lights" directive.

    The directive gets a random id, an ISO-8601 creation time and an expiry
    roughly one hour later (the two timestamps come from separate clock reads).
    """
    directive_id = str(uuid.uuid4())
    command = {
        "action": "turn_on",
        "device": "lights",
        "room": "living_room",
    }
    created_at = datetime.utcnow().isoformat()
    lifetime = timedelta(hours=1)
    expires_at = (datetime.utcnow() + lifetime).isoformat()

    directive = {}
    directive["directive_id"] = directive_id
    directive["phone_number"] = "+61234567890"
    directive["payload"] = command
    directive["priority"] = "high"
    directive["created_at"] = created_at
    directive["expires_at"] = expires_at
    return directive


@pytest.fixture
def db_connection_params():
    """Database connection parameters, overridable through the environment.

    Credentials must not live in source control, so the password is read from
    ``AIVA_TEST_DB_PASSWORD`` and is an empty string when that variable is
    unset. The remaining, non-secret settings keep their previous values as
    defaults and can be overridden with ``AIVA_TEST_DB_HOST``,
    ``AIVA_TEST_DB_PORT``, ``AIVA_TEST_DB_USER`` and ``AIVA_TEST_DB_NAME``.

    Returns:
        dict with the keys ``host``, ``port`` (int), ``user``, ``password``
        and ``dbname``.

    Raises:
        ValueError: if ``AIVA_TEST_DB_PORT`` is set to a non-integer value.
    """
    env = os.environ
    return {
        "host": env.get(
            "AIVA_TEST_DB_HOST", "postgresql-genesis-u50607.vm.elestio.app"
        ),
        "port": int(env.get("AIVA_TEST_DB_PORT", "25432")),
        "user": env.get("AIVA_TEST_DB_USER", "postgres"),
        # SECURITY: the previously committed password should be rotated; it is
        # deliberately no longer present as a fallback here.
        "password": env.get("AIVA_TEST_DB_PASSWORD", ""),
        "dbname": env.get("AIVA_TEST_DB_NAME", "postgres"),
    }


@pytest.fixture
def mock_db_connection():
    """Return a mock DB connection whose cursor yields no rows.

    The cursor's ``fetchone`` returns ``None``, ``fetchall`` returns an empty
    list and ``rowcount`` is ``0``; ``commit``, ``rollback`` and ``close`` on
    the connection are plain mocks.
    """
    empty_cursor = Mock()
    empty_cursor.rowcount = 0
    empty_cursor.fetchone = Mock(return_value=None)
    empty_cursor.fetchall = Mock(return_value=[])

    fake_conn = Mock()
    fake_conn.cursor.return_value = empty_cursor
    for method_name in ("commit", "rollback", "close"):
        setattr(fake_conn, method_name, Mock())
    return fake_conn


# ============================================================================
# ERROR HANDLER TESTS
# ============================================================================

class TestErrorCategories:
    """Verify the ErrorCategory enum and the category carried by typed errors."""

    def test_error_category_values(self):
        """Each expected ErrorCategory member exposes its string value."""
        expected_values = {
            "DATABASE": "database",
            "TELNYX": "telnyx",
            "CLAUDE_CODE": "claude_code",
            "VALIDATION": "validation",
            "RATE_LIMIT": "rate_limit",
            "CIRCUIT_BREAKER": "circuit_breaker",
            "NETWORK": "network",
            "TIMEOUT": "timeout",
            "UNKNOWN": "unknown",
        }
        for member_name, expected in expected_values.items():
            assert getattr(ErrorCategory, member_name).value == expected

    def test_database_error_creation(self):
        """DatabaseError keeps its message, category and wrapped exception."""
        cause = ConnectionError("Original error")
        ctx = ErrorContext(
            correlation_id="test-correlation",
            operation="connect",
            component="database",
        )

        db_error = DatabaseError(
            message="Connection failed",
            original_exception=cause,
            context=ctx,
        )

        assert db_error.message == "Connection failed"
        assert db_error.category == ErrorCategory.DATABASE
        assert isinstance(db_error.original_exception, ConnectionError)

    def test_telnyx_error_creation(self):
        """TelnyxError keeps its message and reports the TELNYX category."""
        ctx = ErrorContext(
            correlation_id="test-correlation",
            operation="webhook",
            component="telnyx",
        )

        telnyx_error = TelnyxError(
            message="Webhook failed",
            telnyx_code="webhook_error",
            context=ctx,
        )

        assert telnyx_error.message == "Webhook failed"
        assert telnyx_error.category == ErrorCategory.TELNYX

    def test_circuit_breaker_error_creation(self):
        """CircuitBreakerOpenError exposes message, category and retry_after."""
        init_kwargs = {
            "message": "Circuit breaker is open",
            "service": "telnyx",
            "retry_after": 30,
        }

        open_error = CircuitBreakerOpenError(**init_kwargs)

        assert open_error.message == "Circuit breaker is open"
        assert open_error.category == ErrorCategory.CIRCUIT_BREAKER
        assert open_error.retry_after == 30


class TestErrorHandler:
    """Exercise ErrorHandler plus the context and logging helper functions."""

    def test_error_handler_initialization(self):
        """A default ErrorHandler uses 3 retries and a 1.0 retry delay."""
        default_handler = ErrorHandler()

        assert default_handler.max_retries == 3
        assert default_handler.default_retry_delay == 1.0

    def test_error_handler_custom_config(self):
        """Constructor overrides are reflected on the handler attributes."""
        overrides = {
            "max_retries": 5,
            "default_retry_delay": 2.0,
            "enable_circuit_breaker": True,
        }

        tuned_handler = ErrorHandler(**overrides)

        assert tuned_handler.max_retries == 5
        assert tuned_handler.default_retry_delay == 2.0

    def test_handle_database_error(self):
        """handle_error reports failure, category and correlation id."""
        handler = ErrorHandler()
        ctx = ErrorContext(
            correlation_id="test-id",
            operation="connect",
            component="postgres",
        )
        db_failure = DatabaseError(message="Connection refused", context=ctx)

        outcome = handler.handle_error(db_failure)

        assert outcome["success"] is False
        assert outcome["category"] == ErrorCategory.DATABASE
        assert "retryable" in outcome
        assert outcome["correlation_id"] == "test-id"

    def test_handle_validation_error_not_retryable(self):
        """Validation errors are reported as not retryable."""
        handler = ErrorHandler()
        ctx = ErrorContext(
            correlation_id="test-id",
            operation="validate",
            component="api",
        )
        bad_input = ValidationError(message="Invalid payload", context=ctx)

        outcome = handler.handle_error(bad_input)

        assert outcome["success"] is False
        assert outcome["retryable"] is False

    def test_handle_rate_limit_error_retryable(self):
        """Rate limit errors are reported as retryable."""
        handler = ErrorHandler()
        ctx = ErrorContext(
            correlation_id="test-id",
            operation="api_call",
            component="telnyx",
        )
        throttled = RateLimitError(
            message="Rate limit exceeded",
            retry_after=60,
            context=ctx,
        )

        outcome = handler.handle_error(throttled)

        assert outcome["success"] is False
        assert outcome["retryable"] is True

    def test_error_context_creation(self):
        """create_error_context fills in fields and assigns a correlation id."""
        built = create_error_context(
            operation="test_op",
            component="test_comp",
            metadata={"extra": "data"},
        )

        assert built.operation == "test_op"
        assert built.component == "test_comp"
        assert built.correlation_id is not None
        assert built.metadata["extra"] == "data"

    def test_log_error_with_context(self, mock_logger):
        """The logged message carries the correlation id and operation text."""
        ctx = ErrorContext(
            correlation_id="corr-123",
            operation="test",
            component="test_comp",
        )

        log_error_with_context(
            logger=mock_logger,
            message="Test error",
            context=ctx,
            level=logging.ERROR,
        )

        mock_logger.error.assert_called_once()
        # call_args is an (args, kwargs) pair; the message is the first positional.
        positional_args, _keyword_args = mock_logger.error.call_args
        logged_text = positional_args[0]
        for fragment in ("corr-123", "test"):
            assert fragment in logged_text


# ============================================================================
# RETRY ENGINE TESTS
# ============================================================================

class TestRetryEngine:
    """Test retry engine with exponential backoff and jitter.

    ``retry_engine`` is imported inside each test rather than at module level,
    so a missing module fails the individual tests instead of collection.
    """

    def test_retry_engine_initialization(self):
        """RetryEngine stores the constructor configuration."""
        from retry_engine import RetryEngine

        engine = RetryEngine(
            max_retries=3,
            base_delay=1.0,
            max_delay=60.0,
            exponential_base=2.0,
            jitter=True
        )

        assert engine.max_retries == 3
        assert engine.base_delay == 1.0

    def test_calculate_delay_exponential_backoff(self):
        """Without jitter the delay is base_delay * exponential_base ** attempt."""
        from retry_engine import RetryEngine

        engine = RetryEngine(base_delay=1.0, exponential_base=2.0, jitter=False)

        # First retry: 1.0 * 2^0 = 1.0
        assert engine.calculate_delay(0) == 1.0
        # Second retry: 1.0 * 2^1 = 2.0
        assert engine.calculate_delay(1) == 2.0
        # Third retry: 1.0 * 2^2 = 4.0
        assert engine.calculate_delay(2) == 4.0

    def test_calculate_delay_with_max_limit(self):
        """The computed delay is capped at max_delay."""
        from retry_engine import RetryEngine

        engine = RetryEngine(base_delay=1.0, max_delay=5.0, jitter=False)

        # Should not exceed max_delay
        assert engine.calculate_delay(10) == 5.0

    def test_calculate_delay_with_jitter(self):
        """With jitter enabled, repeated calls do not all return the same delay."""
        from retry_engine import RetryEngine

        engine = RetryEngine(base_delay=1.0, jitter=True)

        # Sample the same attempt many times; only variation is asserted because
        # the jitter range is not specified by this test suite.
        delays = [engine.calculate_delay(2) for _ in range(100)]

        assert len(set(delays)) > 1  # At least some variation

    def test_execute_with_retry_success_first_attempt(self):
        """A callable that succeeds immediately is invoked exactly once."""
        from retry_engine import RetryEngine

        engine = RetryEngine(max_retries=3)
        call_count = 0

        def success_func():
            nonlocal call_count
            call_count += 1
            return "success"

        result = engine.execute_with_retry(success_func)

        assert result == "success"
        assert call_count == 1

    def test_execute_with_retry_success_after_failures(self):
        """Two ConnectionErrors followed by success yields the success value."""
        from retry_engine import RetryEngine

        engine = RetryEngine(max_retries=3, base_delay=0.01)
        call_count = 0

        def flaky_func():
            nonlocal call_count
            call_count += 1
            if call_count < 3:
                raise ConnectionError("Temporary failure")
            return "success"

        result = engine.execute_with_retry(flaky_func)

        assert result == "success"
        assert call_count == 3

    def test_execute_with_retry_exhausted_retries(self):
        """Exhausting retries raises RetryExhaustedError mentioning the cause."""
        from retry_engine import RetryEngine, RetryExhaustedError

        engine = RetryEngine(max_retries=3, base_delay=0.01)

        def always_fail():
            raise ConnectionError("Permanent failure")

        with pytest.raises(RetryExhaustedError) as exc_info:
            engine.execute_with_retry(always_fail)

        assert "Permanent failure" in str(exc_info.value)

    def test_execute_with_retry_non_retryable_error(self):
        """A ValidationError propagates after a single invocation."""
        from retry_engine import RetryEngine

        engine = RetryEngine(max_retries=3)
        call_count = 0

        def validation_fail():
            nonlocal call_count
            call_count += 1
            raise ValidationError("Invalid input")

        with pytest.raises(ValidationError):
            engine.execute_with_retry(validation_fail)

        assert call_count == 1  # Only called once

    def test_execute_with_retry_circuit_breaker_open(self):
        """A CircuitBreakerOpenError raised by the callable propagates."""
        from retry_engine import RetryEngine

        engine = RetryEngine(max_retries=3)

        def circuit_open():
            # CircuitBreakerOpenError comes from the module-level import.
            raise CircuitBreakerOpenError("Circuit open", "test-service", 30)

        with pytest.raises(CircuitBreakerOpenError):
            engine.execute_with_retry(circuit_open)

    def test_retry_with_predicate(self):
        """A retry_predicate accepting ValueError lets the call be retried."""
        from retry_engine import RetryEngine

        engine = RetryEngine(max_retries=5, base_delay=0.01)
        call_count = 0

        def selective_retry():
            nonlocal call_count
            call_count += 1
            if call_count < 3:
                raise ValueError("Retry this")
            return "success"

        # Only retry on ValueError
        result = engine.execute_with_retry(
            selective_retry,
            retry_predicate=lambda e: isinstance(e, ValueError)
        )

        assert result == "success"
        assert call_count == 3


# ============================================================================
# CIRCUIT BREAKER TESTS
# ============================================================================

class TestCircuitBreaker:
    """Test circuit breaker pattern implementation.

    NOTE(review): several tests invoke ``breaker.call`` with a failing
    callable outside ``pytest.raises``, so they presumably rely on ``call``
    absorbing the failure rather than re-raising it — confirm against the
    ``circuit_breaker`` module.
    """

    @staticmethod
    def _raise_connection_error():
        """Stand-in for a downstream call that always fails."""
        raise ConnectionError("fail")

    def test_circuit_breaker_initialization(self):
        """A new breaker stores its threshold and starts CLOSED."""
        from circuit_breaker import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(
            failure_threshold=3,
            recovery_timeout=30,
            expected_exception=ConnectionError
        )

        assert breaker.failure_threshold == 3
        assert breaker.state == CircuitState.CLOSED

    def test_circuit_breaker_success_closes_circuit(self):
        """A successful call returns its value and leaves the breaker CLOSED."""
        from circuit_breaker import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=2)

        def success_call():
            return "success"

        result = breaker.call(success_call)

        assert result == "success"
        assert breaker.state == CircuitState.CLOSED
        assert breaker.failure_count == 0

    def test_circuit_breaker_opens_after_threshold(self):
        """The breaker opens once failures reach failure_threshold."""
        from circuit_breaker import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=2, recovery_timeout=60)

        def fail_call():
            raise ConnectionError("Failed")

        # First failure: below the threshold, so still closed.
        breaker.call(fail_call)
        assert breaker.state == CircuitState.CLOSED

        # Second failure reaches the threshold and should open the circuit.
        breaker.call(fail_call)
        assert breaker.state == CircuitState.OPEN

    def test_circuit_breaker_rejects_calls_when_open(self):
        """Calls made while OPEN raise CircuitBreakerOpenError."""
        from circuit_breaker import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(
            failure_threshold=1,
            recovery_timeout=60
        )

        # Open the circuit
        breaker.call(self._raise_connection_error)
        assert breaker.state == CircuitState.OPEN

        # Next call should be rejected
        with pytest.raises(CircuitBreakerOpenError):
            breaker.call(lambda: "success")

    def test_circuit_breaker_half_open_after_timeout(self):
        """After recovery_timeout a successful call closes the circuit again."""
        from circuit_breaker import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(
            failure_threshold=1,
            recovery_timeout=0.1  # Short timeout for testing
        )

        # Open the circuit
        breaker.call(self._raise_connection_error)
        assert breaker.state == CircuitState.OPEN

        # Wait for recovery timeout
        time.sleep(0.2)

        # The next call is the trial call made after the timeout elapsed.
        def success():
            return "success"

        result = breaker.call(success)
        assert result == "success"
        assert breaker.state == CircuitState.CLOSED

    def test_circuit_breaker_half_open_failure_reopens(self):
        """A failing trial call after the timeout leaves the circuit OPEN."""
        from circuit_breaker import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(
            failure_threshold=1,
            recovery_timeout=0.1
        )

        # Open the circuit
        breaker.call(self._raise_connection_error)

        time.sleep(0.2)

        # Fail again once the recovery timeout has elapsed.
        breaker.call(self._raise_connection_error)

        assert breaker.state == CircuitState.OPEN

    def test_circuit_breaker_manual_reset(self):
        """reset() returns an open breaker to CLOSED with a zero failure count."""
        from circuit_breaker import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=1, recovery_timeout=60)

        # Open the circuit
        breaker.call(self._raise_connection_error)
        assert breaker.state == CircuitState.OPEN

        # Manual reset
        breaker.reset()

        assert breaker.state == CircuitState.CLOSED
        assert breaker.failure_count == 0

    def test_circuit_breaker_state_transition_callbacks(self):
        """The on_state_change callback fires once when the circuit opens."""
        from circuit_breaker import CircuitBreaker

        on_state_change = Mock()

        breaker = CircuitBreaker(
            failure_threshold=1,
            recovery_timeout=60,
            on_state_change=on_state_change
        )

        # Open the circuit
        breaker.call(self._raise_connection_error)

        on_state_change.assert_called_once()


# ============================================================================
# DEAD LETTER QUEUE TESTS
# ============================================================================

class TestDeadLetterQueue:
    """Test dead letter queue functionality.

    Placeholder failures are built with the builtin ``Exception``; the name
    ``Error`` used previously was never defined or imported, so those tests
    failed with ``NameError`` before reaching the queue.
    """

    def test_dlq_initialization(self):
        """A new queue stores max_size and starts empty."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue(max_size=100)

        assert dlq.max_size == 100
        assert dlq.is_empty()

    def test_dlq_enqueue(self):
        """Enqueuing one message makes the queue non-empty with size 1."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue(max_size=100)

        dlq.enqueue(
            message={"data": "test"},
            error=ConnectionError("Failed"),
            context={"correlation_id": "test-123"}
        )

        assert not dlq.is_empty()
        assert dlq.size() == 1

    def test_dlq_dequeue(self):
        """Dequeuing returns the stored message and empties the queue."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue()

        dlq.enqueue(
            message={"data": "test"},
            error=ConnectionError("Failed"),
            context={"correlation_id": "test-123"}
        )

        item = dlq.dequeue()

        assert item is not None
        assert item.message == {"data": "test"}
        assert dlq.is_empty()

    def test_dlq_max_size_enforced(self):
        """Enqueuing beyond max_size raises DLQFullError."""
        from dead_letter_queue import DeadLetterQueue, DLQFullError

        dlq = DeadLetterQueue(max_size=2)

        dlq.enqueue({"id": 1}, error=Exception("e1"), context={})
        dlq.enqueue({"id": 2}, error=Exception("e2"), context={})

        with pytest.raises(DLQFullError):
            dlq.enqueue({"id": 3}, error=Exception("e3"), context={})

    def test_dlq_retry_with_backoff(self):
        """A processor that fails once then succeeds yields its result."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue()
        call_count = 0

        def processor(msg):
            nonlocal call_count
            call_count += 1
            if call_count < 2:
                raise ConnectionError("Still failing")
            return "success"

        dlq.enqueue({"data": "test"}, error=Exception("fail"), context={})

        result = dlq.retry_with_backoff(processor, max_retries=3, base_delay=0.01)

        assert result == "success"
        assert call_count == 2

    def test_dlq_retry_exhausted_moves_back(self):
        """When every retry fails, None is returned and the message stays queued."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue()

        def always_fails(msg):
            raise ConnectionError("Always fails")

        dlq.enqueue({"data": "test"}, error=Exception("fail"), context={})

        result = dlq.retry_with_backoff(always_fails, max_retries=2, base_delay=0.01)

        assert result is None
        assert dlq.size() == 1  # Message still in queue

    def test_dlq_peek(self):
        """peek() returns the oldest message without removing it."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue()

        dlq.enqueue({"id": 1}, error=Exception("e1"), context={})
        dlq.enqueue({"id": 2}, error=Exception("e2"), context={})

        item = dlq.peek()

        assert item.message == {"id": 1}
        assert dlq.size() == 2  # Not removed

    def test_dlq_clear(self):
        """clear() removes every queued message."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue()

        dlq.enqueue({"id": 1}, error=Exception("e1"), context={})
        dlq.enqueue({"id": 2}, error=Exception("e2"), context={})

        dlq.clear()

        assert dlq.is_empty()


# ============================================================================
# HEALTH MONITOR TESTS
# ============================================================================

class TestHealthMonitor:
    """Exercise component registration, checks and status aggregation."""

    @staticmethod
    def _constant_check(status):
        """Return a zero-argument check function that always yields ``status``."""
        def check():
            return status
        return check

    def test_health_monitor_initialization(self):
        """A new monitor stores check_interval and reports UNKNOWN overall."""
        from health_monitor import HealthMonitor, HealthStatus

        fresh_monitor = HealthMonitor(check_interval=30)

        assert fresh_monitor.check_interval == 30
        assert fresh_monitor.get_overall_status() == HealthStatus.UNKNOWN

    def test_register_component(self):
        """A registered component name appears in monitor.components."""
        from health_monitor import HealthMonitor, HealthStatus

        health = HealthMonitor()

        health.register_component(
            name="database",
            check_func=self._constant_check(HealthStatus.HEALTHY),
            critical=True,
        )

        assert "database" in health.components

    def test_health_check_success(self):
        """check_component returns HEALTHY when the check reports HEALTHY."""
        from health_monitor import HealthMonitor, HealthStatus

        health = HealthMonitor()
        health.register_component(
            name="test_service",
            check_func=self._constant_check(HealthStatus.HEALTHY),
            critical=False,
        )

        observed = health.check_component("test_service")

        assert observed == HealthStatus.HEALTHY

    def test_health_check_failure(self):
        """check_component returns UNHEALTHY when the check reports UNHEALTHY."""
        from health_monitor import HealthMonitor, HealthStatus

        health = HealthMonitor()
        health.register_component(
            name="failing_service",
            check_func=self._constant_check(HealthStatus.UNHEALTHY),
            critical=True,
        )

        observed = health.check_component("failing_service")

        assert observed == HealthStatus.UNHEALTHY

    def test_health_check_exception(self):
        """A check function that raises is reported as UNKNOWN."""
        from health_monitor import HealthMonitor, HealthStatus

        def exploding_check():
            raise Exception("Check failed")

        health = HealthMonitor()
        health.register_component(
            name="exception_service",
            check_func=exploding_check,
            critical=True,
        )

        observed = health.check_component("exception_service")

        assert observed == HealthStatus.UNKNOWN

    def test_overall_status_all_healthy(self):
        """Two healthy critical components give an overall HEALTHY status."""
        from health_monitor import HealthMonitor, HealthStatus

        health = HealthMonitor()
        for component_name in ("db", "api"):
            health.register_component(
                component_name, self._constant_check(HealthStatus.HEALTHY), True
            )

        assert health.get_overall_status() == HealthStatus.HEALTHY

    def test_overall_status_critical_unhealthy(self):
        """An unhealthy critical component makes the overall status UNHEALTHY."""
        from health_monitor import HealthMonitor, HealthStatus

        health = HealthMonitor()
        health.register_component(
            "db", self._constant_check(HealthStatus.HEALTHY), True
        )
        health.register_component(
            "api", self._constant_check(HealthStatus.UNHEALTHY), True
        )

        assert health.get_overall_status() == HealthStatus.UNHEALTHY

    def test_overall_status_non_critical_unhealthy(self):
        """An unhealthy non-critical component degrades the overall status."""
        from health_monitor import HealthMonitor, HealthStatus

        health = HealthMonitor()
        health.register_component(
            "db", self._constant_check(HealthStatus.HEALTHY), True
        )
        health.register_component(
            "logger", self._constant_check(HealthStatus.UNHEALTHY), False
        )

        assert health.get_overall_status() == HealthStatus.DEGRADED

    def test_health_check_timeout(self):
        """A check that outlasts its timeout is reported as UNKNOWN."""
        from health_monitor import HealthMonitor, HealthStatus

        def sluggish_check():
            # Sleeps far longer than the 0.1 timeout registered below.
            time.sleep(2)
            return HealthStatus.HEALTHY

        health = HealthMonitor()
        health.register_component(
            "slow",
            sluggish_check,
            critical=True,
            timeout=0.1,
        )

        observed = health.check_component("slow")

        assert observed == HealthStatus.UNKNOWN  # Timeout treated as unknown

    def test_get_health_report(self):
        """The report has an overall status and one entry per component."""
        from health_monitor import HealthMonitor, HealthStatus

        health = HealthMonitor()
        health.register_component(
            "db", self._constant_check(HealthStatus.HEALTHY), True
        )
        health.register_component(
            "api", self._constant_check(HealthStatus.DEGRADED), False
        )

        summary = health.get_health_report()

        for required_key in ("overall_status", "components"):
            assert required_key in summary
        assert len(summary["components"]) == 2


# ============================================================================
# CLEANUP WORKER TESTS
# ============================================================================

class TestCleanupWorker:
    """Exercise CleanupWorker against a patched ``get_db_connection``."""

    def test_cleanup_worker_initialization(self):
        """CleanupWorker stores its interval and max_age_hours."""
        from cleanup_worker import CleanupWorker, CleanupPolicy

        janitor = CleanupWorker(interval=60, max_age_hours=24)

        assert janitor.interval == 60
        assert janitor.max_age_hours == 24

    def test_cleanup_expired_directives(self):
        """The supplied delete_func is invoked at least once for expired rows."""
        from cleanup_worker import CleanupWorker

        janitor = CleanupWorker(max_age_hours=0)  # Very short for testing
        delete_calls = []

        def recording_delete(query, params):
            delete_calls.append((query, params))

        with patch('cleanup_worker.get_db_connection') as connect_mock:
            expired_rows = [
                {
                    "directive_id": directive_id,
                    "created_at": datetime.utcnow() - timedelta(hours=2),
                }
                for directive_id in ("exp-1", "exp-2")
            ]
            fake_cursor = Mock()
            fake_cursor.fetchall.return_value = expired_rows
            connect_mock.return_value.cursor.return_value = fake_cursor

            janitor.cleanup_expired_directives(delete_func=recording_delete)

            assert len(delete_calls) > 0

    def test_cleanup_stale_health_checks(self):
        """cleanup_stale_health_checks returns a non-negative count."""
        from cleanup_worker import CleanupWorker

        janitor = CleanupWorker(max_age_hours=1)

        with patch('cleanup_worker.get_db_connection') as connect_mock:
            stale_row = {
                "check_id": "stale-1",
                "last_check": datetime.utcnow() - timedelta(hours=2),
            }
            fake_cursor = Mock()
            fake_cursor.fetchall.return_value = [stale_row]
            connect_mock.return_value.cursor.return_value = fake_cursor

            removed = janitor.cleanup_stale_health_checks()

            assert removed >= 0

    def test_cleanup_duplicate_detection_window(self):
        """Only hashes repeated inside the duplicate window are reported."""
        from cleanup_worker import CleanupWorker

        janitor = CleanupWorker(duplicate_window_seconds=60)

        # "abc123" appears twice within 60 seconds; "def456" is 90 seconds old.
        candidate_rows = [
            {"hash": digest, "created_at": datetime.utcnow() - timedelta(seconds=age)}
            for digest, age in (("abc123", 0), ("abc123", 30), ("def456", 90))
        ]

        with patch('cleanup_worker.get_db_connection') as connect_mock:
            fake_cursor = Mock()
            fake_cursor.fetchall.return_value = candidate_rows
            connect_mock.return_value.cursor.return_value = fake_cursor

            duplicates = janitor.identify_duplicate_hashes()

            assert "abc123" in duplicates  # Within window
            assert "def456" not in duplicates  # Outside window

    def test_cleanup_notification_for_important_expired(self):
        """Only the critical and high priority directives trigger notify_func."""
        from cleanup_worker import CleanupWorker

        janitor = CleanupWorker()
        notified = []

        def recording_notify(directive):
            notified.append(directive)

        expired_batch = [
            {"directive_id": directive_id, "priority": priority, "action": action}
            for directive_id, priority, action in (
                ("imp-1", "critical", "emergency_shutdown"),
                ("imp-2", "high", "security_alert"),
                ("imp-3", "low", "dim_lights"),
            )
        ]

        janitor.notify_important_expired(expired_batch, notify_func=recording_notify)

        assert len(notified) == 2  # Critical and high priority

    def test_cleanup_worker_dry_run(self):
        """In dry-run mode the cursor's execute is never called."""
        from cleanup_worker import CleanupWorker

        janitor = CleanupWorker(dry_run=True)

        with patch('cleanup_worker.get_db_connection') as connect_mock:
            fake_cursor = Mock()
            fake_cursor.fetchall.return_value = [{"directive_id": "to-delete"}]
            connect_mock.return_value.cursor.return_value = fake_cursor

            janitor.cleanup_expired_directives()

            # In dry run, nothing should actually be deleted
            fake_cursor.execute.assert_not_called()

    def test_cleanup_statistics(self):
        """Recorded cleanup counts are returned by get_statistics."""
        from cleanup_worker import CleanupWorker

        janitor = CleanupWorker()
        recorded = (("expired_directives", 10), ("stale_health", 5))
        for category, count in recorded:
            janitor.record_cleanup(category, count)

        totals = janitor.get_statistics()

        assert totals["expired_directives"] == 10
        assert totals["stale_health"] == 5


# ============================================================================
# INTEGRATION TESTS
# ============================================================================

class TestErrorHandlingIntegration:
    """Integration tests for error handling system."""

    def test_full_retry_flow_with_circuit_breaker(self):
        """Test that the retry engine recovers from two transient failures.

        The service fails twice with ConnectionError and succeeds on the
        third call, so the engine must return the recovered value after
        exactly three invocations.
        """
        from retry_engine import RetryEngine
        from circuit_breaker import CircuitBreaker, CircuitState

        call_count = 0

        def flaky_service():
            nonlocal call_count
            call_count += 1
            if call_count <= 2:
                raise ConnectionError("Service unavailable")
            return "recovered"

        # NOTE(review): the breaker is only constructed here; it is not in the
        # call path. In real usage the retry engine would wrap breaker calls.
        breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=0.1)

        engine = RetryEngine(max_retries=3, base_delay=0.01)

        result = engine.execute_with_retry(flaky_service)

        assert result == "recovered"
        assert call_count == 3

    def test_dead_letter_queue_with_retry_integration(self):
        """Test DLQ integrates with retry logic.

        A message taken from the DLQ is re-processed through the retry
        engine; because its handler always fails, processing must raise and
        the message must be recorded as failed exactly once.
        """
        from dead_letter_queue import DeadLetterQueue
        from retry_engine import RetryEngine

        dlq = DeadLetterQueue()
        engine = RetryEngine(max_retries=2, base_delay=0.01)

        failed_messages = []

        def process_with_dlq(msg):
            """Run the message handler under retry; record the message on failure."""

            def handler():
                if msg.get("fail"):
                    raise ConnectionError("fail")
                return "success"

            try:
                return engine.execute_with_retry(handler)
            except Exception:
                failed_messages.append(msg)
                raise

        # Add a message that will fail. A real exception instance is used:
        # the previous bare `Error` name was undefined and raised NameError.
        dlq.enqueue({"id": 1, "fail": True}, error=ConnectionError("fail"), context={})

        item = dlq.dequeue()

        # The handler never succeeds, so processing must raise once the
        # retry engine gives up.
        with pytest.raises(Exception):
            process_with_dlq(item.message)

        assert failed_messages == [item.message]

    def test_health_monitoring_with_cleanup_integration(self):
        """Test health monitoring integrates with cleanup.

        The cleanup worker is registered as a non-critical component whose
        health is derived from its statistics.
        """
        from health_monitor import HealthMonitor, HealthStatus
        from cleanup_worker import CleanupWorker

        monitor = HealthMonitor()
        worker = CleanupWorker()

        def cleanup_health_check():
            stats = worker.get_statistics()
            # If too many recent cleanups failed, mark unhealthy
            if stats.get("failures", 0) > 10:
                return HealthStatus.UNHEALTHY
            return HealthStatus.HEALTHY

        monitor.register_component("cleanup", cleanup_health_check, critical=False)

        result = monitor.check_component("cleanup")

        # No failures are recorded on this worker in the test, so the check
        # is expected to take the healthy branch.
        assert result == HealthStatus.HEALTHY


# ============================================================================
# EDGE CASE TESTS
# ============================================================================

class TestEdgeCases:
    """Edge case and boundary condition tests."""

    def test_zero_max_retries(self):
        """Test retry engine with zero max retries."""
        from retry_engine import RetryEngine

        engine = RetryEngine(max_retries=0)

        def always_fail():
            raise ConnectionError("Fail")

        with pytest.raises(ConnectionError):
            engine.execute_with_retry(always_fail)

    def test_negative_base_delay(self):
        """Test retry engine handles negative base delay."""
        from retry_engine import RetryEngine

        engine = RetryEngine(base_delay=-1)

        # A negative configuration must never yield a negative delay.
        delay = engine.calculate_delay(0)
        assert delay >= 0

    def test_circuit_breaker_zero_threshold(self):
        """Test circuit breaker with zero failure threshold."""
        from circuit_breaker import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=0)

        def fail():
            raise ConnectionError("fail")

        # NOTE(review): call() may re-raise the wrapped failure (confirm
        # against CircuitBreaker). Only the resulting state matters here, so
        # the simulated error is tolerated rather than failing the test.
        try:
            breaker.call(fail)
        except ConnectionError:
            pass

        # Should immediately open on any failure
        assert breaker.state == CircuitState.OPEN

    def test_dlq_empty_dequeue(self):
        """Test dequeuing from empty DLQ."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue()

        result = dlq.dequeue()

        assert result is None

    def test_dlq_peek_empty(self):
        """Test peeking empty DLQ."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue()

        result = dlq.peek()

        assert result is None

    def test_health_monitor_unknown_component(self):
        """Test health check for unknown component."""
        from health_monitor import HealthMonitor

        monitor = HealthMonitor()

        result = monitor.check_component("nonexistent")

        assert result is None

    def test_cleanup_negative_max_age(self):
        """Test cleanup worker with negative max age."""
        from cleanup_worker import CleanupWorker

        worker = CleanupWorker(max_age_hours=-1)

        # A negative value must be normalised to a non-negative max age.
        assert worker.max_age_hours >= 0

    def test_error_context_with_none_metadata(self):
        """Test error context handles None metadata."""
        context = ErrorContext(
            correlation_id="test",
            operation="test",
            component="test",
            metadata=None
        )

        assert context.metadata == {}

    def test_retry_with_none_exception(self):
        """Test retry engine handles None return value."""
        from retry_engine import RetryEngine

        engine = RetryEngine(max_retries=1, base_delay=0.01)

        def return_none():
            return None

        result = engine.execute_with_retry(return_none)

        assert result is None

    def test_concurrent_dlq_operations(self):
        """Test DLQ under concurrent access.

        Five threads enqueue ten items each. The queue is sized well above
        the 50 expected items so no size-limit rejection can occur; every
        enqueue must succeed and every item must be counted.
        """
        from dead_letter_queue import DeadLetterQueue

        worker_count = 5
        items_per_worker = 10

        dlq = DeadLetterQueue(max_size=1000)
        errors = []

        def enqueue_worker(worker_id):
            try:
                for i in range(items_per_worker):
                    # A real exception instance is used: the previous bare
                    # `Error` name was undefined, and the resulting NameError
                    # was silently collected in `errors`.
                    dlq.enqueue(
                        {"worker": worker_id, "item": i},
                        error=Exception("e"),
                        context={},
                    )
            except Exception as e:
                errors.append(e)

        threads = [
            threading.Thread(target=enqueue_worker, args=(i,))
            for i in range(worker_count)
        ]

        for t in threads:
            t.start()
        for t in threads:
            t.join()

        assert errors == []
        assert dlq.size() == worker_count * items_per_worker

    def test_circuit_breaker_concurrent_calls(self):
        """Test circuit breaker under concurrent load.

        No call fails, so the breaker never has a reason to reject one:
        all ten concurrent calls must return the wrapped function's value.
        """
        from circuit_breaker import CircuitBreaker, CircuitState

        caller_count = 10

        breaker = CircuitBreaker(failure_threshold=5, recovery_timeout=60)

        call_results = []

        def call_worker():
            try:
                result = breaker.call(lambda: "success")
                call_results.append(result)
            except Exception as e:
                # Keep the exception so a failure shows up in the assertion.
                call_results.append(e)

        threads = [threading.Thread(target=call_worker) for _ in range(caller_count)]

        for t in threads:
            t.start()
        for t in threads:
            t.join()

        assert call_results == ["success"] * caller_count


# ============================================================================
# SCENARIO TESTS
# ============================================================================

class TestScenarioTests:
    """Real-world scenario tests based on the 10 scenarios."""

    def test_scenario_1_bridge_api_down_queuing(self):
        """Scenario 1: Bridge API is down - directives queued in Telnyx.

        The API fails three times and recovers on the fourth call; the retry
        engine must keep retrying until it returns the recovered value.
        """
        from retry_engine import RetryEngine

        # Small base delay keeps the backoff short; the previous version also
        # slept 2.5s waiting on a circuit breaker that was never invoked.
        engine = RetryEngine(max_retries=5, base_delay=0.01)

        call_count = 0

        def bridge_api():
            nonlocal call_count
            call_count += 1
            if call_count < 4:
                raise ConnectionError("API unavailable")
            return "API recovered"

        result = engine.execute_with_retry(bridge_api)

        assert result == "API recovered"
        assert call_count == 4

    def test_scenario_2_postgres_connection_recovery(self):
        """Scenario 2: PostgreSQL connection lost - recovery with backoff."""
        from retry_engine import RetryEngine

        engine = RetryEngine(max_retries=3, base_delay=0.01)

        call_count = 0

        def db_operation():
            nonlocal call_count
            call_count += 1
            if call_count < 3:
                raise DatabaseError("Connection lost", context=ErrorContext(
                    correlation_id="test",
                    operation="query",
                    component="postgres"
                ))
            return "Connected"

        result = engine.execute_with_retry(db_operation)

        assert result == "Connected"
        assert call_count == 3

    def test_scenario_3_queue_fill_up_alert(self):
        """Scenario 3: Queue fills up - health check reports degradation.

        The queue is filled to its configured capacity and exposed through a
        health check whose thresholds are relative to that capacity, so a
        full queue is reported as degraded.
        """
        from dead_letter_queue import DeadLetterQueue
        from health_monitor import HealthMonitor, HealthStatus

        capacity = 100

        dlq = DeadLetterQueue(max_size=capacity)
        monitor = HealthMonitor()

        # Fill queue to capacity
        for i in range(capacity):
            dlq.enqueue(
                {"id": i, "priority": "high" if i % 2 == 0 else "low"},
                error=Exception("e"),
                context={},
            )

        def queue_health():
            # Thresholds scale with capacity; the earlier fixed 800/1000
            # limits could never be reached by a 100-item queue.
            depth = dlq.size()
            if depth > capacity:
                return HealthStatus.UNHEALTHY
            if depth > capacity * 0.8:
                return HealthStatus.DEGRADED
            return HealthStatus.HEALTHY

        monitor.register_component("queue", queue_health, critical=True)

        result = monitor.check_component("queue")

        # Should be degraded but not unhealthy
        assert result == HealthStatus.DEGRADED

    def test_scenario_4_duplicate_deduplication(self):
        """Scenario 4: Duplicate directives - deduplication within 60s window.

        Checks that the payload hash used to identify duplicates is stable
        for equal payloads and differs for different payloads.
        """
        from cleanup_worker import CleanupWorker

        # NOTE(review): the worker is only constructed with the dedup window;
        # its deduplication logic is not exercised by this test.
        worker = CleanupWorker(duplicate_window_seconds=60)

        payload1 = {"action": "turn_on", "device": "lights"}
        payload2 = {"action": "turn_on", "device": "lights"}  # Same as payload1
        payload3 = {"action": "turn_off", "device": "lights"}  # Different

        def hash_payload(payload):
            # sort_keys makes the JSON form independent of key order.
            return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()

        hash1 = hash_payload(payload1)
        hash2 = hash_payload(payload2)
        hash3 = hash_payload(payload3)

        # Same payload should produce same hash
        assert hash1 == hash2
        assert hash1 != hash3

    def test_scenario_5_expired_directives_cleanup(self):
        """Scenario 5: Expired directives - cleanup worker notification."""
        from cleanup_worker import CleanupWorker

        worker = CleanupWorker(max_age_hours=1)

        important_expired = [
            {"directive_id": "1", "priority": "critical", "action": "emergency"},
            {"directive_id": "2", "priority": "high", "action": "alert"},
            {"directive_id": "3", "priority": "medium", "action": "reminder"},
        ]

        notifications = []

        def notify(directive):
            notifications.append(directive)

        worker.notify_important_expired(important_expired, notify_func=notify)

        # Critical and high should be notified
        assert len(notifications) == 2

    def test_scenario_6_telnyx_webhook_failure(self):
        """Scenario 6: Telnyx webhook failures - dead letter queue."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue(max_size=1000)

        # Simulate webhook failure
        webhook_payload = {
            "directive_id": "test-123",
            "action": "voice_command",
            "transcript": "turn on the lights"
        }

        try:
            raise TelnyxError(
                message="Webhook delivery failed",
                telnyx_code="delivery_failed",
                context=ErrorContext(
                    correlation_id="webhook-123",
                    operation="webhook",
                    component="telnyx"
                )
            )
        except TelnyxError as e:
            # The failed payload is parked in the DLQ together with its error.
            dlq.enqueue(webhook_payload, e, {"webhook_id": "123"})

        assert dlq.size() == 1

    def test_scenario_7_claude_poller_crash_watchdog(self):
        """Scenario 7: Claude Code poller crashes - watchdog process."""
        from health_monitor import HealthMonitor, HealthStatus

        monitor = HealthMonitor(check_interval=1)

        crash_count = 0

        def poller_health():
            # Reads the enclosing crash counter at check time.
            if crash_count > 3:
                return HealthStatus.UNHEALTHY
            return HealthStatus.HEALTHY

        monitor.register_component("claude_poller", poller_health, critical=True)

        # Simulate crashes
        crash_count = 5

        result = monitor.check_component("claude_poller")

        assert result == HealthStatus.UNHEALTHY

    def test_scenario_8_invalid_payload_validation(self):
        """Scenario 8: Invalid payloads - validation with clear errors."""

        def validate_directive(directive):
            """Raise ValidationError listing every required field that is missing."""
            required_fields = ("directive_id", "payload", "phone_number")
            errors = [
                f"Missing {field_name}"
                for field_name in required_fields
                if field_name not in directive
            ]

            if errors:
                raise ValidationError(
                    message=f"Validation failed: {', '.join(errors)}",
                    context=ErrorContext(
                        correlation_id="validation-123",
                        operation="validate",
                        component="api"
                    )
                )

            return True

        # Test invalid payload
        invalid_payload = {"some_field": "value"}

        with pytest.raises(ValidationError) as exc:
            validate_directive(invalid_payload)

        assert "directive_id" in str(exc.value)

    def test_scenario_9_rate_limiting(self):
        """Scenario 9: Rate limiting exceeded - queue and process gradually."""
        from retry_engine import RetryEngine

        engine = RetryEngine(max_retries=5, base_delay=0.01)

        call_count = 0

        def rate_limited_api():
            nonlocal call_count
            call_count += 1

            if call_count < 3:
                raise RateLimitError(
                    message="Rate limit exceeded",
                    retry_after=1,
                    context=ErrorContext(
                        correlation_id="rate-limit-123",
                        operation="api_call",
                        component="telnyx"
                    )
                )

            return "success"

        result = engine.execute_with_retry(rate_limited_api)

        assert result == "success"
        assert call_count == 3

    def test_scenario_10_network_partition_local_cache(self):
        """Scenario 10: Network partition - local cache and sync."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue(max_size=1000)

        # Simulate offline mode - queue locally
        offline_messages = [
            {"directive_id": f"offline-{i}", "action": "test"}
            for i in range(10)
        ]

        for msg in offline_messages:
            dlq.enqueue(msg, error=Exception("offline"), context={"offline": True})

        assert dlq.size() == 10

        # When reconnected, process queue
        processed = 0
        while not dlq.is_empty():
            item = dlq.dequeue()
            if item:
                processed += 1

        assert processed == 10


# ============================================================================
# REGRESSION TESTS
# ============================================================================

class TestRegressionTests:
    """Regression tests to prevent previously fixed bugs."""

    def test_error_context_correlation_id_not_none(self):
        """Regression: Ensure correlation ID is never None."""
        context = create_error_context("test", "test")
        assert context.correlation_id is not None
        assert len(context.correlation_id) > 0

    def test_retry_delay_never_negative(self):
        """Regression: Ensure retry delay is never negative."""
        from retry_engine import RetryEngine

        engine = RetryEngine(base_delay=-100)

        for attempt in range(10):
            delay = engine.calculate_delay(attempt)
            assert delay >= 0

    def test_circuit_breaker_state_persists(self):
        """Regression: Ensure circuit breaker state persists correctly.

        After enough failures to reach the threshold the breaker must be
        OPEN, and it must still be OPEN after a further call is attempted.
        """
        from circuit_breaker import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=2, recovery_timeout=60)

        def fail():
            raise ConnectionError("fail")

        # Open the circuit.
        # NOTE(review): call() may re-raise the wrapped failure (confirm
        # against CircuitBreaker); the simulated error is tolerated so the
        # test reaches the state assertions either way.
        for _ in range(2):
            try:
                breaker.call(fail)
            except ConnectionError:
                pass

        assert breaker.state == CircuitState.OPEN

        # Try another operation - state should still be open
        with pytest.raises(Exception):
            breaker.call(lambda: "success")

        assert breaker.state == CircuitState.OPEN

    def test_dlq_size_accurate(self):
        """Regression: Ensure DLQ size is accurate."""
        from dead_letter_queue import DeadLetterQueue

        dlq = DeadLetterQueue()

        for i in range(5):
            # A real exception instance is used: the previous bare `Error`
            # name was undefined and raised NameError.
            dlq.enqueue({"id": i}, error=Exception("e"), context={})

        assert dlq.size() == 5

        dlq.dequeue()
        assert dlq.size() == 4

    def test_health_monitor_thread_safety(self):
        """Regression: Ensure health monitor is thread-safe.

        Five threads each register ten uniquely named components; none of
        the registrations may be lost.
        """
        from health_monitor import HealthMonitor, HealthStatus

        thread_count = 5
        registrations_per_thread = 10

        monitor = HealthMonitor()

        def register_and_check():
            for i in range(registrations_per_thread):
                name = f"component-{threading.current_thread().name}-{i}"
                monitor.register_component(
                    name,
                    lambda: HealthStatus.HEALTHY,
                    critical=False
                )

        # Explicit thread names keep the component names unique and
        # deterministic across runs.
        threads = [
            threading.Thread(target=register_and_check, name=f"registrar-{n}")
            for n in range(thread_count)
        ]

        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # All components should be registered
        assert len(monitor.components) >= thread_count * registrations_per_thread


if __name__ == "__main__":
    # Propagate pytest's exit code so a shell or CI job running this file
    # directly sees a non-zero status when tests fail.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))