"""
core/coherence/bulkhead.py

BulkheadGuard — asyncio.gather Exception Isolation.

Runs multiple agent coroutines concurrently while fully isolating each
failure.  One crashed agent NEVER brings down the entire swarm.

Usage::

    guard = BulkheadGuard(cold_ledger=ledger)
    tasks = [
        ("agent-1", some_coroutine()),
        ("agent-2", another_coroutine()),
    ]
    results = await guard.run_with_bulkhead(tasks)
    rate = guard.get_success_rate(results)

# VERIFICATION_STAMP
# Story: 6.07
# Verified By: parallel-builder
# Verified At: 2026-02-25
# Tests: 13/13
# Coverage: 100%
"""

from __future__ import annotations

import asyncio
import logging
from dataclasses import dataclass, field
from typing import Any, Coroutine, Optional

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Minimum acceptable fraction of successful tasks in one bulkhead run.
# BulkheadGuard.run_with_bulkhead compares the run's success rate against
# this value with a strict ``<``, so a rate of exactly 0.5 does NOT trigger
# the critical-failure event.
CRITICAL_THRESHOLD: float = 0.5
"""Success rate below which a swarm_critical_failure event is emitted."""


# ---------------------------------------------------------------------------
# Result dataclass
# ---------------------------------------------------------------------------


@dataclass
class BulkheadResult:
    """
    Outcome record for one task executed through the bulkhead.

    Exactly one of ``result`` / ``error`` is expected to carry data: the
    success path fills ``result``, the failure path fills ``error``.

    Attributes:
        agent_id: Name of the agent whose coroutine produced this record.
        success:  ``True`` when the coroutine finished without raising.
        result:   Value returned by the coroutine; ``None`` on failure.
        error:    Text describing the captured exception; ``None`` on
                  success.
    """

    agent_id: str
    success: bool
    # Both payload slots default to None so callers only pass the one that
    # applies to the path they are recording.
    result: Optional[dict] = None
    error: Optional[str] = None


# ---------------------------------------------------------------------------
# BulkheadGuard
# ---------------------------------------------------------------------------


class BulkheadGuard:
    """
    Runs multiple agent tasks concurrently while isolating failures.

    Uses ``asyncio.gather(*coros, return_exceptions=True)`` so that a
    RuntimeError or any other exception raised inside one coroutine does
    NOT propagate to the caller — it is captured and returned as a
    ``BulkheadResult(success=False, error=...)``.

    Args:
        cold_ledger: Optional ColdLedger-compatible object that exposes an
                     async ``write_event(event_type: str, payload: dict)``
                     method.  When provided and the swarm success rate drops
                     below :data:`CRITICAL_THRESHOLD`, a
                     ``swarm_critical_failure`` event is written.
    """

    def __init__(self, cold_ledger=None) -> None:
        self.cold_ledger = cold_ledger

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def run_with_bulkhead(
        self,
        tasks: list[tuple[str, Coroutine[Any, Any, Any]]],
    ) -> list[BulkheadResult]:
        """
        Run tasks concurrently with full exception isolation.

        Args:
            tasks: ``(agent_id, coroutine)`` pairs.  A list is expected, but
                   any iterable (including a one-shot generator) is accepted
                   because the input is snapshotted exactly once.  ``None``
                   or an empty iterable yields an empty result list.

        Returns:
            List of :class:`BulkheadResult`, one per input task, in the
            same order.  Exceptions raised by the coroutines are never
            re-raised — each is captured in its task's result.

        Side effects:
            Logs a warning per failed task and, when the success rate is
            below :data:`CRITICAL_THRESHOLD`, reports a critical event via
            :meth:`_emit_critical_event`.
        """
        # Snapshot the input once.  Iterating ``tasks`` twice (once for ids,
        # once for coroutines) silently drops every task when the caller
        # hands in a generator: the first pass exhausts it and the second
        # sees nothing.
        pairs = list(tasks or ())
        if not pairs:
            return []

        # return_exceptions=True means exceptions are returned as values,
        # not re-raised.  This is the core of the bulkhead pattern.
        raw_outcomes = await asyncio.gather(
            *(coro for _, coro in pairs),
            return_exceptions=True,
        )

        results: list[BulkheadResult] = []
        for (agent_id, _), outcome in zip(pairs, raw_outcomes):
            # BaseException (not Exception) so that a child task's
            # CancelledError, which gather also returns as a value, is
            # recorded as a failure rather than as a "successful" result.
            if isinstance(outcome, BaseException):
                results.append(
                    BulkheadResult(
                        agent_id=agent_id,
                        success=False,
                        result=None,
                        error=self._describe_failure(outcome),
                    )
                )
                logger.warning(
                    "BulkheadGuard: agent %s raised %s: %s",
                    agent_id,
                    type(outcome).__name__,
                    outcome,
                )
            else:
                results.append(
                    BulkheadResult(
                        agent_id=agent_id,
                        success=True,
                        result=outcome,
                        error=None,
                    )
                )

        # Check for critical failure threshold
        success_rate = self.get_success_rate(results)
        if success_rate < CRITICAL_THRESHOLD:
            await self._emit_critical_event(results, success_rate)

        return results

    def get_success_rate(self, results: list[BulkheadResult]) -> float:
        """
        Calculate the fraction of successful results.

        Args:
            results: List of :class:`BulkheadResult` from a prior
                     :meth:`run_with_bulkhead` call.

        Returns:
            Float in ``[0.0, 1.0]``.  Returns ``1.0`` for an empty list
            (vacuous truth — no failures occurred).
        """
        if not results:
            return 1.0
        return sum(1 for r in results if r.success) / len(results)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _describe_failure(exc: BaseException) -> str:
        """
        Return a non-empty description of a captured exception.

        ``str(exc)`` is used whenever it carries text.  Exceptions raised
        without a message (e.g. ``RuntimeError()`` or a timeout) stringify
        to ``""``, which would leave the failure record with a falsy,
        uninformative ``error``; in that case the exception class name is
        used instead.
        """
        return str(exc) or type(exc).__name__

    async def _emit_critical_event(
        self,
        results: list[BulkheadResult],
        success_rate: float,
    ) -> None:
        """
        Write a ``swarm_critical_failure`` event to the ColdLedger.

        When no ledger is configured only a warning is logged.  Errors
        raised by the ledger write are logged and swallowed so that
        reporting can never fail the bulkhead run itself.

        Args:
            results:      Full list of :class:`BulkheadResult` from the run.
            success_rate: Pre-calculated success rate (< CRITICAL_THRESHOLD).
        """
        if self.cold_ledger is None:
            logger.warning(
                "BulkheadGuard: critical failure (rate=%.2f) but no ColdLedger configured",
                success_rate,
            )
            return

        failed_agents = [r.agent_id for r in results if not r.success]
        payload = {
            "success_rate": success_rate,
            "total_tasks": len(results),
            "failed_count": len(failed_agents),
            "failed_agents": failed_agents,
        }

        try:
            await self.cold_ledger.write_event(
                event_type="swarm_critical_failure",
                payload=payload,
            )
            logger.critical(
                "BulkheadGuard: swarm_critical_failure written "
                "(rate=%.2f, failed=%s)",
                success_rate,
                failed_agents,
            )
        except Exception as exc:
            # Never allow ledger writes to crash the bulkhead itself
            logger.error(
                "BulkheadGuard: failed to write critical event to ColdLedger: %s",
                exc,
            )
