"""
tests/track_b/test_story_8_04.py

Story 8.04: ShadowArena — Containerized Test Sandbox

Black Box Tests (BB1–BB4):
    BB1  Proposal that fixes 9/10 failed sagas → pass_rate=0.9, ready_for_pr=True
    BB2  Proposal with axiom violation → ready_for_pr=False regardless of pass_rate
    BB3  Arena run written to shadow_arena_runs.jsonl (tmp_path)
    BB4  pass_rate=0.7 (below 0.8) → ready_for_pr=False even with no axiom violations

White Box Tests (WB1–WB4):
    WB1  Shadow mode blocks external calls (Redis keys use SHADOW prefix)
    WB2  ready_for_pr requires BOTH pass_rate >= 0.8 AND axiom_violations == []
    WB3  Axiom check uses AxiomaticTests.run_all() (mock verified)
    WB4  improved_metrics contains old_success_rate and new_success_rate

ALL tests use mocks — no real Postgres/Redis. tmp_path used for file I/O.
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from unittest.mock import MagicMock, call, patch

import pytest

# ---------------------------------------------------------------------------
# Path bootstrap
# ---------------------------------------------------------------------------

# Hard-coded project root so `core.evolution` imports resolve even when this
# file is executed outside the repo's configured environment (e.g. directly
# via `python tests/track_b/test_story_8_04.py`).
GENESIS_ROOT = "/mnt/e/genesis-system"
if GENESIS_ROOT not in sys.path:
    sys.path.insert(0, GENESIS_ROOT)

# ---------------------------------------------------------------------------
# Imports under test
# ---------------------------------------------------------------------------

from core.evolution.shadow_arena import (  # noqa: E402
    ShadowArena,
    ArenaResult,
    SHADOW_PREFIX,
)


# ---------------------------------------------------------------------------
# Helpers / factories
# ---------------------------------------------------------------------------


def _make_pg(sagas: list[dict]) -> MagicMock:
    """Build a mock Postgres connection that returns the supplied sagas."""
    rows = [(s["saga_id"], s.get("inputs", {}), s.get("success", False)) for s in sagas]
    cursor = MagicMock()
    cursor.fetchall.return_value = rows
    pg = MagicMock()
    pg.cursor.return_value = cursor
    return pg


def _make_redis() -> MagicMock:
    """Build a mock Redis client."""
    redis = MagicMock()
    redis.set = MagicMock()
    redis.get = MagicMock(return_value=None)
    return redis


def _make_axiomatic_clean() -> MagicMock:
    """Return a mock AxiomaticTests that reports no violations."""
    ax = MagicMock()
    ax_result = MagicMock()
    ax_result.violations = []
    ax.run_all.return_value = ax_result
    return ax


def _make_axiomatic_violated(violation_id: str = "AXIOM_NO_SQLITE") -> MagicMock:
    """Return a mock AxiomaticTests that reports one axiom violation."""
    ax = MagicMock()
    violation = MagicMock()
    violation.axiom_id = violation_id
    ax_result = MagicMock()
    ax_result.violations = [violation]
    ax.run_all.return_value = ax_result
    return ax


def _make_sagas(count: int, *, success: bool = False) -> list[dict]:
    """Generate `count` saga dicts, all with the same success flag."""
    return [
        {"saga_id": f"saga-{i:03d}", "inputs": {"step": i}, "success": success}
        for i in range(count)
    ]


def _make_arena(
    sagas: list[dict],
    axiomatic_tests,
    tmp_path: Path,
    redis=None,
) -> ShadowArena:
    """Construct a ShadowArena wired to mocked PG/Redis, logging under tmp_path.

    A fresh mock Redis client is created unless one is supplied (callers pass
    their own when they need to inspect recorded calls afterwards).
    """
    if redis is None:
        redis = _make_redis()
    return ShadowArena(
        pg_connection=_make_pg(sagas),
        redis_client=redis,
        axiomatic_tests=axiomatic_tests,
        log_path=str(tmp_path / "shadow_arena_runs.jsonl"),
    )


# ---------------------------------------------------------------------------
# A lightweight fake "proposal module" that passes all sagas
# ---------------------------------------------------------------------------

class _FakePassModule:
    """Fake proposed module: run_saga always returns True."""
    @staticmethod
    def run_saga(_inputs: dict) -> bool:
        return True


class _FakeFailModule:
    """Fake proposed module: run_saga always returns False."""
    @staticmethod
    def run_saga(_inputs: dict) -> bool:
        return False


class _FakeNineOutOfTenModule:
    """Fake proposed module: first 9 calls pass, 10th fails."""
    def __init__(self):
        self._count = 0

    def run_saga(self, _inputs: dict) -> bool:
        self._count += 1
        return self._count <= 9


# ===========================================================================
# BB TESTS — Black Box
# ===========================================================================


def test_bb1_nine_of_ten_fixed_pass_rate_09_ready_for_pr(tmp_path):
    """BB1: Proposal that fixes 9/10 failed sagas → pass_rate=0.9, ready_for_pr=True."""
    sagas = _make_sagas(10)
    arena = _make_arena(sagas, _make_axiomatic_clean(), tmp_path)
    saga_ids = [s["saga_id"] for s in sagas]

    with patch.object(
        ShadowArena, "_try_import_module", return_value=_FakeNineOutOfTenModule()
    ):
        result = arena.evaluate_proposal(
            proposal_branch="core.evolution.candidate_v1",
            test_saga_ids=saga_ids,
        )

    assert isinstance(result, ArenaResult)
    assert abs(result.pass_rate - 0.9) < 1e-9, f"Expected 0.9, got {result.pass_rate}"
    assert result.ready_for_pr is True, "Expected ready_for_pr=True for pass_rate=0.9 + no violations"
    assert result.axiom_violations == []


def test_bb2_axiom_violation_blocks_ready_for_pr(tmp_path):
    """BB2: Proposal with axiom violation → ready_for_pr=False regardless of pass_rate."""
    sagas = _make_sagas(10)
    arena = _make_arena(sagas, _make_axiomatic_violated("AXIOM_NO_SQLITE"), tmp_path)
    saga_ids = [s["saga_id"] for s in sagas]

    # Module that passes ALL sagas (pass_rate = 1.0)
    with patch.object(ShadowArena, "_try_import_module", return_value=_FakePassModule()):
        result = arena.evaluate_proposal(
            proposal_branch="core.evolution.bad_candidate",
            test_saga_ids=saga_ids,
        )

    assert result.pass_rate == 1.0, f"Expected pass_rate=1.0, got {result.pass_rate}"
    assert "AXIOM_NO_SQLITE" in result.axiom_violations
    assert result.ready_for_pr is False, (
        "Expected ready_for_pr=False when axiom violation present, even with pass_rate=1.0"
    )


def test_bb3_arena_run_written_to_jsonl(tmp_path):
    """BB3: Arena run is written to shadow_arena_runs.jsonl after evaluate_proposal."""
    sagas = _make_sagas(5)
    # _make_arena logs to tmp_path / "shadow_arena_runs.jsonl" by construction.
    arena = _make_arena(sagas, _make_axiomatic_clean(), tmp_path)
    log_file = tmp_path / "shadow_arena_runs.jsonl"

    with patch.object(ShadowArena, "_try_import_module", return_value=_FakePassModule()):
        arena.evaluate_proposal(
            proposal_branch="core.evolution.any_branch",
            test_saga_ids=[s["saga_id"] for s in sagas],
        )

    # File must have been created
    assert log_file.exists(), "shadow_arena_runs.jsonl was not created"

    lines = log_file.read_text(encoding="utf-8").strip().splitlines()
    assert len(lines) == 1, f"Expected 1 log line, got {len(lines)}"

    record = json.loads(lines[0])
    assert record["proposal_branch"] == "core.evolution.any_branch"
    assert record["sagas_fetched"] == 5
    # Every top-level field of the run record must be present.
    for required_key in (
        "pass_rate",
        "axiom_violations",
        "ready_for_pr",
        "improved_metrics",
        "timestamp",
    ):
        assert required_key in record


def test_bb4_low_pass_rate_blocks_ready_for_pr(tmp_path):
    """BB4: pass_rate=0.7 (below 0.8) → ready_for_pr=False even with no axiom violations."""
    sagas = _make_sagas(10)
    arena = _make_arena(sagas, _make_axiomatic_clean(), tmp_path)

    class _SevenOutOfTenModule:
        """Passes only the first 7 of 10 sagas."""

        def __init__(self):
            self._seen = 0

        def run_saga(self, _inputs):
            self._seen += 1
            return self._seen <= 7

    with patch.object(
        ShadowArena, "_try_import_module", return_value=_SevenOutOfTenModule()
    ):
        result = arena.evaluate_proposal(
            proposal_branch="core.evolution.weak_candidate",
            test_saga_ids=[s["saga_id"] for s in sagas],
        )

    assert abs(result.pass_rate - 0.7) < 1e-9, f"Expected 0.7, got {result.pass_rate}"
    assert result.axiom_violations == []
    assert result.ready_for_pr is False, (
        "Expected ready_for_pr=False when pass_rate=0.7 (threshold is 0.8)"
    )


# ===========================================================================
# WB TESTS — White Box
# ===========================================================================


def test_wb1_shadow_mode_uses_shadow_prefix(tmp_path):
    """WB1: Shadow mode blocks external calls — Redis keys use SHADOW prefix."""
    sagas = _make_sagas(3)
    redis = _make_redis()
    # Hand the arena our own redis mock so its calls can be inspected below.
    arena = _make_arena(sagas, _make_axiomatic_clean(), tmp_path, redis=redis)

    with patch.object(ShadowArena, "_try_import_module", return_value=_FakePassModule()):
        arena.evaluate_proposal(
            proposal_branch="core.evolution.candidate",
            test_saga_ids=[s["saga_id"] for s in sagas],
        )

    # redis.set must have been called once per saga with SHADOW-prefixed key
    assert redis.set.call_count == len(sagas), (
        f"Expected {len(sagas)} redis.set calls, got {redis.set.call_count}"
    )
    for recorded in redis.set.call_args_list:
        key = recorded.args[0]
        assert key.startswith(SHADOW_PREFIX), (
            f"Redis key '{key}' does not start with SHADOW prefix '{SHADOW_PREFIX}'"
        )


def test_wb2_ready_for_pr_requires_both_conditions(tmp_path):
    """WB2: ready_for_pr requires BOTH pass_rate >= 0.8 AND axiom_violations == []."""

    def _evaluate(pass_count: int, total: int, has_violation: bool, tmp_p: Path) -> ArenaResult:
        """Run one arena evaluation where the first `pass_count` sagas succeed."""
        sagas = _make_sagas(total)
        ax = _make_axiomatic_violated() if has_violation else _make_axiomatic_clean()
        arena = _make_arena(sagas, ax, tmp_p)

        class _FirstNPassModule:
            def __init__(self):
                self._seen = 0

            def run_saga(self, _i):
                self._seen += 1
                return self._seen <= pass_count

        with patch.object(
            ShadowArena, "_try_import_module", return_value=_FirstNPassModule()
        ):
            return arena.evaluate_proposal(
                proposal_branch="core.evolution.test_branch",
                test_saga_ids=[s["saga_id"] for s in sagas],
            )

    # Case A: pass >= 0.8 AND no violations → True
    verdict = _evaluate(9, 10, has_violation=False, tmp_p=tmp_path / "a")
    assert verdict.ready_for_pr is True, f"Expected True but got {verdict.ready_for_pr}"

    # Case B: pass >= 0.8 BUT violations exist → False
    verdict = _evaluate(9, 10, has_violation=True, tmp_p=tmp_path / "b")
    assert verdict.ready_for_pr is False, f"Expected False (violations) but got {verdict.ready_for_pr}"

    # Case C: no violations BUT pass < 0.8 → False
    verdict = _evaluate(7, 10, has_violation=False, tmp_p=tmp_path / "c")
    assert verdict.ready_for_pr is False, f"Expected False (low pass) but got {verdict.ready_for_pr}"

    # Case D: low pass AND violations → False
    verdict = _evaluate(5, 10, has_violation=True, tmp_p=tmp_path / "d")
    assert verdict.ready_for_pr is False, f"Expected False (both) but got {verdict.ready_for_pr}"


def test_wb3_axiom_check_calls_run_all(tmp_path):
    """WB3: Axiom check uses AxiomaticTests.run_all() — mock verified via call assertion.

    Fix over the previous version: the positional-argument branch only checked
    `len(args) >= 1` even though the documented contract supplies two payloads
    (code_content, state_content). We now require exactly two arguments in
    total, however they are split between positional and keyword, and that any
    keywords used are the documented parameter names.
    """
    sagas = _make_sagas(4, success=False)
    ax = _make_axiomatic_clean()
    arena = _make_arena(sagas, ax, tmp_path)

    with patch.object(ShadowArena, "_try_import_module", return_value=_FakePassModule()):
        arena.evaluate_proposal(
            proposal_branch="core.evolution.some_branch",
            test_saga_ids=[s["saga_id"] for s in sagas],
        )

    # AxiomaticTests.run_all must have been called exactly once
    assert ax.run_all.call_count == 1, (
        f"Expected ax.run_all to be called once, got {ax.run_all.call_count}"
    )
    # Call signature: run_all(code_content=..., state_content=...)
    recorded = ax.run_all.call_args
    assert recorded is not None
    # Exactly two payloads must be supplied, positionally or by keyword.
    supplied = len(recorded.args) + len(recorded.kwargs)
    assert supplied == 2, (
        f"Expected run_all(code_content=..., state_content=...), "
        f"got args={recorded.args}, kwargs={recorded.kwargs}"
    )
    # Any keyword used must be one of the documented parameter names.
    unexpected = set(recorded.kwargs) - {"code_content", "state_content"}
    assert not unexpected, f"Unexpected kwargs to run_all: {unexpected}"


def test_wb4_improved_metrics_contains_success_rates(tmp_path):
    """WB4: improved_metrics contains old_success_rate and new_success_rate."""
    # All 10 sagas historically failed; new branch fixes all 10
    sagas = _make_sagas(10)
    arena = _make_arena(sagas, _make_axiomatic_clean(), tmp_path)

    with patch.object(ShadowArena, "_try_import_module", return_value=_FakePassModule()):
        outcome = arena.evaluate_proposal(
            proposal_branch="core.evolution.perfect_branch",
            test_saga_ids=[s["saga_id"] for s in sagas],
        )

    metrics = outcome.improved_metrics
    for key in ("old_success_rate", "new_success_rate"):
        assert key in metrics, f"Missing {key} in {metrics}"

    # All sagas historically failed → old_success_rate = 0.0
    assert metrics["old_success_rate"] == 0.0, (
        f"Expected old_success_rate=0.0, got {metrics['old_success_rate']}"
    )
    # Branch fixes all → new_success_rate = 1.0
    assert metrics["new_success_rate"] == 1.0, (
        f"Expected new_success_rate=1.0, got {metrics['new_success_rate']}"
    )
    # Delta should be positive
    assert metrics["delta"] > 0, (
        f"Expected delta > 0 when improvement occurred, got {metrics['delta']}"
    )


# ===========================================================================
# Additional edge-case tests
# ===========================================================================


def test_arena_result_is_dataclass():
    """ArenaResult is a proper dataclass with the required fields."""
    import dataclasses

    assert dataclasses.is_dataclass(ArenaResult)
    declared = {f.name for f in dataclasses.fields(ArenaResult)}
    for required in ("pass_rate", "axiom_violations", "improved_metrics", "ready_for_pr"):
        assert required in declared


def test_arena_with_no_pg_connection_uses_synthetic_sagas(tmp_path):
    """Arena operates without a real Postgres connection (synthetic saga fallback)."""
    arena = ShadowArena(
        pg_connection=None,
        redis_client=None,
        axiomatic_tests=_make_axiomatic_clean(),
        log_path=str(tmp_path / "shadow_arena_runs.jsonl"),
    )

    with patch.object(ShadowArena, "_try_import_module", return_value=_FakePassModule()):
        outcome = arena.evaluate_proposal(
            proposal_branch="core.evolution.any",
            test_saga_ids=["saga-a", "saga-b", "saga-c"],
        )

    # All synthetic sagas start as failed; FakePassModule fixes all → pass_rate=1.0
    assert outcome.pass_rate == 1.0
    assert outcome.ready_for_pr is True


def test_exact_pass_rate_boundary_08_is_ready(tmp_path):
    """Exactly 8/10 sagas passing (pass_rate=0.8) → ready_for_pr=True."""
    sagas = _make_sagas(10)
    arena = _make_arena(sagas, _make_axiomatic_clean(), tmp_path)

    class _FirstEightPassModule:
        """Passes exactly the first 8 sagas, then fails."""

        def __init__(self):
            self._seen = 0

        def run_saga(self, _i):
            self._seen += 1
            return self._seen <= 8

    with patch.object(
        ShadowArena, "_try_import_module", return_value=_FirstEightPassModule()
    ):
        outcome = arena.evaluate_proposal(
            proposal_branch="core.evolution.boundary_branch",
            test_saga_ids=[s["saga_id"] for s in sagas],
        )

    assert abs(outcome.pass_rate - 0.8) < 1e-9
    assert outcome.ready_for_pr is True, "Exactly 0.8 should be True (>= 0.8)"


def test_importable_module_without_run_saga_still_passes_sagas(tmp_path):
    """Module with no run_saga function treats all sagas as passed (no regression)."""

    class _BareModule:
        """Deliberately exposes no run_saga attribute."""

    sagas = _make_sagas(5)
    arena = _make_arena(sagas, _make_axiomatic_clean(), tmp_path)

    with patch.object(ShadowArena, "_try_import_module", return_value=_BareModule()):
        outcome = arena.evaluate_proposal(
            proposal_branch="core.evolution.minimal_branch",
            test_saga_ids=[s["saga_id"] for s in sagas],
        )

    assert outcome.pass_rate == 1.0
    assert outcome.ready_for_pr is True


def test_failed_import_fails_all_sagas(tmp_path):
    """When module import fails (returns None), all sagas fail → pass_rate=0.0."""
    sagas = _make_sagas(5)
    arena = _make_arena(sagas, _make_axiomatic_clean(), tmp_path)

    # _try_import_module returning None models an ImportError on the branch.
    with patch.object(ShadowArena, "_try_import_module", return_value=None):
        outcome = arena.evaluate_proposal(
            proposal_branch="core.evolution.nonexistent_branch",
            test_saga_ids=[s["saga_id"] for s in sagas],
        )

    assert outcome.pass_rate == 0.0
    assert outcome.ready_for_pr is False


def test_multiple_arena_runs_append_to_jsonl(tmp_path):
    """Multiple evaluate_proposal calls append separate lines to the JSONL log."""
    sagas = _make_sagas(3)
    # _make_arena logs to tmp_path / "shadow_arena_runs.jsonl" by construction.
    arena = _make_arena(sagas, _make_axiomatic_clean(), tmp_path)
    log_file = tmp_path / "shadow_arena_runs.jsonl"
    saga_ids = [s["saga_id"] for s in sagas]

    with patch.object(ShadowArena, "_try_import_module", return_value=_FakePassModule()):
        for branch in ("core.evolution.v1", "core.evolution.v2"):
            arena.evaluate_proposal(
                proposal_branch=branch,
                test_saga_ids=saga_ids,
            )

    lines = log_file.read_text(encoding="utf-8").strip().splitlines()
    assert len(lines) == 2, f"Expected 2 JSONL records, got {len(lines)}"
    branches = [json.loads(line)["proposal_branch"] for line in lines]
    assert "core.evolution.v1" in branches
    assert "core.evolution.v2" in branches


def test_pkg_exports_shadow_arena_and_arena_result():
    """core.evolution __init__.py exports ShadowArena and ArenaResult."""
    from core.evolution import ArenaResult as exported_result
    from core.evolution import ShadowArena as exported_arena

    assert exported_arena is ShadowArena
    assert exported_result is ArenaResult


# ===========================================================================
# Standalone runner
# ===========================================================================

if __name__ == "__main__":
    import traceback
    import tempfile

    # Each entry pairs a description with a callable that receives the
    # per-run tmp directory; tests that need no directory ignore it.
    #
    # FIX: the previous version built these lambdas inside a
    # tempfile.TemporaryDirectory whose directory was deleted before the loop
    # ran; it only worked because each lambda late-bound a module-global `tmp`
    # that the loop happened to rebind. Passing the path as a lambda parameter
    # makes the binding explicit and removes the dead temporary directory.
    tests = [
        ("BB1: 9/10 sagas fixed → pass_rate=0.9, ready_for_pr=True",
         lambda tmp: test_bb1_nine_of_ten_fixed_pass_rate_09_ready_for_pr(tmp / "bb1")),
        ("BB2: axiom violation → ready_for_pr=False regardless of pass_rate",
         lambda tmp: test_bb2_axiom_violation_blocks_ready_for_pr(tmp / "bb2")),
        ("BB3: arena run written to shadow_arena_runs.jsonl",
         lambda tmp: test_bb3_arena_run_written_to_jsonl(tmp / "bb3")),
        ("BB4: pass_rate=0.7 → ready_for_pr=False",
         lambda tmp: test_bb4_low_pass_rate_blocks_ready_for_pr(tmp / "bb4")),
        ("WB1: shadow mode uses SHADOW prefix on Redis keys",
         lambda tmp: test_wb1_shadow_mode_uses_shadow_prefix(tmp / "wb1")),
        ("WB2: ready_for_pr requires BOTH conditions",
         lambda tmp: test_wb2_ready_for_pr_requires_both_conditions(tmp / "wb2")),
        ("WB3: axiom check calls AxiomaticTests.run_all()",
         lambda tmp: test_wb3_axiom_check_calls_run_all(tmp / "wb3")),
        ("WB4: improved_metrics has old/new success_rate",
         lambda tmp: test_wb4_improved_metrics_contains_success_rates(tmp / "wb4")),
        ("EDGE: ArenaResult is a proper dataclass",
         lambda tmp: test_arena_result_is_dataclass()),
        ("EDGE: no PG connection → synthetic saga fallback",
         lambda tmp: test_arena_with_no_pg_connection_uses_synthetic_sagas(tmp / "e1")),
        ("EDGE: exactly 0.8 pass_rate → ready_for_pr=True",
         lambda tmp: test_exact_pass_rate_boundary_08_is_ready(tmp / "e2")),
        ("EDGE: module without run_saga passes all sagas",
         lambda tmp: test_importable_module_without_run_saga_still_passes_sagas(tmp / "e3")),
        ("EDGE: import failure fails all sagas",
         lambda tmp: test_failed_import_fails_all_sagas(tmp / "e4")),
        ("EDGE: multiple runs append to JSONL",
         lambda tmp: test_multiple_arena_runs_append_to_jsonl(tmp / "e5")),
        ("PKG: core.evolution exports ShadowArena + ArenaResult",
         lambda tmp: test_pkg_exports_shadow_arena_and_arena_result()),
    ]

    passed = 0
    total = len(tests)
    for name, fn in tests:
        # Fresh temporary directory per test, mirroring pytest's tmp_path.
        with tempfile.TemporaryDirectory() as td:
            try:
                fn(Path(td))
                print(f"  [PASS] {name}")
                passed += 1
            except Exception as exc:
                print(f"  [FAIL] {name}: {exc}")
                traceback.print_exc()

    print(f"\n{passed}/{total} tests passed")
    if passed == total:
        print("ALL TESTS PASSED -- Story 8.04 (Track B)")
    else:
        sys.exit(1)
