#!/usr/bin/env python3
"""
Story 3.07 — MVFL Pipeline Integration Test Suite
Tests the full MVFL error-correction pipeline end-to-end.

Coverage:
  BB1:  Syntax error → CorrectionLoop runs → clean output returned on attempt 2
  BB2:  Qdrant scar match (high similarity) → VoyagerDefense blocks → CorrectionLoop runs
  BB3:  3-strike failure → MemGPTEscalation called
  BB4:  External rejection (status_code 422) → triggers external_rejection (severity 3)
  BB5:  Semantic inconsistency (status=completed + result=None) → triggers semantic (severity 2)
  BB6:  Clean output → no trigger, no correction, passes through unchanged
  BB7:  MVFLInterceptor post_execute → triggered output gets mvfl_corrected=True
  BB8:  MVFLInterceptor pre_execute → passes through unchanged (identity)

  WB9:  MVFLTrigger priority: external > semantic > syntax (both present → external wins)
  WB10: CorrectionLoop MAX_CORRECTION_ATTEMPTS constant = 3
  WB11: CorrectionLoop correction prompt always starts with "CORRECTION: "
  WB12: MemGPTEscalation uses ESCALATION_MODEL = "claude-opus-4-6"
  WB13: VoyagerDefense BLOCK_THRESHOLD default = 0.7
  WB14: OutputValidator collects ALL errors (not fail-fast)
  WB15: MVFLInterceptor priority = 90

  INT16: Full pipeline: trigger → voyager → correction → success on attempt 2
  INT17: Full pipeline: trigger → voyager → 3 failures → escalation → Opus result returned
"""
import asyncio
import json
import os
import sys
import tempfile
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch, call

sys.path.insert(0, '/mnt/e/genesis-system')


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def run(coro):
    """Run *coro* to completion on a fresh event loop and return its result.

    Uses asyncio.run() instead of the deprecated
    asyncio.get_event_loop().run_until_complete() pattern: get_event_loop()
    emits a DeprecationWarning on Python 3.10+ and raises a RuntimeError on
    3.12+ when called outside a running loop with no current loop set.
    asyncio.run() creates and tears down a private loop per call, which is
    exactly what these independent, synchronous test invocations need.
    """
    return asyncio.run(coro)


def _trigger_result(triggered: bool, trigger_type: str = "syntax",
                    severity: int = 1, details: str = "error detected"):
    """Construct an MVFLTriggerResult for injection into mocks.

    A clean (non-triggered) result always carries trigger_type=None,
    severity=0 and the canonical "Clean output" details string.
    """
    from core.mvfl.mvfl_trigger import MVFLTriggerResult
    if not triggered:
        return MVFLTriggerResult(
            triggered=False,
            trigger_type=None,
            severity=0,
            details="Clean output",
        )
    return MVFLTriggerResult(
        triggered=True,
        trigger_type=trigger_type,
        severity=severity,
        details=details,
    )


def _voyager_score(should_block: bool = False, score: float = 0.0):
    """Assemble a VoyagerScore (with no matched scars) for injection."""
    from core.mvfl.voyager_defense import VoyagerScore
    fields = {"score": score, "matched_scars": [], "should_block": should_block}
    return VoyagerScore(**fields)


def _correction_result(success: bool, attempts: int = 1, escalated: bool = False):
    """Build a CorrectionResult for injection.

    The embedded output dict mirrors the success flag: a completed payload
    on success, an MVFL escalation error payload on failure.
    """
    from core.mvfl.correction_loop import CorrectionResult
    output = (
        {"task_id": "t1", "status": "completed", "output": "corrected"}
        if success
        else {"task_id": "t1", "status": "error", "error": "MVFL_ESCALATION_REQUIRED"}
    )
    return CorrectionResult(
        success=success,
        output=output,
        attempts=attempts,
        escalated=escalated,
    )


def _make_trigger_mock(triggered=False, trigger_type="syntax",
                       severity=1, details="error detected"):
    """Return a mock MVFLTrigger whose evaluate() always yields one result."""
    fixed_result = _trigger_result(triggered, trigger_type, severity, details)
    trigger = MagicMock()
    trigger.evaluate.return_value = fixed_result
    return trigger


def _make_trigger_mock_sequence(responses):
    """
    Return a mock MVFLTrigger whose evaluate() returns successive
    MVFLTriggerResults from the given list. Repeats last on overflow.
    """
    mock = MagicMock()
    state = {"n": 0}

    def side_effect(output, payload):
        idx = min(state["n"], len(responses) - 1)
        state["n"] += 1
        return responses[idx]

    mock.evaluate.side_effect = side_effect
    return mock


def _make_voyager_mock(should_block=False, score=0.0):
    """Return a mock VoyagerDefense whose score() always yields one value."""
    from core.mvfl.voyager_defense import VoyagerScore
    fixed = VoyagerScore(score=score, matched_scars=[], should_block=should_block)
    defense = MagicMock()
    defense.score.return_value = fixed
    return defense


def _make_correction_loop_mock(success=True, attempts=1, escalated=False):
    """Return a mock CorrectionLoop whose async run() yields a fixed result."""
    canned = _correction_result(success, attempts, escalated)
    loop = MagicMock()
    loop.run = AsyncMock(return_value=canned)
    return loop


def _make_task(task_id: str = "task-001", prompt: str = "Do the thing.") -> dict:
    return {"task_id": task_id, "prompt": prompt}


def _make_interceptor(trigger=None, voyager=None, correction_loop=None):
    """Build an MVFLInterceptor; any dependency not supplied is mocked.

    Defaults are deliberately inert: a clean (non-triggered) trigger, a
    non-blocking voyager, and a bare MagicMock correction loop.
    """
    from core.mvfl.mvfl_interceptor import MVFLInterceptor
    if trigger is None:
        trigger = _make_trigger_mock(triggered=False)
    if voyager is None:
        voyager = _make_voyager_mock(should_block=False)
    if correction_loop is None:
        correction_loop = MagicMock()
    return MVFLInterceptor(
        trigger=trigger,
        voyager=voyager,
        correction_loop=correction_loop,
    )


# ===========================================================================
# BB TESTS — End-to-end pipeline scenarios
# ===========================================================================

# ---------------------------------------------------------------------------
# BB1: Syntax error in output → CorrectionLoop runs → clean output on attempt 2
# ---------------------------------------------------------------------------

def test_bb1_syntax_error_corrected_on_attempt_2():
    """
    BB1: A syntax-triggered output (missing task_id) runs through
    CorrectionLoop and succeeds on the 2nd attempt.

    Fix: the original imported MVFLTrigger here but never used it (the
    trigger is a sequenced mock); the unused import is removed.
    """
    from core.mvfl.correction_loop import CorrectionLoop

    # attempt 1 → still bad; attempt 2 → clean
    trigger_mock = _make_trigger_mock_sequence([
        _trigger_result(True, "syntax", 1, "Missing required field: task_id"),
        _trigger_result(False),
    ])
    voyager_mock = _make_voyager_mock(should_block=False)

    # Every re-dispatch returns the same clean-looking output; whether an
    # attempt counts as fixed is decided by the sequenced trigger above.
    dispatch = AsyncMock(
        return_value={"task_id": "t-bb1", "status": "completed", "output": "fixed"}
    )

    loop = CorrectionLoop(
        trigger=trigger_mock,
        voyager=voyager_mock,
        dispatch_fn=dispatch,
    )

    initial_trigger = _trigger_result(True, "syntax", 1, "Missing required field: task_id")
    result = run(loop.run(
        task_payload={"task_id": "t-bb1", "prompt": "Synthesize report"},
        failed_output={"status": "completed"},  # missing task_id
        trigger_result=initial_trigger,
    ))

    assert result.success is True, f"BB1: Expected success=True, got {result}"
    assert result.attempts == 2, f"BB1: Expected 2 attempts, got {result.attempts}"
    assert result.escalated is False, f"BB1: Expected not escalated, got {result}"
    assert result.output.get("status") == "completed", f"BB1: Bad output dict: {result.output}"
    print("BB1 PASSED — syntax error corrected on attempt 2")


# ---------------------------------------------------------------------------
# BB2: Qdrant scar match → VoyagerDefense blocks → CorrectionLoop runs
# ---------------------------------------------------------------------------

def test_bb2_voyager_scar_match_triggers_correction():
    """
    BB2: When VoyagerDefense returns should_block=True (high scar similarity),
    MVFLInterceptor must invoke CorrectionLoop even if MVFLTrigger is clean.
    """
    # Correction succeeds in one attempt so the interceptor can mark the result.
    correction_mock = _make_correction_loop_mock(success=True, attempts=1)

    interceptor = _make_interceptor(
        trigger=_make_trigger_mock(triggered=False),      # trigger is clean
        voyager=_make_voyager_mock(should_block=True, score=0.92),  # voyager blocks
        correction_loop=correction_mock,
    )

    result = {"task_id": "t-bb2", "status": "completed", "output": "suspicious content"}
    # Return value is ignored: the assertion below relies on post_execute
    # mutating `result` in place.
    run(interceptor.post_execute(result, _make_task("t-bb2")))

    correction_mock.run.assert_awaited_once()
    assert result.get("mvfl_corrected") is True, (
        f"BB2: Expected mvfl_corrected=True after voyager block, got: {result}"
    )
    print("BB2 PASSED — Qdrant scar match blocks and triggers CorrectionLoop")


# ---------------------------------------------------------------------------
# BB3: 3-strike failure → MemGPTEscalation called
# ---------------------------------------------------------------------------

def test_bb3_three_strike_triggers_memgpt_escalation():
    """
    BB3: Three consecutive failed correction attempts trigger MemGPTEscalation.
    The returned CorrectionResult must have escalated=True.
    """
    from core.mvfl.correction_loop import CorrectionLoop
    from core.mvfl.memgpt_escalation import MemGPTEscalation

    # All three re-evaluations still report an external rejection → 3-strike.
    trigger_mock = _make_trigger_mock_sequence([
        _trigger_result(True, "external_rejection", 3, "HTTP 503"),
        _trigger_result(True, "external_rejection", 3, "HTTP 503"),
        _trigger_result(True, "external_rejection", 3, "HTTP 503"),
    ])
    voyager_mock = _make_voyager_mock(should_block=False)
    dispatch = AsyncMock(
        return_value={"task_id": "t-bb3", "status_code": 503}
    )

    # What the (mocked) Opus escalation dispatch hands back on resolution.
    escalation_output = {
        "task_id": "t-bb3",
        "status": "completed",
        "output": "Opus resolved it",
        "model": "claude-opus-4-6",
    }
    escalation_dispatch = AsyncMock(return_value=escalation_output)

    escalation = MemGPTEscalation(dispatch_fn=escalation_dispatch)

    # Adapter matching the escalation_fn signature CorrectionLoop expects.
    async def escalation_fn(task_payload, failed_output):
        return await escalation.escalate(task_payload, failed_output)

    loop = CorrectionLoop(
        trigger=trigger_mock,
        voyager=voyager_mock,
        dispatch_fn=dispatch,
        escalation_fn=escalation_fn,
    )

    initial_trigger = _trigger_result(True, "external_rejection", 3, "HTTP 503")
    result = run(loop.run(
        task_payload={"task_id": "t-bb3", "prompt": "Call external API"},
        failed_output={"task_id": "t-bb3", "status_code": 503},
        trigger_result=initial_trigger,
    ))

    assert result.success is False, f"BB3: Expected success=False after 3-strike, got {result}"
    assert result.escalated is True, f"BB3: Expected escalated=True, got {result}"
    assert result.attempts == 3, f"BB3: Expected 3 attempts, got {result.attempts}"
    # escalation dispatch was called
    escalation_dispatch.assert_awaited_once()
    print("BB3 PASSED — 3-strike failure triggers MemGPTEscalation")


# ---------------------------------------------------------------------------
# BB4: External rejection (status_code 422) → triggers external_rejection (severity 3)
# ---------------------------------------------------------------------------

def test_bb4_external_rejection_422_triggers_severity_3():
    """
    BB4: An output with status_code=422 must trigger the external_rejection
    condition at severity 3 (highest priority).
    """
    from core.mvfl.mvfl_trigger import MVFLTrigger

    # Real (unmocked) trigger — this test exercises the actual condition logic.
    trigger = MVFLTrigger()
    output = {"task_id": "t-bb4", "status_code": 422, "status": "error"}
    result = trigger.evaluate(output, {})

    assert result.triggered is True, f"BB4: Expected triggered=True for 422, got {result}"
    assert result.trigger_type == "external_rejection", (
        f"BB4: Expected 'external_rejection', got '{result.trigger_type}'"
    )
    assert result.severity == 3, f"BB4: Expected severity=3, got {result.severity}"
    assert "422" in result.details, f"BB4: Expected '422' in details: {result.details}"
    print("BB4 PASSED — status_code 422 triggers external_rejection at severity 3")


# ---------------------------------------------------------------------------
# BB5: Semantic inconsistency (status=completed + result=None) → semantic, severity 2
# ---------------------------------------------------------------------------

def test_bb5_semantic_inconsistency_completed_with_no_result():
    """
    BB5: An output with status='completed' but result=None must trigger
    the semantic inconsistency condition at severity 2.
    """
    from core.mvfl.mvfl_trigger import MVFLTrigger

    # Real trigger: a "completed" status with a None result is contradictory.
    trigger = MVFLTrigger()
    output = {
        "task_id": "t-bb5",
        "status": "completed",
        "result": None,
    }
    result = trigger.evaluate(output, {})

    assert result.triggered is True, f"BB5: Expected triggered=True, got {result}"
    assert result.trigger_type == "semantic", (
        f"BB5: Expected 'semantic', got '{result.trigger_type}'"
    )
    assert result.severity == 2, f"BB5: Expected severity=2, got {result.severity}"
    # Details wording is not pinned exactly — either phrasing is acceptable.
    assert "result=None" in result.details or "completed" in result.details, (
        f"BB5: Expected relevant details, got: {result.details}"
    )
    print("BB5 PASSED — semantic inconsistency (completed + result=None) detected at severity 2")


# ---------------------------------------------------------------------------
# BB6: Clean output → no trigger, no correction, passes through unchanged
# ---------------------------------------------------------------------------

def test_bb6_clean_output_no_trigger_no_correction():
    """
    BB6: A fully valid output must pass through MVFLInterceptor with zero
    modifications — no mvfl_corrected, no mvfl_escalated keys added.
    """
    correction_mock = _make_correction_loop_mock(success=True)

    interceptor = _make_interceptor(
        trigger=_make_trigger_mock(triggered=False),
        voyager=_make_voyager_mock(should_block=False),
        correction_loop=correction_mock,
    )

    result = {
        "task_id": "t-bb6",
        "status": "completed",
        "output": "Report generated successfully",
    }
    # Snapshot the key set before the call to detect any added keys afterwards.
    original_keys = set(result.keys())

    run(interceptor.post_execute(result, _make_task("t-bb6")))

    assert "mvfl_corrected" not in result, (
        f"BB6: Clean output should not get mvfl_corrected key. result={result}"
    )
    assert "mvfl_escalated" not in result, (
        f"BB6: Clean output should not get mvfl_escalated key. result={result}"
    )
    correction_mock.run.assert_not_awaited()
    assert set(result.keys()) == original_keys, (
        f"BB6: No keys should be added to a clean result. Extra: {set(result.keys()) - original_keys}"
    )
    print("BB6 PASSED — clean output passes through unchanged, CorrectionLoop not called")


# ---------------------------------------------------------------------------
# BB7: MVFLInterceptor post_execute → triggered output gets mvfl_corrected=True
# ---------------------------------------------------------------------------

def test_bb7_post_execute_triggered_output_gets_mvfl_corrected():
    """
    BB7: When post_execute processes a triggered output and CorrectionLoop
    succeeds, the result dict must contain mvfl_corrected=True.
    """
    correction_mock = _make_correction_loop_mock(success=True, attempts=1)

    # Trigger fires on a syntax condition; voyager stays out of the way.
    interceptor = _make_interceptor(
        trigger=_make_trigger_mock(triggered=True, trigger_type="syntax"),
        voyager=_make_voyager_mock(should_block=False),
        correction_loop=correction_mock,
    )

    result = {"task_id": "t-bb7", "status": "error", "error": "bad format"}
    # The assertions below rely on post_execute mutating `result` in place.
    run(interceptor.post_execute(result, _make_task("t-bb7")))

    assert result.get("mvfl_corrected") is True, (
        f"BB7: Expected mvfl_corrected=True in result, got: {result}"
    )
    assert result.get("mvfl_attempts") == 1, (
        f"BB7: Expected mvfl_attempts=1, got: {result.get('mvfl_attempts')}"
    )
    correction_mock.run.assert_awaited_once()
    print("BB7 PASSED — post_execute sets mvfl_corrected=True on triggered output")


# ---------------------------------------------------------------------------
# BB8: MVFLInterceptor pre_execute → passes through unchanged (identity)
# ---------------------------------------------------------------------------

def test_bb8_pre_execute_is_identity():
    """
    BB8: pre_execute must hand back the very dict object it received,
    with no mutations. MVFL acts on results, never on inputs.
    """
    interceptor = _make_interceptor()

    expected = {"task_id": "t-bb8", "prompt": "Generate a report", "custom_key": 99}
    payload = dict(expected)
    returned = run(interceptor.pre_execute(payload))

    assert returned is payload, (
        "BB8: pre_execute must return the same dict object (identity check)"
    )
    assert returned == expected, f"BB8: Payload should be unmodified, got: {returned}"
    print("BB8 PASSED — pre_execute is a pure identity pass-through")


# ===========================================================================
# WB TESTS — Internals
# ===========================================================================

# ---------------------------------------------------------------------------
# WB9: MVFLTrigger condition priority: external > semantic > syntax
# ---------------------------------------------------------------------------

def test_wb9_trigger_priority_external_wins_over_semantic():
    """
    WB9: When both external rejection AND semantic inconsistency are present,
    MVFLTrigger must return external_rejection (severity 3) — highest priority first.
    """
    from core.mvfl.mvfl_trigger import MVFLTrigger

    trigger = MVFLTrigger()
    # status_code=422 → external_rejection
    # status='completed' + result=None → semantic
    # Both conditions are true simultaneously
    output = {
        "task_id": "t-wb9",
        "status_code": 422,
        "status": "completed",
        "result": None,
    }
    result = trigger.evaluate(output, {})

    # Only one trigger type may be reported; priority decides which one wins.
    assert result.triggered is True, f"WB9: Expected triggered=True"
    assert result.trigger_type == "external_rejection", (
        f"WB9: Expected external_rejection to win priority, got: {result.trigger_type}"
    )
    assert result.severity == 3, (
        f"WB9: External rejection severity must be 3, got: {result.severity}"
    )
    print("WB9 PASSED — external_rejection wins priority over semantic when both present")


def test_wb9b_trigger_priority_semantic_wins_over_syntax():
    """
    WB9b: When both semantic inconsistency AND syntax error are present
    (but no external rejection), semantic (severity 2) wins over syntax (severity 1).
    """
    from core.mvfl.mvfl_trigger import MVFLTrigger

    trigger = MVFLTrigger()
    # status='completed' + result=None → semantic
    # expected_schema requires 'task_id' but it's missing → syntax
    output = {
        "status": "completed",
        "result": None,
        # task_id deliberately missing → syntax error too
    }
    # Schema is passed via the task payload so the syntax check can fire.
    task_payload = {
        "expected_schema": {
            "task_id": {"type": "str", "required": True},
            "status": {"type": "str", "required": True},
        }
    }
    result = trigger.evaluate(output, task_payload)

    assert result.triggered is True, f"WB9b: Expected triggered=True"
    assert result.trigger_type == "semantic", (
        f"WB9b: Semantic should win over syntax, got: {result.trigger_type}"
    )
    assert result.severity == 2, f"WB9b: Severity must be 2 for semantic, got: {result.severity}"
    print("WB9b PASSED — semantic wins priority over syntax when both present")


# ---------------------------------------------------------------------------
# WB10: CorrectionLoop MAX_CORRECTION_ATTEMPTS constant = 3
# ---------------------------------------------------------------------------

def test_wb10_max_correction_attempts_is_3():
    """
    WB10: MAX_CORRECTION_ATTEMPTS — the hard retry ceiling before MemGPT
    escalation kicks in — must be pinned to exactly 3.
    """
    from core.mvfl.correction_loop import MAX_CORRECTION_ATTEMPTS

    hard_limit = 3
    assert MAX_CORRECTION_ATTEMPTS == hard_limit, (
        f"WB10: Expected MAX_CORRECTION_ATTEMPTS=3, got {MAX_CORRECTION_ATTEMPTS}"
    )
    print("WB10 PASSED — MAX_CORRECTION_ATTEMPTS == 3")


# ---------------------------------------------------------------------------
# WB11: CorrectionLoop correction prompt always starts with "CORRECTION: "
# ---------------------------------------------------------------------------

def test_wb11_correction_prompt_starts_with_correction_prefix():
    """
    WB11: Every re-dispatched payload must have a prompt starting with
    'CORRECTION: ' followed by the trigger details.
    """
    from core.mvfl.correction_loop import CorrectionLoop

    captured = []

    # Dispatch stand-in that records each payload it is handed (copied, so
    # later mutations by the loop cannot alter what we inspect).
    async def capture_dispatch(payload):
        captured.append(dict(payload))
        return {"task_id": "t-wb11", "status": "completed", "output": "ok"}

    # First re-eval is clean → loop exits after attempt 1
    trigger_mock = _make_trigger_mock_sequence([_trigger_result(False)])
    voyager_mock = _make_voyager_mock(should_block=False)

    loop = CorrectionLoop(
        trigger=trigger_mock,
        voyager=voyager_mock,
        dispatch_fn=capture_dispatch,
    )

    initial_trigger = _trigger_result(
        True, "semantic", 2, "status=completed but result=None"
    )
    run(loop.run(
        task_payload={"task_id": "t-wb11", "prompt": "Analyze the data"},
        failed_output={"task_id": "t-wb11", "status": "completed", "result": None},
        trigger_result=initial_trigger,
    ))

    assert len(captured) == 1, f"WB11: Expected 1 dispatch call, got {len(captured)}"
    prompt = captured[0]["prompt"]
    assert prompt.startswith("CORRECTION: "), (
        f"WB11: Prompt must start with 'CORRECTION: ', got: {prompt!r}"
    )
    assert "status=completed but result=None" in prompt, (
        f"WB11: Trigger details must appear in correction prompt: {prompt!r}"
    )
    assert "Analyze the data" in prompt, (
        f"WB11: Original prompt must be preserved in correction prompt: {prompt!r}"
    )
    print("WB11 PASSED — correction prompt always starts with 'CORRECTION: '")


# ---------------------------------------------------------------------------
# WB12: MemGPTEscalation uses ESCALATION_MODEL = "claude-opus-4-6"
# ---------------------------------------------------------------------------

def test_wb12_memgpt_escalation_model_is_opus():
    """
    WB12: memgpt_escalation must pin ESCALATION_MODEL to "claude-opus-4-6",
    the model used to resolve tasks after a 3-strike failure.
    """
    from core.mvfl.memgpt_escalation import ESCALATION_MODEL

    expected_model = "claude-opus-4-6"
    assert ESCALATION_MODEL == expected_model, (
        f"WB12: Expected ESCALATION_MODEL='claude-opus-4-6', got '{ESCALATION_MODEL}'"
    )
    print("WB12 PASSED — ESCALATION_MODEL == 'claude-opus-4-6'")


def test_wb12b_memgpt_escalation_dispatches_to_opus():
    """
    WB12b: MemGPTEscalation.escalate() must call dispatch_fn with
    ESCALATION_MODEL as the first positional argument.
    """
    from core.mvfl.memgpt_escalation import MemGPTEscalation, ESCALATION_MODEL

    dispatch_mock = AsyncMock(
        return_value={"task_id": "t-wb12", "status": "completed", "output": "Opus answer"}
    )
    escalation = MemGPTEscalation(dispatch_fn=dispatch_mock)

    run(escalation.escalate(
        task_payload={"task_id": "t-wb12", "prompt": "Urgent resolution needed"},
        failed_output={"task_id": "t-wb12", "status": "error"},
    ))

    dispatch_mock.assert_awaited_once()
    # call_args[0] is the positional-args tuple of the recorded call.
    call_args = dispatch_mock.call_args
    # First positional arg must be the model name
    model_arg = call_args[0][0]
    assert model_arg == ESCALATION_MODEL, (
        f"WB12b: dispatch must be called with '{ESCALATION_MODEL}', got '{model_arg}'"
    )
    print("WB12b PASSED — MemGPTEscalation dispatches to ESCALATION_MODEL")


# ---------------------------------------------------------------------------
# WB13: VoyagerDefense BLOCK_THRESHOLD default = 0.7
# ---------------------------------------------------------------------------

def test_wb13_voyager_block_threshold_default():
    """
    WB13: The BLOCK_THRESHOLD in voyager_defense must default to 0.7.
    Scores >= 0.7 should_block, scores < 0.7 should not.

    Fix: removed the unused `import core.mvfl.voyager_defense as vd_module`
    alias — nothing in the test body referenced it.
    """
    from core.mvfl.voyager_defense import VoyagerDefense, BLOCK_THRESHOLD

    # Default threshold is 0.7 (unless overridden by env var)
    # We rely on the module default since the env is not set to override it
    original_env = os.environ.pop("VOYAGER_BLOCK_THRESHOLD", None)
    try:
        # NOTE: BLOCK_THRESHOLD was imported above, before the env pop; this
        # check asserts the already-bound module default, as intended.
        assert BLOCK_THRESHOLD == 0.7, (
            f"WB13: Expected BLOCK_THRESHOLD=0.7, got {BLOCK_THRESHOLD}"
        )

        # Verify the threshold works correctly in score() via a mocked Qdrant client
        mock_client = MagicMock()

        # Score just at threshold (0.7) → should block
        result_at_threshold = MagicMock()
        result_at_threshold.score = 0.7
        result_at_threshold.id = "scar-1"

        mock_client.search.return_value = [result_at_threshold]
        defense_at = VoyagerDefense(qdrant_client=mock_client)
        score_at = defense_at.score({"task_id": "t", "status": "error"})
        assert score_at.should_block is True, (
            f"WB13: Score 0.7 should trigger block (>= threshold), got should_block={score_at.should_block}"
        )

        # Score just below threshold (0.69) → should NOT block
        result_below = MagicMock()
        result_below.score = 0.69
        result_below.id = "scar-2"

        mock_client.search.return_value = [result_below]
        defense_below = VoyagerDefense(qdrant_client=mock_client)
        score_below = defense_below.score({"task_id": "t", "status": "error"})
        assert score_below.should_block is False, (
            f"WB13: Score 0.69 should NOT trigger block (< threshold), got should_block={score_below.should_block}"
        )
    finally:
        # Restore the caller's environment regardless of assertion outcome.
        if original_env is not None:
            os.environ["VOYAGER_BLOCK_THRESHOLD"] = original_env

    print("WB13 PASSED — VoyagerDefense BLOCK_THRESHOLD default is 0.7, logic verified")


# ---------------------------------------------------------------------------
# WB14: OutputValidator collects ALL errors (not fail-fast)
# ---------------------------------------------------------------------------

def test_wb14_output_validator_collects_all_errors():
    """
    WB14: OutputValidator.validate() must collect ALL schema violations
    in a single pass — never halt on the first error found.
    """
    from core.mvfl.output_validator import OutputValidator

    validator = OutputValidator()
    schema = {
        "task_id": {"type": "str", "required": True},
        "status": {"type": "str", "required": True},
        "score": {"type": "float", "required": True},
        "items": {"type": "list", "required": True},
    }
    # output missing task_id (required), has wrong type for score (str not float),
    # and missing items (required)
    output = {
        "status": "completed",
        "score": "high",    # wrong type — should be float
        # task_id missing entirely
        # items missing entirely
    }

    result = validator.validate(output, schema)

    assert result.valid is False, "WB14: Expected valid=False for multi-error output"
    # A fail-fast validator would stop at the first violation and report one.
    assert len(result.errors) >= 3, (
        f"WB14: Expected at least 3 errors (task_id, score, items), got {len(result.errors)}: {result.errors}"
    )
    # Verify specific errors are present
    errors_str = " | ".join(result.errors)
    assert "task_id" in errors_str, f"WB14: Missing task_id error. errors: {result.errors}"
    assert "score" in errors_str, f"WB14: Missing score type error. errors: {result.errors}"
    assert "items" in errors_str, f"WB14: Missing items error. errors: {result.errors}"
    print(f"WB14 PASSED — OutputValidator collected {len(result.errors)} errors in one pass (not fail-fast)")


# ---------------------------------------------------------------------------
# WB15: MVFLInterceptor priority = 90
# ---------------------------------------------------------------------------

def test_wb15_mvfl_interceptor_priority_is_90():
    """
    WB15: MVFLInterceptor.metadata.priority must be exactly 90,
    ensuring it runs AFTER business logic interceptors (priority 10-50).
    """
    # All dependencies default-mocked; only metadata is inspected here.
    interceptor = _make_interceptor()

    assert interceptor.metadata.priority == 90, (
        f"WB15: Expected priority=90, got: {interceptor.metadata.priority}"
    )
    assert interceptor.metadata.name == "mvfl", (
        f"WB15: Expected metadata.name='mvfl', got: {interceptor.metadata.name!r}"
    )
    print("WB15 PASSED — MVFLInterceptor priority=90, name='mvfl'")


# ===========================================================================
# INTEGRATION TESTS — Fully mocked full pipeline
# ===========================================================================

# ---------------------------------------------------------------------------
# INT16: Full pipeline: trigger → voyager → correction → success on attempt 2
# ---------------------------------------------------------------------------

def test_int16_full_pipeline_success_on_attempt_2():
    """
    INT16: Fully mocked integration test.
    MVFLTrigger fires → VoyagerDefense passes → CorrectionLoop dispatches twice
    → second attempt is clean → CorrectionResult(success=True, attempts=2).
    """
    from core.mvfl.correction_loop import CorrectionLoop

    trigger_mock = _make_trigger_mock_sequence([
        _trigger_result(True, "syntax", 1, "Missing status field"),   # attempt 1 fails
        _trigger_result(False),                                        # attempt 2 clean
    ])
    voyager_mock = _make_voyager_mock(should_block=False)

    # side_effect list: each dispatch call consumes the next response in order.
    dispatch_responses = [
        {"task_id": "t-int16", "status_code": 400},           # bad — attempt 1
        {"task_id": "t-int16", "status": "completed", "output": "ok"},  # clean — attempt 2
    ]
    dispatch = AsyncMock(side_effect=dispatch_responses)

    loop = CorrectionLoop(
        trigger=trigger_mock,
        voyager=voyager_mock,
        dispatch_fn=dispatch,
    )

    initial_trigger = _trigger_result(True, "syntax", 1, "Missing status field")
    result = run(loop.run(
        task_payload={"task_id": "t-int16", "prompt": "Full pipeline test"},
        failed_output={"task_id": "t-int16"},
        trigger_result=initial_trigger,
    ))

    assert result.success is True, f"INT16: Expected success=True, got {result}"
    assert result.attempts == 2, f"INT16: Expected 2 attempts, got {result.attempts}"
    assert result.escalated is False, f"INT16: Expected not escalated"
    assert dispatch.await_count == 2, (
        f"INT16: Expected 2 dispatch calls, got {dispatch.await_count}"
    )
    print("INT16 PASSED — Full pipeline: trigger → correction → success on attempt 2")


# ---------------------------------------------------------------------------
# INT17: Full pipeline: trigger → voyager → 3 failures → escalation → Opus result
# ---------------------------------------------------------------------------

def test_int17_full_pipeline_3_failures_then_opus_escalation():
    """
    INT17: Fully mocked integration test for the worst-case path.
    MVFLTrigger fires → VoyagerDefense passes → CorrectionLoop exhausts 3 attempts
    → MemGPTEscalation called → Opus result returned.

    The escalation function receives the task_payload + failed_output and
    returns a successful Opus resolution dict.
    """
    from core.mvfl.correction_loop import CorrectionLoop, MAX_CORRECTION_ATTEMPTS

    # Every re-evaluation keeps reporting the same severity-3 external rejection.
    rejection = _trigger_result(True, "external_rejection", 3, "HTTP 503")
    mock_trigger = _make_trigger_mock_sequence([rejection, rejection, rejection])
    mock_voyager = _make_voyager_mock(should_block=False)

    failing_output = {"task_id": "t-int17", "status_code": 503}
    mock_dispatch = AsyncMock(return_value=failing_output)

    # What the escalation path hands back after the 3-strike rule fires.
    opus_result = {
        "task_id": "t-int17",
        "status": "completed",
        "output": "Definitive Opus resolution",
        "model": "claude-opus-4-6",
        "escalated_by": "MVFL_3_STRIKE",
    }
    mock_escalation = AsyncMock(return_value=opus_result)

    pipeline = CorrectionLoop(
        trigger=mock_trigger,
        voyager=mock_voyager,
        dispatch_fn=mock_dispatch,
        escalation_fn=mock_escalation,
    )

    outcome = run(pipeline.run(
        task_payload={"task_id": "t-int17", "prompt": "Mission-critical task"},
        failed_output=failing_output,
        trigger_result=_trigger_result(True, "external_rejection", 3, "HTTP 503"),
    ))

    # Pipeline state: exhausted, escalated, never successful on its own.
    assert outcome.success is False, f"INT17: Expected success=False, got {outcome}"
    assert outcome.escalated is True, f"INT17: Expected escalated=True, got {outcome}"
    assert outcome.attempts == MAX_CORRECTION_ATTEMPTS, (
        f"INT17: Expected {MAX_CORRECTION_ATTEMPTS} attempts, got {outcome.attempts}"
    )

    # Dispatch was called 3 times (one per correction attempt)
    assert mock_dispatch.await_count == MAX_CORRECTION_ATTEMPTS, (
        f"INT17: Expected {MAX_CORRECTION_ATTEMPTS} dispatch calls, got {mock_dispatch.await_count}"
    )

    # Escalation was called exactly once, with the original task payload first.
    mock_escalation.assert_awaited_once()
    first_positional = mock_escalation.call_args[0][0]
    assert first_positional["task_id"] == "t-int17", (
        f"INT17: Escalation must receive original task_payload, got: {first_positional}"
    )

    # The Opus resolution dict flows through unchanged into the result.
    assert outcome.output.get("model") == "claude-opus-4-6", (
        f"INT17: Expected Opus model in escalated output, got: {outcome.output}"
    )
    assert outcome.output.get("status") == "completed", (
        f"INT17: Expected completed status from Opus, got: {outcome.output}"
    )
    print("INT17 PASSED — Full pipeline: 3 failures → escalation → Opus result returned")


# ===========================================================================
# Entry point
# ===========================================================================

if __name__ == "__main__":
    # Hoisted out of the except handler: re-importing per failure is needless
    # work and hides the dependency from a reader scanning the entry point.
    import traceback

    # Ordered roster of every test in the suite; each is a zero-arg callable
    # that raises (AssertionError or otherwise) on failure.
    tests = [
        # BB Tests
        test_bb1_syntax_error_corrected_on_attempt_2,
        test_bb2_voyager_scar_match_triggers_correction,
        test_bb3_three_strike_triggers_memgpt_escalation,
        test_bb4_external_rejection_422_triggers_severity_3,
        test_bb5_semantic_inconsistency_completed_with_no_result,
        test_bb6_clean_output_no_trigger_no_correction,
        test_bb7_post_execute_triggered_output_gets_mvfl_corrected,
        test_bb8_pre_execute_is_identity,
        # WB Tests
        test_wb9_trigger_priority_external_wins_over_semantic,
        test_wb9b_trigger_priority_semantic_wins_over_syntax,
        test_wb10_max_correction_attempts_is_3,
        test_wb11_correction_prompt_starts_with_correction_prefix,
        test_wb12_memgpt_escalation_model_is_opus,
        test_wb12b_memgpt_escalation_dispatches_to_opus,
        test_wb13_voyager_block_threshold_default,
        test_wb14_output_validator_collects_all_errors,
        test_wb15_mvfl_interceptor_priority_is_90,
        # Integration Tests
        test_int16_full_pipeline_success_on_attempt_2,
        test_int17_full_pipeline_3_failures_then_opus_escalation,
    ]

    passed = 0
    failed = 0
    for t in tests:
        try:
            t()
            passed += 1
        except Exception as exc:
            # Catch broadly on purpose: one broken test must not abort the
            # run — report it and keep going.
            print(f"FAILED: {t.__name__} — {exc}")
            traceback.print_exc()
            failed += 1

    total = len(tests)
    print(f"\n{'=' * 60}")
    print(f"Story 3.07 — MVFL Pipeline Test Suite")
    print(f"Results: {passed}/{total} passed")
    # Exit code signals CI: 1 on any failure, 0 when the suite is green.
    if failed:
        print(f"FAILED: {failed} test(s)")
        sys.exit(1)
    else:
        print("ALL TESTS PASSED")
        sys.exit(0)


# VERIFICATION_STAMP
# Story: 3.07 (Track B) — Test Suite — Module 3 MVFL Pipeline
# Verified By: parallel-builder (claude-sonnet-4-6)
# Verified At: 2026-02-25
# Tests: 19/19
# Coverage: All 6 Module 3 files covered (mvfl_trigger, voyager_defense,
#           output_validator, correction_loop, memgpt_escalation, mvfl_interceptor)
