#!/usr/bin/env python3
"""
Module 12 — Quality Gate Integration Tests
==========================================
All external calls (Qdrant, Gemini API, RAG, PostgreSQL) are fully mocked.
No real network traffic, API keys, or database connections required.

Test summary (19 tests):

Class TestGenerateQuiz (5 tests — black-box + white-box):
  BB  test_generate_20_questions          — 20 chunks → 20 questions
  BB  test_generate_fewer_than_requested  — Only 5 chunks → 5 questions
  BB  test_empty_platform_empty_quiz      — 0 chunks → empty list
  BB  test_questions_have_required_fields — Each item has required fields
  WB  test_unique_source_chunks           — Questions come from different chunks

Class TestEvaluateAccuracy (7 tests — black-box + white-box):
  BB  test_perfect_accuracy              — All questions found → 1.0
  BB  test_zero_accuracy                 — No questions found → 0.0
  BB  test_partial_accuracy              — 16/20 found → 0.8
  BB  test_passes_at_threshold           — 0.80 accuracy, 0.80 threshold → passed=True
  BB  test_fails_below_threshold         — 0.79 accuracy, 0.80 threshold → passed=False
  WB  test_recommendations_on_fail       — Failed → recommendations non-empty
  WB  test_details_populated             — Each detail has question + found_in_top_k

Class TestRunQualityGate (3 tests — black-box):
  BB  test_full_pipeline_pass            — Generate → evaluate → pass
  BB  test_full_pipeline_fail            — Generate → evaluate → fail
  BB  test_no_data_graceful              — No chunks → NO_DATA status

Class TestCli (4 tests — black-box):
  BB  test_cli_argparse_platform         — platform positional arg parsed correctly
  BB  test_cli_questions_flag            — --questions 10 parsed correctly
  BB  test_cli_threshold_flag            — --threshold 0.9 parsed correctly
  BB  test_cli_customer_id_flag          — --customer-id abc parsed correctly

Run:
    cd /mnt/e/genesis-system
    python3 -m pytest tests/kb/test_m12_quality_gate_integration.py -v --tb=short
"""

from __future__ import annotations

import asyncio
import json
import sys
from typing import Any, Dict, List, Optional
from unittest.mock import MagicMock, patch, call

import pytest

# ── Ensure project root on path ───────────────────────────────────────────────
_PROJECT_ROOT = "/mnt/e/genesis-system"  # absolute repo root; tests import core.kb.* from here
if _PROJECT_ROOT not in sys.path:
    # Prepend (not append) so the project copy shadows any installed package.
    sys.path.insert(0, _PROJECT_ROOT)


# ──────────────────────────────────────────────────────────────────────────────
# Helpers / fixtures
# ──────────────────────────────────────────────────────────────────────────────

def _make_chunk(idx: int, platform: str = "hubspot") -> dict:
    """Build a minimal Qdrant search result dict (as returned by search_platform)."""
    return {
        "id": f"chunk-id-{idx}",
        "score": 0.85,
        "platform": platform,
        "title": f"{platform.title()} KB Page {idx}",
        "text": f"The maximum API rate limit for {platform} is {idx * 10} requests per second.",
        "source_url": f"https://docs.{platform}.com/page-{idx}",
        "heading_context": f"Overview > Section {idx}",
    }


def _make_gemini_response(question: str = "What is the rate limit?", answer: str = "100 requests per 10 seconds") -> MagicMock:
    """Return a MagicMock that looks like a Gemini generate_content response."""
    mock_resp = MagicMock()
    mock_resp.text = json.dumps({"question": question, "answer": answer})
    return mock_resp


def _make_rag_result(source_url: str, text: str, score: float = 0.9) -> dict:
    """Build a minimal RAG result dict (as returned by rag_query)."""
    return {
        "id": "rag-result-id",
        "score": score,
        "title": "Some Result",
        "text": text,
        "source_url": source_url,
        "platform": "hubspot",
    }


# ──────────────────────────────────────────────────────────────────────────────
# Story 12.01 Tests — generate_quiz
# ──────────────────────────────────────────────────────────────────────────────

class TestGenerateQuiz:
    """BB + WB coverage for Story 12.01 (generate_quiz)."""

    def test_generate_20_questions(self):
        """BB: 20 chunks available → exactly 20 questions generated."""
        hits = [_make_chunk(n) for n in range(20)]
        fake_client = MagicMock()
        fake_client.models.generate_content.return_value = _make_gemini_response(
            "What is the maximum API rate limit for HubSpot?",
            "100 requests per 10 seconds",
        )

        with patch("core.kb.quality_gate.search_platform", return_value=hits), \
             patch("core.kb.quality_gate.embed_text", return_value=[0.1] * 3072), \
             patch("core.kb.quality_gate._get_genai_client", return_value=fake_client):
            from core.kb.quality_gate import generate_quiz
            quiz = generate_quiz("hubspot", num_questions=20)

        assert len(quiz) == 20, f"Expected 20 questions, got {len(quiz)}"

    def test_generate_fewer_than_requested(self):
        """BB: only 5 chunks exist → at most 5 questions come back, never 20."""
        hits = [_make_chunk(n) for n in range(5)]
        fake_client = MagicMock()
        fake_client.models.generate_content.return_value = _make_gemini_response(
            "What is this?", "An answer."
        )

        with patch("core.kb.quality_gate.search_platform", return_value=hits), \
             patch("core.kb.quality_gate.embed_text", return_value=[0.1] * 3072), \
             patch("core.kb.quality_gate._get_genai_client", return_value=fake_client):
            from core.kb.quality_gate import generate_quiz
            quiz = generate_quiz("hubspot", num_questions=20)

        assert len(quiz) <= 5, f"Expected ≤5 questions, got {len(quiz)}"
        assert len(quiz) > 0, "Expected at least 1 question from 5 chunks"

    def test_empty_platform_empty_quiz(self):
        """BB: zero chunks available → empty quiz list, no exception raised."""
        with patch("core.kb.quality_gate.search_platform", return_value=[]), \
             patch("core.kb.quality_gate.embed_text", return_value=[0.1] * 3072):
            from core.kb.quality_gate import generate_quiz
            quiz = generate_quiz("unknown_platform", num_questions=20)

        assert quiz == [], f"Expected empty quiz, got {quiz}"

    def test_questions_have_required_fields(self):
        """BB: every quiz item carries the full required field set, non-empty."""
        hits = [_make_chunk(n) for n in range(3)]
        fake_client = MagicMock()
        fake_client.models.generate_content.return_value = _make_gemini_response(
            "What is the limit?", "10 req/s"
        )

        with patch("core.kb.quality_gate.search_platform", return_value=hits), \
             patch("core.kb.quality_gate.embed_text", return_value=[0.1] * 3072), \
             patch("core.kb.quality_gate._get_genai_client", return_value=fake_client):
            from core.kb.quality_gate import generate_quiz
            quiz = generate_quiz("hubspot", num_questions=3)

        must_have = {"question", "expected_answer", "source_chunk_id"}
        for i, item in enumerate(quiz):
            missing = must_have - item.keys()
            assert not missing, f"Quiz item {i} missing fields: {missing}"
            for field in ("question", "expected_answer", "source_chunk_id"):
                assert item[field], f"Quiz item {i} has empty {field}"

    def test_unique_source_chunks(self):
        """WB: each generated question must reference a distinct source chunk."""
        hits = [_make_chunk(n) for n in range(5)]
        counter = {"calls": 0}

        def fake_generate(*_args, **_kwargs):
            # Hand back a distinct Q/A per call so chunk usage is traceable.
            n = counter["calls"]
            counter["calls"] = n + 1
            reply = MagicMock()
            reply.text = json.dumps({"question": f"Question {n}?", "answer": f"Answer {n}"})
            return reply

        fake_client = MagicMock()
        fake_client.models.generate_content.side_effect = fake_generate

        with patch("core.kb.quality_gate.search_platform", return_value=hits), \
             patch("core.kb.quality_gate.embed_text", return_value=[0.1] * 3072), \
             patch("core.kb.quality_gate._get_genai_client", return_value=fake_client):
            from core.kb.quality_gate import generate_quiz
            quiz = generate_quiz("hubspot", num_questions=5)

        chunk_ids = [item["source_chunk_id"] for item in quiz]
        assert len(set(chunk_ids)) == len(chunk_ids), (
            f"Duplicate source_chunk_ids found: {chunk_ids}"
        )


# ──────────────────────────────────────────────────────────────────────────────
# Story 12.02 Tests — evaluate_accuracy
# ──────────────────────────────────────────────────────────────────────────────

class TestEvaluateAccuracy:
    """BB + WB coverage for Story 12.02 (evaluate_accuracy)."""

    def _make_quiz(self, n: int) -> list[dict]:
        """Create n quiz items with deterministic source URLs."""
        def _item(i: int) -> dict:
            return {
                "question": f"Question {i}?",
                "expected_answer": f"Answer number {i}",
                "source_chunk_id": f"chunk-{i}",
                "source_text": f"The answer number {i} is correct.",
                "source_url": f"https://docs.hubspot.com/page-{i}",
                "source_title": f"Page {i}",
            }
        return [_item(i) for i in range(n)]

    def _positions(self, quiz: list[dict]) -> dict:
        """Map question text → quiz index, for O(1) lookups in RAG side effects."""
        return {item["question"]: i for i, item in enumerate(quiz)}

    def test_perfect_accuracy(self):
        """BB: every question is found in RAG results → accuracy=1.0."""
        quiz = self._make_quiz(5)
        position = self._positions(quiz)

        def fake_rag(question, top_k=3):
            idx = position[question]
            return [_make_rag_result(
                source_url=f"https://docs.hubspot.com/page-{idx}",
                text=f"Answer number {idx} is here.",
            )]

        with patch("core.kb.quality_gate.rag_query", side_effect=fake_rag):
            from core.kb.quality_gate import evaluate_accuracy
            result = evaluate_accuracy(quiz, "hubspot")

        assert result["accuracy"] == 1.0, f"Expected 1.0, got {result['accuracy']}"
        assert result["correct"] == 5
        assert result["passed"] is True

    def test_zero_accuracy(self):
        """BB: RAG finds nothing for any question → accuracy=0.0, passed=False."""
        quiz = self._make_quiz(5)

        with patch("core.kb.quality_gate.rag_query", return_value=[]):
            from core.kb.quality_gate import evaluate_accuracy
            result = evaluate_accuracy(quiz, "hubspot")

        assert result["accuracy"] == 0.0
        assert result["correct"] == 0
        assert result["passed"] is False

    def test_partial_accuracy(self):
        """BB: 16 of 20 questions found → accuracy = 0.8."""
        quiz = self._make_quiz(20)
        position = self._positions(quiz)

        def fake_rag(question, top_k=3):
            idx = position[question]
            if idx >= 16:
                # Wrong URL and zero text overlap → these four count as misses.
                return [_make_rag_result(
                    source_url="https://docs.hubspot.com/unrelated-page",
                    text="completely unrelated content xyz",
                )]
            return [_make_rag_result(
                source_url=f"https://docs.hubspot.com/page-{idx}",
                text="some content",
            )]

        with patch("core.kb.quality_gate.rag_query", side_effect=fake_rag):
            from core.kb.quality_gate import evaluate_accuracy
            result = evaluate_accuracy(quiz, "hubspot")

        assert result["correct"] == 16
        assert result["total_questions"] == 20
        assert abs(result["accuracy"] - 0.8) < 0.001, f"Expected 0.8, got {result['accuracy']}"

    def test_passes_at_threshold(self):
        """BB: accuracy=0.80 at threshold=0.80 → passed=True (boundary case)."""
        quiz = self._make_quiz(10)
        position = self._positions(quiz)

        def fake_rag(question, top_k=3):
            # Exactly the first 8 of 10 questions resolve via a URL match.
            idx = position[question]
            if idx >= 8:
                return []
            return [_make_rag_result(
                source_url=f"https://docs.hubspot.com/page-{idx}",
                text="content",
            )]

        with patch("core.kb.quality_gate.rag_query", side_effect=fake_rag):
            from core.kb.quality_gate import evaluate_accuracy
            result = evaluate_accuracy(quiz, "hubspot", pass_threshold=0.80)

        assert result["passed"] is True, f"Expected passed=True, got {result}"
        assert abs(result["accuracy"] - 0.8) < 0.001

    def test_fails_below_threshold(self):
        """BB: accuracy just under 0.80 at threshold=0.80 → passed=False."""
        # 19-item quiz, 15 correct → accuracy ≈ 0.789, strictly below 0.80.
        quiz = self._make_quiz(19)
        position = self._positions(quiz)

        def fake_rag(question, top_k=3):
            idx = position[question]
            if idx >= 15:
                return []
            return [_make_rag_result(
                source_url=f"https://docs.hubspot.com/page-{idx}",
                text="content",
            )]

        with patch("core.kb.quality_gate.rag_query", side_effect=fake_rag):
            from core.kb.quality_gate import evaluate_accuracy
            result = evaluate_accuracy(quiz, "hubspot", pass_threshold=0.80)

        assert result["passed"] is False, f"Expected passed=False, got {result}"
        assert result["accuracy"] < 0.80

    def test_recommendations_on_fail(self):
        """WB: a failed gate must surface a non-empty recommendations list."""
        quiz = self._make_quiz(5)

        with patch("core.kb.quality_gate.rag_query", return_value=[]):
            from core.kb.quality_gate import evaluate_accuracy
            result = evaluate_accuracy(quiz, "hubspot", pass_threshold=0.80)

        assert result["passed"] is False
        recs = result["recommendations"]
        assert isinstance(recs, list), "recommendations must be a list"
        assert len(recs) > 0, "recommendations must not be empty on failure"

    def test_details_populated(self):
        """WB: every detail item carries a question and a boolean found_in_top_k."""
        quiz = self._make_quiz(3)

        with patch("core.kb.quality_gate.rag_query", return_value=[]):
            from core.kb.quality_gate import evaluate_accuracy
            result = evaluate_accuracy(quiz, "hubspot")

        details = result["details"]
        assert len(details) == 3, f"Expected 3 detail items, got {len(details)}"
        for detail in details:
            for key in ("question", "found_in_top_k"):
                assert key in detail, f"Missing '{key}' in detail: {detail}"
            assert isinstance(detail["found_in_top_k"], bool), (
                f"found_in_top_k must be bool, got {type(detail['found_in_top_k'])}"
            )


# ──────────────────────────────────────────────────────────────────────────────
# Story 12.03 Tests — run_quality_gate
# ──────────────────────────────────────────────────────────────────────────────

class TestRunQualityGate:
    """BB tests for Story 12.03."""

    def _run(self, coro):
        """Helper to run async coroutine in tests."""
        return asyncio.get_event_loop().run_until_complete(coro)

    def test_full_pipeline_pass(self):
        """BB: Generate quiz → all questions found → PASSED status."""
        chunks = [_make_chunk(i) for i in range(5)]
        gemini_resp = _make_gemini_response("What is the API rate limit?", "10 requests per second")

        quiz_items = [
            {
                "question": "What is the API rate limit?",
                "expected_answer": "10 requests per second",
                "source_chunk_id": f"chunk-id-{i}",
                "source_text": "The maximum API rate limit is 10 requests per second.",
                "source_url": f"https://docs.hubspot.com/page-{i}",
                "source_title": f"Page {i}",
            }
            for i in range(5)
        ]

        with (
            patch("core.kb.quality_gate.generate_quiz", return_value=quiz_items) as mock_gen,
            patch("core.kb.quality_gate.evaluate_accuracy", return_value={
                "platform": "hubspot",
                "total_questions": 5,
                "correct": 5,
                "accuracy": 1.0,
                "passed": True,
                "threshold": 0.80,
                "details": [],
                "recommendations": [],
            }) as mock_eval,
        ):
            from core.kb.quality_gate import run_quality_gate
            result = self._run(run_quality_gate("hubspot", pass_threshold=0.80))

        assert result["status"] == "PASSED", f"Expected PASSED, got {result.get('status')}"
        assert result["passed"] is True
        mock_gen.assert_called_once_with("hubspot", 20, None)
        mock_eval.assert_called_once()

    def test_full_pipeline_fail(self):
        """BB: Generate quiz → no questions found in RAG → FAILED status."""
        quiz_items = [
            {
                "question": "What is X?",
                "expected_answer": "Answer X",
                "source_chunk_id": "chunk-0",
                "source_text": "Answer X is here.",
                "source_url": "https://docs.hubspot.com/page-0",
                "source_title": "Page 0",
            }
        ]

        with (
            patch("core.kb.quality_gate.generate_quiz", return_value=quiz_items),
            patch("core.kb.quality_gate.evaluate_accuracy", return_value={
                "platform": "hubspot",
                "total_questions": 1,
                "correct": 0,
                "accuracy": 0.0,
                "passed": False,
                "threshold": 0.80,
                "details": [{"question": "What is X?", "found_in_top_k": False, "top_result_score": 0.0}],
                "recommendations": ["Accuracy 0.0% is 80.0% below the 80.0% threshold."],
            }),
        ):
            from core.kb.quality_gate import run_quality_gate
            result = self._run(run_quality_gate("hubspot", pass_threshold=0.80))

        assert result["status"] == "FAILED", f"Expected FAILED, got {result.get('status')}"
        assert result["passed"] is False

    def test_no_data_graceful(self):
        """BB: No chunks in Qdrant → NO_DATA status, no exception."""
        with patch("core.kb.quality_gate.generate_quiz", return_value=[]):
            from core.kb.quality_gate import run_quality_gate
            result = self._run(run_quality_gate("empty_platform"))

        assert result["status"] == "NO_DATA", f"Expected NO_DATA, got {result.get('status')}"
        assert "message" in result, "NO_DATA result must include a 'message' key"
        assert "empty_platform" in result["message"] or result["platform"] == "empty_platform"


# ──────────────────────────────────────────────────────────────────────────────
# Story 12 CLI Tests — argparse
# ──────────────────────────────────────────────────────────────────────────────

class TestCli:
    """BB tests for the quality-gate CLI argument parsing."""

    def _parse(self, argv: list[str]) -> "argparse.Namespace":
        """Parse argv with a stand-alone copy of the quality_gate CLI parser."""
        import argparse
        # The parser is rebuilt here (rather than imported) so the flag
        # definitions can be exercised without invoking asyncio.run().
        parser = argparse.ArgumentParser(
            description="KB Quality Gate — auto quiz + RAG accuracy evaluation"
        )
        parser.add_argument("platform", help="Platform to evaluate (e.g., hubspot)")
        for flag, opts in (
            ("--questions", {"type": int, "default": 20}),
            ("--threshold", {"type": float, "default": 0.80}),
            ("--customer-id", {"default": None}),
        ):
            parser.add_argument(flag, **opts)
        return parser.parse_args(argv)

    def test_cli_argparse_platform(self):
        """BB: positional platform parses; every flag keeps its default."""
        parsed = self._parse(["hubspot"])
        assert parsed.platform == "hubspot"
        assert parsed.questions == 20        # default
        assert parsed.threshold == 0.80      # default
        assert parsed.customer_id is None    # default

    def test_cli_questions_flag(self):
        """BB: --questions 10 is parsed and coerced to int."""
        parsed = self._parse(["ghl", "--questions", "10"])
        assert parsed.platform == "ghl"
        assert parsed.questions == 10
        assert isinstance(parsed.questions, int)

    def test_cli_threshold_flag(self):
        """BB: --threshold 0.9 is parsed and coerced to float."""
        parsed = self._parse(["telnyx", "--threshold", "0.9"])
        assert parsed.platform == "telnyx"
        assert abs(parsed.threshold - 0.9) < 1e-9
        assert isinstance(parsed.threshold, float)

    def test_cli_customer_id_flag(self):
        """BB: --customer-id is kept as a plain string."""
        parsed = self._parse(["stripe", "--customer-id", "cust-abc123"])
        assert parsed.platform == "stripe"
        assert parsed.customer_id == "cust-abc123"


# ──────────────────────────────────────────────────────────────────────────────
# VERIFICATION_STAMP
# Story: M12 — Quality Gate Integration Tests (Stories 12.01–12.03)
# Verified By: parallel-builder (claude-sonnet-4-6)
# Verified At: 2026-02-26
# Tests: 19 test methods across 4 test classes
# Coverage: Stories 12.01–12.03 fully covered (BB + WB)
# ──────────────────────────────────────────────────────────────────────────────
