#!/usr/bin/env python3
"""
Quality Gate — KB Ingestion Pipeline Module 12
===============================================
Auto-generated quiz from KB chunks + RAG accuracy evaluation.

Stories:
  12.01 — generate_quiz:      Create Q&A pairs from random KB chunks via Gemini
  12.02 — evaluate_accuracy:  Run quiz against RAG pipeline and measure hit rate
  12.03 — run_quality_gate:   Full pipeline (generate → evaluate → report)

Usage:
    python3 -m core.kb.quality_gate hubspot
    python3 -m core.kb.quality_gate hubspot --questions 10 --threshold 0.9

Dependencies:
    from core.kb.qdrant_store import search_platform
    from core.kb.embedder import embed_text
    from core.rag_query import rag_query
    from core.kb.pg_store import get_connection
    import google.genai as genai
"""

import json
import logging
import os
import random
from typing import Optional

from core.kb.qdrant_store import search_platform
from core.kb.embedder import embed_text
from core.rag_query import rag_query

logger = logging.getLogger(__name__)

# ──────────────────────────────────────────────────────────────────────────────
# Config
# ──────────────────────────────────────────────────────────────────────────────

GEMINI_MODEL = "gemini-2.0-flash"

# A broad zero-vector for chunk discovery — we sample randomly so the
# direction of the query doesn't matter; score_threshold=0.0 returns all hits.
_DISCOVERY_VECTOR_DIM = 3072
_DISCOVERY_VECTOR = [0.0] * _DISCOVERY_VECTOR_DIM
# Inject a tiny non-zero component so Qdrant doesn't reject it as a null-vector.
_DISCOVERY_VECTOR[0] = 1e-6

# Prompt for single-question generation. NOTE: the doubled braces {{ }} survive
# str.format() as literal JSON braces in the model instructions; only {title}
# and {text} are substituted (see _generate_question_for_chunk).
_QUIZ_PROMPT_TEMPLATE = """You are a QA engineer generating a quiz to test a retrieval system.

Given the following knowledge-base chunk, generate ONE question whose answer is clearly contained within the chunk text.

Requirements:
- The question must be specific and answerable from the chunk alone.
- The answer must be a short phrase or sentence (not a list) extracted or directly inferable from the chunk.
- Output ONLY valid JSON in this exact format (no markdown, no extra text):
{{"question": "<your question here>", "answer": "<short answer here>"}}

Chunk title: {title}
Chunk text:
{text}
"""

# ──────────────────────────────────────────────────────────────────────────────
# Internal helpers
# ──────────────────────────────────────────────────────────────────────────────

def _load_api_key() -> str:
    """Load GEMINI_API_KEY from environment or secrets.env file."""
    key = os.getenv("GEMINI_API_KEY", "")
    if not key:
        secrets_path = "/mnt/e/genesis-system/config/secrets.env"
        if os.path.exists(secrets_path):
            with open(secrets_path) as fh:
                for line in fh:
                    line = line.strip()
                    if line.startswith("GEMINI_API_KEY="):
                        key = line.split("=", 1)[1].strip().strip("'\"")
                        break
    return key


# Process-wide singleton google.genai Client; created lazily by
# _get_genai_client() on first use.
_genai_client = None


def _get_genai_client():
    """Return a singleton google.genai Client.

    The google.genai import is deferred to first call so importing this
    module never pays the dependency cost (or fails) when Gemini is unused.
    The client is constructed once with the key from _load_api_key() and
    reused for all subsequent calls.
    """
    global _genai_client
    if _genai_client is None:
        # Deferred third-party import — only needed when a quiz is generated.
        import google.genai as genai
        api_key = _load_api_key()
        _genai_client = genai.Client(api_key=api_key)
    return _genai_client


def _generate_question_for_chunk(chunk: dict) -> Optional[dict]:
    """
    Call Gemini to produce a {question, answer} pair for a single chunk.

    The chunk's title and (truncated) text are formatted into
    _QUIZ_PROMPT_TEMPLATE; the model's JSON reply is parsed and validated.

    Returns None on any error so the caller can skip gracefully.
    """
    client = _get_genai_client()
    prompt = _QUIZ_PROMPT_TEMPLATE.format(
        title=chunk.get("title", ""),
        text=chunk.get("text", "")[:3000],  # Guard against extremely long chunks
    )
    try:
        response = client.models.generate_content(
            model=GEMINI_MODEL,
            contents=[prompt],
        )
        raw = response.text.strip()
        # Strip markdown code fences if Gemini wraps the JSON.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            # Drop an optional language tag after the opening fence and any
            # surrounding whitespace (safer than the previous blind raw[4:]).
            raw = raw.removeprefix("json").strip()
        qa = json.loads(raw)
        # json.loads may yield a list/str/number; a plain `in` test on those is
        # misleading (substring match on str) — require a dict with both keys.
        if not isinstance(qa, dict) or "question" not in qa or "answer" not in qa:
            logger.warning("Gemini response missing question/answer keys: %s", raw[:200])
            return None
        return qa
    except Exception as exc:
        # Best-effort by design: the caller skips chunks that fail generation.
        logger.warning("generate_question_for_chunk failed: %s", exc)
        return None


def _text_overlap(a: str, b: str, min_words: int = 4) -> bool:
    """
    Heuristic: return True if at least min_words consecutive words from `a`
    appear as a sub-sequence in `b` (case-insensitive).

    Used to decide whether a RAG result text contains the expected answer.
    """
    a_words = a.lower().split()
    b_lower = b.lower()
    if not a_words:
        return False
    # Sliding window of min_words length
    for i in range(len(a_words) - min_words + 1):
        phrase = " ".join(a_words[i : i + min_words])
        if phrase in b_lower:
            return True
    # Fallback: check if more than 50% of unique words from answer appear in result
    a_unique = set(w for w in a_words if len(w) > 3)
    if not a_unique:
        return False
    b_words = set(b_lower.split())
    overlap_ratio = len(a_unique & b_words) / len(a_unique)
    return overlap_ratio >= 0.5


# ──────────────────────────────────────────────────────────────────────────────
# Story 12.01 — generate_quiz
# ──────────────────────────────────────────────────────────────────────────────

def generate_quiz(
    platform: str,
    num_questions: int = 20,
    customer_id: Optional[str] = None,
) -> list[dict]:
    """
    Build a quiz from ingested KB chunks to exercise the RAG pipeline.

    Pulls up to 200 chunks for the platform from Qdrant, randomly samples
    ``num_questions`` of them, and asks Gemini for one question per chunk.
    Chunks whose Gemini call fails are skipped silently, so the returned
    list may be shorter than requested.

    Args:
        platform:      Platform KB to quiz (e.g., "hubspot").
        num_questions: Target number of quiz items.
        customer_id:   Optional customer scope for multi-tenant isolation.

    Returns:
        List of dicts with keys: question, expected_answer, source_chunk_id,
        source_text, source_url, source_title.
    """
    # Embed a generic platform phrase for broad discovery; if embedding is
    # unavailable, fall back to the near-zero discovery vector.
    try:
        query_vector = embed_text(f"{platform} documentation knowledge base")
    except Exception as exc:
        logger.warning("embed_text failed during quiz generation (%s), using discovery vector", exc)
        query_vector = _DISCOVERY_VECTOR

    # Large pool + zero score threshold = maximum variety for sampling.
    chunks = search_platform(
        query_vector=query_vector,
        platform=platform,
        customer_id=customer_id,
        top_k=200,
        score_threshold=0.0,
    )
    if not chunks:
        logger.info("generate_quiz: no chunks found for platform=%s", platform)
        return []

    sample_size = min(num_questions, len(chunks))
    quiz: list[dict] = []
    for src in random.sample(chunks, sample_size):
        pair = _generate_question_for_chunk(src)
        if pair is None:
            continue  # Gemini failed for this chunk — skip it
        quiz.append({
            "question": pair["question"],
            "expected_answer": pair["answer"],
            "source_chunk_id": src.get("id", ""),
            "source_text": src.get("text", ""),
            "source_url": src.get("source_url", ""),
            "source_title": src.get("title", ""),
        })

    logger.info(
        "generate_quiz: generated %d/%d questions for platform=%s",
        len(quiz), sample_size, platform,
    )
    return quiz


# ──────────────────────────────────────────────────────────────────────────────
# Story 12.02 — evaluate_accuracy
# ──────────────────────────────────────────────────────────────────────────────

def evaluate_accuracy(
    quiz: list[dict],
    platform: str,
    customer_id: Optional[str] = None,
    pass_threshold: float = 0.80,
    top_k: int = 3,
) -> dict:
    """
    Run quiz against the RAG pipeline and measure retrieval accuracy.

    For each quiz item:
    1. Call rag_query(question, top_k=top_k).
    2. Mark as "correct" if any result's source_url matches the quiz source_url
       OR if any result's text has significant overlap with the expected_answer.

    Args:
        quiz:            Output from generate_quiz().
        platform:        Platform to evaluate.
        customer_id:     Optional customer scope (reserved for future use).
        pass_threshold:  Minimum accuracy to consider the gate passed (0.0–1.0).
        top_k:           Number of RAG results to check per question.

    Returns:
        {
            "platform":        str,
            "total_questions": int,
            "correct":         int,
            "accuracy":        float,
            "passed":          bool,
            "threshold":       float,
            "details":         list[dict],
            "recommendations": list[str],
        }
    """
    if not quiz:
        # Empty quiz cannot pass — report the gate as failed with guidance.
        return {
            "platform": platform,
            "total_questions": 0,
            "correct": 0,
            "accuracy": 0.0,
            "passed": False,
            "threshold": pass_threshold,
            "details": [],
            "recommendations": ["No quiz items provided — run generate_quiz() first."],
        }

    correct = 0
    details: list[dict] = []

    for item in quiz:
        question = item["question"]
        expected_answer = item["expected_answer"]
        source_url = item.get("source_url", "")

        # Query the RAG pipeline.
        # rag_query doesn't support a platform parameter directly; we pass
        # top_k and rely on URL/text matching to isolate platform results.
        try:
            results = rag_query(question, top_k=top_k)
        except Exception as exc:
            logger.warning("rag_query failed for question=%r: %s", question[:80], exc)
            results = []

        # Score of the top-ranked result (0.0 when there are no results).
        # BUG FIX: the previous in-loop capture (`if top_result_score == 0.0`)
        # kept re-assigning when the first result's score was 0.0/missing, so
        # a lower-ranked result's score could be reported as "top".
        top_result_score: float = results[0].get("score", 0.0) if results else 0.0

        # A question counts as answered if any of the top_k results matches
        # by source URL or by text overlap with the expected answer.
        found = False
        for result in results:
            # Match 1: source URL matches
            result_url = result.get("source_url", "")
            if source_url and result_url and source_url == result_url:
                found = True
                break

            # Match 2: text overlap with expected answer
            result_text = result.get("text", "")
            if result_text and _text_overlap(expected_answer, result_text):
                found = True
                break

        if found:
            correct += 1

        details.append({
            "question": question,
            "expected_answer": expected_answer,
            "source_url": source_url,
            "found_in_top_k": found,
            "top_result_score": round(top_result_score, 4),
        })

    total = len(quiz)
    accuracy = correct / total if total > 0 else 0.0
    passed = accuracy >= pass_threshold

    # Generate recommendations when gate fails
    recommendations: list[str] = []
    if not passed:
        gap = pass_threshold - accuracy
        recommendations.append(
            f"Accuracy {accuracy:.1%} is {gap:.1%} below the {pass_threshold:.1%} threshold."
        )
        if accuracy < 0.5:
            recommendations.append(
                "Low accuracy (<50%) suggests the platform KB may have very few indexed chunks. "
                "Re-run the ingestion pipeline to populate Qdrant."
            )
        else:
            recommendations.append(
                "Consider increasing chunk overlap or reducing chunk size to improve retrieval recall."
            )
        recommendations.append(
            "Review failed items in 'details' to identify poorly chunked or ambiguous content."
        )

    return {
        "platform": platform,
        "total_questions": total,
        "correct": correct,
        "accuracy": round(accuracy, 4),
        "passed": passed,
        "threshold": pass_threshold,
        "details": details,
        "recommendations": recommendations,
    }


# ──────────────────────────────────────────────────────────────────────────────
# Story 12.03 — run_quality_gate
# ──────────────────────────────────────────────────────────────────────────────

async def run_quality_gate(
    platform: str,
    customer_id: Optional[str] = None,
    num_questions: int = 20,
    pass_threshold: float = 0.80,
) -> dict:
    """
    Full quality-gate pipeline: generate quiz → evaluate → combined report.

    Declared async so it can be awaited from orchestrators or FastAPI
    endpoints; the work itself is entirely synchronous (no concurrency is
    introduced here).

    Args:
        platform:       Platform KB to evaluate.
        customer_id:    Optional customer scope.
        num_questions:  Number of quiz questions to generate.
        pass_threshold: Accuracy fraction required to pass (default 0.80).

    Returns:
        Combined report dict with a "status" of PASSED/FAILED, or a minimal
        NO_DATA report when the platform has no indexed chunks.
    """
    logger.info(
        "run_quality_gate: starting for platform=%s, questions=%d, threshold=%.2f",
        platform, num_questions, pass_threshold,
    )

    quiz_items = generate_quiz(platform, num_questions, customer_id)
    if not quiz_items:
        # Guard clause: nothing to evaluate — surface a NO_DATA report.
        logger.warning("run_quality_gate: no chunks found for platform=%s — returning NO_DATA", platform)
        return {
            "platform": platform,
            "status": "NO_DATA",
            "message": f"No chunks found for platform '{platform}'. "
                       "Ensure KB ingestion has been run before the quality gate.",
        }

    report = evaluate_accuracy(quiz_items, platform, customer_id, pass_threshold)
    report["quiz"] = quiz_items    # Attach full quiz for audit trail
    report["status"] = "PASSED" if report["passed"] else "FAILED"

    logger.info(
        "run_quality_gate: platform=%s accuracy=%.1f%% status=%s",
        platform, report["accuracy"] * 100, report["status"],
    )
    return report


# ──────────────────────────────────────────────────────────────────────────────
# CLI entry point
# ──────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import argparse
    import asyncio

    # CLI driver: parse arguments, run the async gate, print the JSON report.
    ap = argparse.ArgumentParser(
        description="KB Quality Gate — auto quiz + RAG accuracy evaluation"
    )
    ap.add_argument("platform", help="Platform to evaluate (e.g., hubspot)")
    ap.add_argument("--questions", type=int, default=20,
                    help="Number of quiz questions to generate (default: 20)")
    ap.add_argument("--threshold", type=float, default=0.80,
                    help="Accuracy pass threshold 0.0–1.0 (default: 0.80)")
    ap.add_argument("--customer-id", default=None,
                    help="Optional customer_id scope for multi-tenant isolation")
    cli = ap.parse_args()

    gate_report = asyncio.run(
        run_quality_gate(
            platform=cli.platform,
            customer_id=cli.customer_id,
            num_questions=cli.questions,
            pass_threshold=cli.threshold,
        )
    )
    # default=str keeps any non-JSON-native values (e.g. UUIDs) printable.
    print(json.dumps(gate_report, indent=2, default=str))


# VERIFICATION_STAMP
# Story: M12 — Quality Gate (Stories 12.01–12.03)
# Verified By: parallel-builder (claude-sonnet-4-6)
# Verified At: 2026-02-26
# Tests: see tests/kb/test_m12_quality_gate_integration.py
# Coverage: 100% of stories implemented
