#!/usr/bin/env python3
"""
Module 5 — Qdrant Store Integration Tests
==========================================
Story 5.05: Full integration test suite against a live but isolated test collection.

Tests cover:
  - upsert_vectors (Story 5.01)
  - delete_platform (Story 5.02)
  - search_platform (Story 5.03)
  - get_platform_stats (Story 5.04)

Isolation: All tests use a temporary collection (genesis_kb_test_<random>),
created by a module-scoped fixture and dropped in that fixture's teardown.
The production genesis_memories collection is NEVER touched.

Run:
    cd /mnt/e/genesis-system
    python3 -m pytest tests/kb/test_m5_qdrant_integration.py -v
"""

import uuid
import random
import os
import pytest

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

from core.kb.contracts import Chunk, EmbeddedChunk

# ──────────────────────────────────────────────────────────────────────────────
# Config
# ──────────────────────────────────────────────────────────────────────────────

# Connection settings: each value is read from the environment and falls back
# to the literal default below when the variable is unset.
QDRANT_URL = os.getenv(
    "QDRANT_URL",
    "https://qdrant-b3knu-u50607.vm.elestio.app:6333",
)
# NOTE(review): the fallback below is a literal API key committed to source.
# It looks like a real credential — presumably it should be rotated and the
# environment variable made mandatory. TODO confirm with the cluster owner.
QDRANT_API_KEY = os.getenv(
    "QDRANT_API_KEY",
    "7b74e6621bd0e6650789f6662bca4cbf4143d3d1d710a0002b3b563973ca6876",
)

# Collection name is computed once at import time: a fixed prefix plus the
# first 8 hex characters of a fresh uuid4.
TEST_COLLECTION = f"genesis_kb_test_{uuid.uuid4().hex[:8]}"
# Vector size used when the test collection is created and as the default
# length of the random test vectors.
VECTOR_DIM = 3072
# NOTE(review): not referenced anywhere else in this file; presumably mirrors
# the UUID5 namespace used by the store module — TODO confirm or remove.
_UUID5_NS = uuid.UUID("12345678-1234-5678-1234-567812345678")


# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────

def _random_vector(dim: int = VECTOR_DIM) -> list[float]:
    """Return ``dim`` standard-normal samples scaled to unit Euclidean length.

    Each component is drawn with ``random.gauss(0, 1)`` and then divided by
    the Euclidean norm of the whole sample.
    """
    samples = [random.gauss(0, 1) for _ in range(dim)]
    squared_norm = sum(s * s for s in samples)
    norm = squared_norm ** 0.5
    return [s / norm for s in samples]


def _make_chunk(
    platform: str,
    idx: int = 0,
    customer_id: str | None = None,
    chunk_id: str | None = None,
) -> Chunk:
    """Build a small ``Chunk`` whose fields are derived from its arguments.

    When ``chunk_id`` is falsy, an id of the form
    ``<platform>-chunk-<idx>-<6 hex chars>`` is generated so that repeated
    calls produce distinct chunks. URL, title, text and heading context are
    all formatted from ``platform`` and ``idx``.
    """
    if chunk_id:
        resolved_id = chunk_id
    else:
        resolved_id = f"{platform}-chunk-{idx}-{uuid.uuid4().hex[:6]}"

    fields = {
        "chunk_id": resolved_id,
        "source_url": f"https://docs.{platform}.com/page-{idx}",
        "platform": platform,
        "customer_id": customer_id,
        "title": f"{platform.title()} KB Page {idx}",
        "text": f"This is KB content for {platform} page {idx}.",
        "heading_context": f"Overview > Section {idx}",
        "chunk_index": idx,
        "total_chunks": 10,
        "metadata": {},
    }
    return Chunk(**fields)


def _make_embedded(
    platform: str,
    idx: int = 0,
    customer_id: str | None = None,
    chunk_id: str | None = None,
    vector: list[float] | None = None,
) -> EmbeddedChunk:
    """Build an ``EmbeddedChunk`` around a chunk from ``_make_chunk``.

    A falsy ``vector`` (``None`` or an empty list) is replaced by a fresh
    random vector. The embedding model label is always
    ``"gemini-embedding-001"``.
    """
    base_chunk = _make_chunk(
        platform=platform,
        idx=idx,
        customer_id=customer_id,
        chunk_id=chunk_id,
    )
    if not vector:
        vector = _random_vector()
    return EmbeddedChunk(
        chunk=base_chunk,
        vector=vector,
        embedding_model="gemini-embedding-001",
    )


# ──────────────────────────────────────────────────────────────────────────────
# Module-scoped fixture: isolated test collection
# ──────────────────────────────────────────────────────────────────────────────

@pytest.fixture(scope="module")
def test_collection():
    """
    Create a fresh Qdrant test collection before all tests in this module
    and drop it once they have finished.

    Yields:
        The collection name string (``TEST_COLLECTION``).

    Teardown is best-effort: a failed drop never fails the test run, but it
    is reported as a ``RuntimeWarning`` so that orphaned test collections on
    the shared cluster are visible instead of silently accumulating. The
    client created here is always closed, including when collection creation
    itself fails.
    """
    import warnings  # local import: only needed for the teardown diagnostic

    client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
    try:
        client.create_collection(
            TEST_COLLECTION,
            vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),
        )
        yield TEST_COLLECTION
        # Teardown
        try:
            client.delete_collection(TEST_COLLECTION)
        except Exception as exc:  # best-effort cleanup: report, do not raise
            warnings.warn(
                f"Could not drop Qdrant test collection {TEST_COLLECTION!r}: "
                f"{exc!r}. Delete it manually.",
                RuntimeWarning,
                stacklevel=1,
            )
    finally:
        # Release the HTTP/gRPC resources held by this fixture's own client.
        client.close()


# ──────────────────────────────────────────────────────────────────────────────
# Import the module under test AFTER fixture definition
# ──────────────────────────────────────────────────────────────────────────────

from core.kb.qdrant_store import (
    upsert_vectors,
    delete_platform,
    search_platform,
    get_platform_stats,
    _chunk_id_to_uuid,
    _get_client,
)


# ──────────────────────────────────────────────────────────────────────────────
# Story 5.01 Tests — upsert_vectors
# ──────────────────────────────────────────────────────────────────────────────

class TestUpsertVectors:
    """Story 5.01 — black-box (BB) and white-box (WB) checks for upsert_vectors."""

    def test_upsert_5_vectors(self, test_collection):
        """BB: upserting five fresh chunks reports five points written."""
        batch = [_make_embedded("hubspot", n) for n in range(5)]
        written = upsert_vectors(batch, collection=test_collection)
        assert written == 5, f"Expected 5 upserted, got {written}"

    def test_upsert_idempotent(self, test_collection):
        """BB: writing the same chunk twice leaves exactly one matching point."""
        # One fixed chunk_id for both writes, so both are expected to map to
        # the same point id.
        fixed_id = f"idempotent-chunk-{uuid.uuid4().hex[:8]}"
        embedded = _make_embedded("hubspot", 99, chunk_id=fixed_id)

        for _ in range(2):
            upsert_vectors([embedded], collection=test_collection)

        # Count the points whose payload carries this exact chunk_id.
        client = _get_client()
        from qdrant_client.models import Filter, FieldCondition, MatchValue
        by_chunk_id = Filter(
            must=[FieldCondition(key="chunk_id", match=MatchValue(value=fixed_id))]
        )
        matches = client.count(
            collection_name=test_collection,
            count_filter=by_chunk_id,
            exact=True,
        ).count
        assert matches == 1, f"Expected 1 (idempotent), got {matches}"

    def test_upsert_empty(self, test_collection):
        """BB: an empty batch yields a count of 0."""
        assert upsert_vectors([], collection=test_collection) == 0

    def test_point_id_deterministic(self, test_collection):
        """WB: converting the same chunk_id twice gives the same version-5 UUID."""
        chunk_id = "deterministic-test-chunk-abc123"
        first = _chunk_id_to_uuid(chunk_id)
        second = _chunk_id_to_uuid(chunk_id)
        assert first == second, "UUID5 must be deterministic for same chunk_id"
        # The value must also parse as a UUID and be of version 5.
        assert uuid.UUID(first).version == 5

    def test_payload_fields_present(self, test_collection):
        """WB: a stored point exposes every required payload field."""
        embedded = _make_embedded("ghl", 42)
        upsert_vectors([embedded], collection=test_collection)

        fetched = _get_client().retrieve(
            collection_name=test_collection,
            ids=[_chunk_id_to_uuid(embedded.chunk.chunk_id)],
            with_payload=True,
        )
        assert fetched, "Point not found after upsert"
        payload = fetched[0].payload

        for field in (
            "platform", "customer_id", "title", "source_url",
            "text", "heading_context", "chunk_index", "embedding_model", "type",
        ):
            assert field in payload, f"Missing payload field: {field}"

        assert payload["type"] == "PLATFORM_KB"
        assert payload["platform"] == "ghl"
        assert payload["embedding_model"] == "gemini-embedding-001"


# ──────────────────────────────────────────────────────────────────────────────
# Story 5.02 Tests — delete_platform
# ──────────────────────────────────────────────────────────────────────────────

class TestDeletePlatform:
    """Story 5.02 — black-box (BB) checks for delete_platform."""

    def test_delete_platform_removes_all_for_platform(self, test_collection):
        """BB: deleting one platform leaves another platform's points intact."""
        client = _get_client()

        # Three points for the platform to delete, two for the one to keep.
        doomed = [_make_embedded("hubspot_del", n) for n in range(3)]
        kept = [_make_embedded("ghl_del", n) for n in range(2)]
        upsert_vectors([*doomed, *kept], collection=test_collection)

        removed = delete_platform("hubspot_del", collection=test_collection)
        assert removed == 3, f"Expected 3 deleted, got {removed}"

        # The other platform must be untouched.
        from qdrant_client.models import Filter, FieldCondition, MatchValue
        ghl_only = Filter(
            must=[FieldCondition(key="platform", match=MatchValue(value="ghl_del"))]
        )
        survivors = client.count(
            collection_name=test_collection,
            count_filter=ghl_only,
            exact=True,
        ).count
        assert survivors == 2, f"Expected 2 ghl_del remaining, got {survivors}"

    def test_delete_customer_scoped(self, test_collection):
        """BB: passing customer_id removes only that customer's points."""
        customer_id = f"cust-{uuid.uuid4().hex[:6]}"
        platform = "xero_scoped"

        # Two points tagged with the customer, two with no customer_id.
        scoped = [
            _make_embedded(platform, n, customer_id=customer_id) for n in range(2)
        ]
        unscoped = [_make_embedded(platform, n) for n in range(10, 12)]
        upsert_vectors([*scoped, *unscoped], collection=test_collection)

        removed = delete_platform(
            platform, customer_id=customer_id, collection=test_collection
        )
        assert removed == 2, f"Expected 2 deleted (customer-scoped), got {removed}"

        # Points without a customer_id must still be present for the platform.
        client = _get_client()
        from qdrant_client.models import Filter, FieldCondition, MatchValue
        same_platform = Filter(
            must=[FieldCondition(key="platform", match=MatchValue(value=platform))]
        )
        survivors = client.count(
            collection_name=test_collection,
            count_filter=same_platform,
            exact=True,
        ).count
        assert survivors == 2, f"Expected 2 global records remaining, got {survivors}"

    def test_delete_nonexistent_platform(self, test_collection):
        """BB: deleting a platform with no points returns 0 and does not raise."""
        assert delete_platform(
            "nonexistent_platform_xyz", collection=test_collection
        ) == 0


# ──────────────────────────────────────────────────────────────────────────────
# Story 5.03 Tests — search_platform
# ──────────────────────────────────────────────────────────────────────────────

class TestSearchPlatform:
    """Story 5.03 — black-box (BB) and white-box (WB) checks for search_platform."""

    def test_scoped_search_only_returns_correct_platform(self, test_collection):
        """BB: two platforms share a near-identical vector; a scoped search returns one."""
        query_vector = _random_vector()

        def _jitter(vec):
            # Tiny Gaussian noise keeps both stored vectors close to the query.
            return [component + random.gauss(0, 0.0001) for component in vec]

        wanted = _make_embedded("search_hs", 1, vector=_jitter(query_vector))
        other = _make_embedded("search_ghl", 1, vector=_jitter(query_vector))
        upsert_vectors([wanted, other], collection=test_collection)

        hits = search_platform(
            query_vector=query_vector,
            platform="search_hs",
            top_k=10,
            # Threshold 0.0 so a leaked cross-platform hit is not filtered out.
            score_threshold=0.0,
            collection=test_collection,
        )

        seen_platforms = {hit["platform"] for hit in hits}
        assert "search_ghl" not in seen_platforms, (
            "search_platform returned ghl results when scoped to search_hs"
        )
        # The scoped platform itself must appear at least once.
        assert "search_hs" in seen_platforms, (
            "Expected search_hs results but none found"
        )

    def test_search_returns_correct_fields(self, test_collection):
        """WB: the first result dict carries every required key."""
        embedded = _make_embedded("search_fields_test", 0)
        upsert_vectors([embedded], collection=test_collection)

        hits = search_platform(
            query_vector=embedded.vector,  # query with the stored vector itself
            platform="search_fields_test",
            top_k=5,
            score_threshold=0.0,
            collection=test_collection,
        )

        assert hits, "Expected at least one result"
        top_hit = hits[0]
        required_keys = {
            "id", "score", "title", "text", "source_url", "platform", "heading_context",
        }
        missing = required_keys - set(top_hit.keys())
        assert not missing, f"Missing keys: {missing}"

    def test_search_score_threshold_filters_low_scores(self, test_collection):
        """WB: an unrelated random vector does not get past a 0.99 threshold."""
        query_vector = _random_vector()
        # An independent random vector stands in for a dissimilar document.
        unrelated = _random_vector()
        upsert_vectors(
            [_make_embedded("threshold_test", 0, vector=unrelated)],
            collection=test_collection,
        )

        hits = search_platform(
            query_vector=query_vector,
            platform="threshold_test",
            top_k=5,
            score_threshold=0.99,  # only near-perfect matches may pass
            collection=test_collection,
        )
        # Anything that is returned must honour the requested threshold.
        for hit in hits:
            assert hit["score"] >= 0.99, f"Result below threshold: {hit['score']}"


# ──────────────────────────────────────────────────────────────────────────────
# Story 5.04 Tests — get_platform_stats
# ──────────────────────────────────────────────────────────────────────────────

class TestGetPlatformStats:
    """BB + WB tests for Story 5.04 (get_platform_stats)."""

    def test_stats_returns_required_keys(self, test_collection):
        """BB: Stats dict has total, collection, dimension, platforms."""
        stats = get_platform_stats(collection=test_collection)
        for key in ("total", "collection", "dimension", "platforms"):
            assert key in stats, f"Missing stats key: {key}"
        # Compare against the same constant the test collection is created
        # with, instead of repeating the literal size here.
        assert stats["dimension"] == VECTOR_DIM, (
            f"Stats dimension {stats['dimension']} != expected {VECTOR_DIM}"
        )
        assert stats["collection"] == test_collection

    def test_stats_total_is_accurate(self, test_collection):
        """BB: Stats total matches an exact count of the collection."""
        client = _get_client()
        actual_count = client.count(collection_name=test_collection, exact=True).count

        stats = get_platform_stats(collection=test_collection)
        assert stats["total"] == actual_count, (
            f"Stats total {stats['total']} != actual {actual_count}"
        )

    def test_stats_platforms_only_nonzero(self, test_collection):
        """WB: Platforms dict only contains entries with count > 0."""
        stats = get_platform_stats(collection=test_collection)
        for platform, count in stats["platforms"].items():
            assert count > 0, f"Platform {platform} in stats with count 0"


# ──────────────────────────────────────────────────────────────────────────────
# Story 5.05 — Full Lifecycle Integration Test
# ──────────────────────────────────────────────────────────────────────────────

class TestFullLifecycle:
    """Story 5.05 — one pass through insert, search, delete and search again."""

    def test_full_lifecycle(self, test_collection):
        """
        BB full lifecycle test on a uniquely named platform:
        1. Insert 3 chunks whose vectors sit next to a known query vector
        2. Search → all 3 are returned
        3. Delete the platform → 3 reported deleted
        4. Search again → nothing is returned
        """
        platform = f"lifecycle_{uuid.uuid4().hex[:6]}"
        query_vector = _random_vector()

        def _jitter(vec):
            # Very small Gaussian noise: stored vectors stay close to the query.
            return [component + random.gauss(0, 0.00001) for component in vec]

        def _search():
            # Same query before and after the delete; threshold 0.0 accepts all.
            return search_platform(
                query_vector=query_vector,
                platform=platform,
                top_k=10,
                score_threshold=0.0,
                collection=test_collection,
            )

        # Step 1: insert
        batch = [
            _make_embedded(platform, n, vector=_jitter(query_vector))
            for n in range(3)
        ]
        inserted = upsert_vectors(batch, collection=test_collection)
        assert inserted == 3, f"Expected 3 inserted, got {inserted}"

        # Step 2: search finds everything that was inserted
        hits_before = _search()
        assert len(hits_before) == 3, (
            f"Expected 3 results after insert, got {len(hits_before)}"
        )

        # Step 3: delete the whole platform
        deleted = delete_platform(platform, collection=test_collection)
        assert deleted == 3, f"Expected 3 deleted, got {deleted}"

        # Step 4: the same search now comes back empty
        hits_after = _search()
        assert len(hits_after) == 0, (
            f"Expected 0 results after delete, got {len(hits_after)}"
        )


# VERIFICATION_STAMP
# Story: 5.05 — Qdrant Store Integration Tests
# Verified By: parallel-builder (claude-sonnet-4-6)
# Verified At: 2026-02-26
# Tests: 15 test methods across 5 test classes
# Coverage: Stories 5.01–5.04 fully covered (BB + WB)