#!/usr/bin/env python3
"""
Story 4.05 — Embedder Integration Test Suite
=============================================
Tests for MODULE 4: Gemini Embedder

Black-box and white-box tests covering:
  - embed_text()           (4.01)
  - embed_batch()          (4.02)
  - embed_with_cache()     (4.03)
  - build_embedding_text() (4.04)

Tests that call the real Gemini API are skipped when GEMINI_API_KEY is not set.
All unit tests use mocking and fakeredis — no API quota consumed.
"""

import hashlib
import json
import os
import sys
import time
from typing import List
from unittest.mock import MagicMock, patch

import fakeredis
import pytest

# Ensure project root on sys.path. The historical checkout location is kept as
# the default; set GENESIS_ROOT to point at the project root on machines where
# the repository lives elsewhere. An empty value falls back to the default.
sys.path.insert(0, os.getenv("GENESIS_ROOT") or "/mnt/e/genesis-system")

from core.kb.contracts import Chunk, EmbeddedChunk
import core.kb.embedder as embedder_module
from core.kb.embedder import (
    EMBED_MODEL,
    MAX_EMBEDDING_CHARS,
    VECTOR_DIM,
    build_embedding_text,
    embed_batch,
    embed_text,
    embed_with_cache,
)

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# True only when GEMINI_API_KEY is exported with a non-empty value; the
# live-API tests use this flag in their skipif markers.
GEMINI_API_KEY_PRESENT = bool(os.environ.get("GEMINI_API_KEY"))

# Constant-valued stand-in embedding with VECTOR_DIM components.
_FAKE_VECTOR: List[float] = [0.01] * VECTOR_DIM


def _make_chunk(
    text: str = "Sample chunk text.",
    platform: str = "hubspot",
    heading_context: str = "Getting Started > Installation",
    title: str = "Quick Start Guide",
    chunk_id: str = "chunk_001",
    customer_id: str = None,
) -> Chunk:
    """Build a ``Chunk`` fixture, overriding only the fields a test cares about.

    The source URL, chunk position (index 0 of 1) and an empty metadata dict
    are fixed; every other field comes from the keyword arguments.
    """
    fields = {
        "chunk_id": chunk_id,
        "source_url": "https://example.com/docs",
        "platform": platform,
        "customer_id": customer_id,
        "title": title,
        "text": text,
        "heading_context": heading_context,
        # The fixture always models a document that was not split.
        "chunk_index": 0,
        "total_chunks": 1,
        "metadata": {},
    }
    return Chunk(**fields)


def mock_embed_text(text: str) -> List[float]:
    """Fake ``embed_text`` that returns a deterministic VECTOR_DIM-sized vector.

    Args:
        text: Ignored; present only so the signature matches ``embed_text``.

    Returns:
        A fresh list equal to ``_FAKE_VECTOR``. A copy is returned rather than
        the module-level list itself so that a consumer mutating the vector
        in place cannot corrupt the fixture seen by later tests.
    """
    return list(_FAKE_VECTOR)


# ===========================================================================
# Story 4.01 — Single Text Embedder (unit + live)
# ===========================================================================

class TestEmbedText:
    """Checks for embed_text(): constants, input validation, mocked and live calls."""

    # Marker shared by the tests that hit the real Gemini API.
    _live_only = pytest.mark.skipif(
        not GEMINI_API_KEY_PRESENT,
        reason="GEMINI_API_KEY not set — skipping live API test",
    )

    # BB: model identifier constant
    def test_correct_model_used(self):
        """EMBED_MODEL must be the literal string 'gemini-embedding-001'."""
        expected_model = "gemini-embedding-001"
        assert EMBED_MODEL == expected_model

    # BB: API key can be resolved
    def test_api_key_loaded(self):
        """_load_api_key() must yield a non-empty key longer than 10 characters."""
        api_key = embedder_module._load_api_key()
        assert api_key, "GEMINI_API_KEY must be loadable from env or secrets.env"
        assert len(api_key) > 10, "API key appears too short to be valid"

    # BB: live API, short input
    @_live_only
    def test_embed_returns_3072(self):
        """Live call: 'hello world' embeds to a list of VECTOR_DIM floats."""
        # Drop any cached client so this call goes through a freshly built one.
        embedder_module._genai_client = None
        embedding = embed_text("hello world")
        assert isinstance(embedding, list)
        assert len(embedding) == VECTOR_DIM
        for component in embedding:
            assert isinstance(component, float)

    # BB: live API, long input
    @_live_only
    def test_embed_long_text(self):
        """Live call: a 10,000-character input still yields VECTOR_DIM values."""
        embedder_module._genai_client = None
        # 32 chars * 313 repetitions is just over 10,000; trim to exactly 10,000.
        payload = ("Genesis knowledge base content. " * 313)[:10_000]
        embedding = embed_text(payload)
        assert len(embedding) == VECTOR_DIM

    # BB: empty input is rejected
    def test_embed_empty_string(self):
        """An empty string must raise ValueError whose message mentions 'empty'."""
        with pytest.raises(ValueError, match="empty"):
            embed_text("")

    # WB: whitespace-only input is rejected
    def test_embed_whitespace_only(self):
        """Whitespace-only input must raise ValueError mentioning 'empty'."""
        blank = "   \n\t  "
        with pytest.raises(ValueError, match="empty"):
            embed_text(blank)

    # WB: mocked client path
    def test_embed_returns_correct_dim_mocked(self):
        """With the client factory mocked, embed_text returns VECTOR_DIM floats."""
        fake_response = MagicMock()
        fake_response.embeddings[0].values = _FAKE_VECTOR

        fake_client = MagicMock()
        fake_client.models.embed_content.return_value = fake_response

        with patch.object(embedder_module, "_get_genai_client", return_value=fake_client):
            embedding = embed_text("test content")

        assert len(embedding) == VECTOR_DIM
        for component in embedding:
            assert isinstance(component, float)


# ===========================================================================
# Story 4.04 — Build Embedding Text Optimizer
# ===========================================================================

class TestBuildEmbeddingText:
    """Checks on the string produced by build_embedding_text() for a Chunk."""

    # BB: heading context is carried into the output
    def test_build_embedding_text_contains_heading(self):
        """The chunk's heading_context must occur verbatim in the output."""
        heading = "API Reference > Authentication"
        rendered = build_embedding_text(_make_chunk(heading_context=heading))
        assert "API Reference > Authentication" in rendered

    # BB: platform tag leads the output
    def test_build_embedding_text_contains_platform(self):
        """The output must begin with the bracketed platform tag '[hubspot]'."""
        rendered = build_embedding_text(_make_chunk(platform="hubspot"))
        assert rendered.startswith("[hubspot]")

    # BB: oversized text is cut down to the configured limit
    def test_build_embedding_text_truncation(self):
        """A 20,000-character body must come back no longer than MAX_EMBEDDING_CHARS."""
        oversized = _make_chunk(text="x" * 20_000)
        rendered = build_embedding_text(oversized)
        assert len(rendered) <= MAX_EMBEDDING_CHARS

    # BB: runs of spaces are collapsed
    def test_whitespace_normalization(self):
        """Runs of spaces collapse to one; no double space survives anywhere."""
        rendered = build_embedding_text(_make_chunk(text="foo    bar    baz"))
        assert "foo bar baz" in rendered
        assert "  " not in rendered

    # BB: empty heading produces no empty bracket tag
    def test_empty_heading_handled(self):
        """An empty heading_context must not emit '[]'; the title is still present."""
        chunk = _make_chunk(heading_context="")
        rendered = build_embedding_text(chunk)
        assert "[]" not in rendered
        assert chunk.title in rendered

    # WB: title is carried into the output
    def test_title_in_output(self):
        """The chunk title must occur in the output."""
        rendered = build_embedding_text(_make_chunk(title="My Title", text="Some content"))
        assert "My Title" in rendered

    # WB: short body text is carried into the output unchanged
    def test_text_in_output(self):
        """A short body must occur verbatim in the output."""
        rendered = build_embedding_text(_make_chunk(text="Unique content here"))
        assert "Unique content here" in rendered

    # WB: runs of blank lines inside the text are collapsed
    def test_excessive_newlines_collapsed(self):
        """Five consecutive newlines must not leave three or more in a row."""
        rendered = build_embedding_text(_make_chunk(text="line1\n\n\n\n\nline2"))
        assert "\n\n\n" not in rendered


# ===========================================================================
# Story 4.02 — Batch Embedder
# ===========================================================================

class TestEmbedBatch:
    """Tests for embed_batch() with embed_text replaced by mock_embed_text.

    Tests that check per-item properties in a loop first assert that the
    result has one entry per input chunk. Without that guard an empty result
    would skip the loop and the test would pass vacuously.
    """

    def _make_chunks(self, n: int) -> List[Chunk]:
        """Return ``n`` chunks with ids 'chunk_000', 'chunk_001', ... and distinct text."""
        return [_make_chunk(text=f"Chunk content {i}", chunk_id=f"chunk_{i:03d}") for i in range(n)]

    # BB: embed_batch produces EmbeddedChunk objects
    def test_batch_embed_creates_embedded_chunks(self):
        """embed_batch of 5 chunks → 5 EmbeddedChunk objects."""
        chunks = self._make_chunks(5)
        with patch.object(embedder_module, "embed_text", side_effect=mock_embed_text):
            result = embed_batch(chunks, batch_size=10)
        assert len(result) == 5
        assert all(isinstance(ec, EmbeddedChunk) for ec in result)

    # BB: all vectors are exactly VECTOR_DIM
    def test_all_vectors_same_dim(self):
        """Every EmbeddedChunk vector must have exactly VECTOR_DIM components."""
        chunks = self._make_chunks(5)
        with patch.object(embedder_module, "embed_text", side_effect=mock_embed_text):
            result = embed_batch(chunks, batch_size=10)
        # Guard against a vacuous pass on an empty result.
        assert len(result) == len(chunks)
        for ec in result:
            assert len(ec.vector) == VECTOR_DIM

    # BB: embedding_model field is set correctly
    def test_embedded_chunk_has_model(self):
        """Each EmbeddedChunk must have embedding_model == 'gemini-embedding-001'."""
        chunks = self._make_chunks(3)
        with patch.object(embedder_module, "embed_text", side_effect=mock_embed_text):
            result = embed_batch(chunks)
        # Guard against a vacuous pass on an empty result.
        assert len(result) == len(chunks)
        for ec in result:
            assert ec.embedding_model == "gemini-embedding-001"

    # WB: the original Chunk is preserved on EmbeddedChunk
    def test_chunk_preserved_on_embedded(self):
        """Each result must carry its source chunk, in input order."""
        chunks = self._make_chunks(2)
        with patch.object(embedder_module, "embed_text", side_effect=mock_embed_text):
            result = embed_batch(chunks)
        # Guard against a vacuous pass on an empty result.
        assert len(result) == len(chunks)
        for i, ec in enumerate(result):
            assert ec.chunk.chunk_id == f"chunk_{i:03d}"

    # WB: empty batch returns empty list
    def test_empty_batch(self):
        """An empty input list must produce an empty output list."""
        result = embed_batch([])
        assert result == []


# ===========================================================================
# Story 4.03 — Embedding Cache (Redis)
# ===========================================================================

class TestEmbedWithCache:
    """Tests for embed_with_cache() using fakeredis instead of a live Redis.

    Each test patches ``redis.from_url`` so the code under test receives the
    fake client, and replaces ``embed_text`` with ``mock_embed_text`` so no
    API call is made.
    """

    # Key namespace these tests expect embed_with_cache() to use.
    _KEY_PREFIX = "genesis:kb:embed:"

    def _make_fake_redis(self):
        """Return a FakeRedis client bound to its own private FakeServer.

        An explicit server is passed so that state can never leak between
        tests; depending on the fakeredis version, clients created without
        one may share a default server.
        """
        return fakeredis.FakeRedis(server=fakeredis.FakeServer())

    def _full_key(self, cache_key: str) -> str:
        """Return the namespaced Redis key for ``cache_key``."""
        return f"{self._KEY_PREFIX}{cache_key}"

    @staticmethod
    def _text_digest(text: str) -> str:
        """Return the SHA-256 hex digest the tests expect as the default cache key."""
        return hashlib.sha256(text.encode()).hexdigest()

    # BB: cache miss calls embed_text
    def test_cache_miss_calls_api(self):
        """On a cache miss, embed_text must be called once."""
        fake_redis = self._make_fake_redis()

        with patch("redis.from_url", return_value=fake_redis), \
             patch.object(embedder_module, "embed_text", side_effect=mock_embed_text) as mock_embed:
            result = embed_with_cache("brand new text")

        mock_embed.assert_called_once_with("brand new text")
        assert len(result) == VECTOR_DIM

    # BB: cache hit skips embed_text
    def test_cache_hit_skips_api(self):
        """On a cache hit, embed_text must NOT be called."""
        fake_redis = self._make_fake_redis()

        text = "cached content here"
        # Pre-seed the entry under the digest-derived key.
        full_key = self._full_key(self._text_digest(text))
        fake_redis.setex(full_key, 3600, json.dumps(_FAKE_VECTOR))

        with patch("redis.from_url", return_value=fake_redis), \
             patch.object(embedder_module, "embed_text", side_effect=mock_embed_text) as mock_embed:
            result = embed_with_cache(text)

        mock_embed.assert_not_called()
        assert result == _FAKE_VECTOR

    # BB: Redis unavailable → still embeds (no crash)
    def test_redis_unavailable_still_embeds(self):
        """When Redis cannot be reached, embed_with_cache must still return a vector."""
        # NOTE(review): only ping() is made to fail, and with the built-in
        # ConnectionError — confirm this matches how the embedder probes Redis.
        broken_redis = MagicMock()
        broken_redis.ping.side_effect = ConnectionError("Redis down")

        with patch("redis.from_url", return_value=broken_redis), \
             patch.object(embedder_module, "embed_text", side_effect=mock_embed_text):
            result = embed_with_cache("fallback text")

        assert len(result) == VECTOR_DIM

    # WB: custom cache_key is used (not SHA-256 of text)
    def test_custom_cache_key_used(self):
        """When a custom cache_key is provided, it must be used as the key."""
        fake_redis = self._make_fake_redis()

        custom_key = "my_custom_key_xyz"
        # Seed only under the custom key; a hit proves the text digest was not used.
        fake_redis.setex(self._full_key(custom_key), 3600, json.dumps(_FAKE_VECTOR))

        with patch("redis.from_url", return_value=fake_redis), \
             patch.object(embedder_module, "embed_text", side_effect=mock_embed_text) as mock_embed:
            result = embed_with_cache("any text", cache_key=custom_key)

        mock_embed.assert_not_called()
        assert result == _FAKE_VECTOR

    # WB: result is stored in cache after a miss
    def test_cache_stored_after_miss(self):
        """After a cache miss, the vector must be stored in Redis."""
        fake_redis = self._make_fake_redis()

        text = "store me please"
        with patch("redis.from_url", return_value=fake_redis), \
             patch.object(embedder_module, "embed_text", side_effect=mock_embed_text):
            embed_with_cache(text)

        stored = fake_redis.get(self._full_key(self._text_digest(text)))
        assert stored is not None
        assert json.loads(stored) == _FAKE_VECTOR
