#!/usr/bin/env python3
"""
Story 3.05 — Chunker Integration Tests
=======================================
17 tests covering Stories 3.01–3.04.

Run:
    cd /mnt/e/genesis-system
    python3 -m pytest tests/kb/test_m3_chunker_integration.py -v
"""

import re
from datetime import datetime

import pytest

from core.kb.contracts import Chunk, ExtractedContent, PlatformConfig
from core.kb.chunker import (
    chunk_batch,
    chunk_text,
    chunk_with_headings,
    tag_chunks,
)


# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────

def _make_content(
    text: str,
    url: str = "https://example.com/page",
    title: str = "Test Page",
) -> ExtractedContent:
    """Build a minimal ExtractedContent fixture around *text*.

    Headings, code blocks, tables, and metadata are left empty so tests
    exercise only the plain-text chunking path.
    """
    empty_extras = {
        "headings": [],
        "code_blocks": [],
        "tables": [],
        "metadata": {},
    }
    return ExtractedContent(url=url, title=title, text=text, **empty_extras)


def _make_config(
    chunk_size: int = 1500,
    chunk_overlap: int = 200,
    name: str = "hubspot",
) -> PlatformConfig:
    """Build a PlatformConfig fixture with adjustable chunking parameters."""
    fields = {
        "name": name,
        "display_name": "HubSpot",
        "docs_base_url": "https://knowledge.hubspot.com",
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return PlatformConfig(**fields)


def _long_prose(n_chars: int) -> str:
    """Return prose with natural sentence endings of total length ≥ n_chars."""
    sentence = "This is a sample sentence with enough words. "
    repetitions = (n_chars // len(sentence)) + 2
    return (sentence * repetitions)[:n_chars]


# ─────────────────────────────────────────────────────────────────────────────
# 3.01 — chunk_text tests
# ─────────────────────────────────────────────────────────────────────────────

def test_chunk_short_text():
    """A 100-character text produces exactly 1 chunk."""
    base = "Hello world. This is a short piece of text." * 2  # ~86 chars
    result = chunk_text(base[:100], chunk_size=1500, overlap=200)
    assert len(result) == 1
    only_chunk = result[0]
    assert only_chunk.strip() != ""


def test_chunk_long_text():
    """A 5 000-char text with chunk_size=1500 produces multiple chunks."""
    chunks = chunk_text(_long_prose(5000), chunk_size=1500, overlap=200)
    assert len(chunks) > 1
    # Every chunk must be non-empty.
    assert all(c.strip() for c in chunks), "Empty chunk found"


def test_chunk_sentence_boundary():
    """Chunks end at sentence boundaries, not mid-word."""
    # Build text where sentences are clearly demarcated.
    sentences = [f"Sentence number {i} ends right here." for i in range(50)]
    text = " ".join(sentences)
    chunks = chunk_text(text, chunk_size=300, overlap=50)
    # Compare against the set of *whole* words. The previous check used
    # `word in text`, which is a substring test: a truncated fragment like
    # "Sen" is a substring of "Sentence", so the assertion could never fail
    # and mid-word cuts went undetected.
    original_words = set(text.split())
    for chunk in chunks:
        for word in chunk.split():
            assert word in original_words, f"Word '{word}' not in original text"


def test_overlap_content():
    """Consecutive chunks share overlapping text."""
    chunks = chunk_text(_long_prose(4000), chunk_size=1000, overlap=200)
    # At least one pair of consecutive chunks must share some words.
    assert len(chunks) >= 2
    consecutive_pairs = zip(chunks, chunks[1:])
    found_overlap = any(
        set(a.split()) & set(b.split()) for a, b in consecutive_pairs
    )
    assert found_overlap, "No overlapping words found between consecutive chunks"


def test_chunk_size_respected():
    """No chunk (from chunk_text) exceeds chunk_size characters."""
    chunk_size = 800
    produced = chunk_text(_long_prose(6000), chunk_size=chunk_size, overlap=100)
    for idx, c in enumerate(produced):
        within_limit = len(c) <= chunk_size
        assert within_limit, (
            f"Chunk {idx} is {len(c)} chars, exceeds limit {chunk_size}"
        )


# ─────────────────────────────────────────────────────────────────────────────
# 3.02 — chunk_with_headings tests
# ─────────────────────────────────────────────────────────────────────────────

def test_heading_context_preserved():
    """Chunks under a heading have heading_context populated."""
    body = "\n\n".join(
        [
            "# Getting Started",
            _long_prose(3000),
            "## Installation",
            _long_prose(1000),
        ]
    )
    content = _make_content(body)
    chunks = chunk_with_headings(content, platform="hubspot")
    # At least some chunks should have a non-empty heading_context.
    has_context = any(c.heading_context for c in chunks)
    assert has_context, "No heading_context found in any chunk"
    # Chunks under '## Installation' should mention it.
    install_chunks = [c for c in chunks if "Installation" in c.heading_context]
    assert install_chunks, "No chunks found with 'Installation' in heading_context"


def test_code_block_intact():
    """A code block smaller than chunk_size is kept together in one chunk."""
    code = "```python\ndef hello():\n    return 'world'\n```"
    text = "Some prose before. " + code + " Some prose after."
    content = _make_content(text)
    chunks = chunk_with_headings(content, platform="hubspot", chunk_size=1500)
    # The code block must appear as a whole in exactly one chunk. Require
    # both fences in the filter: the previous clause `"```" in c.text` was
    # redundant ("```python" already contains "```"), so a chunk holding
    # only the *opening* fence satisfied the filter.
    matching = [
        c for c in chunks
        if "```python" in c.text and c.text.count("```") >= 2
    ]
    assert matching, "Code block not found intact in any chunk"
    for m in matching:
        # The block must open and close.
        assert m.text.count("```") >= 2


def test_deterministic_ids():
    """Same content always produces the same chunk IDs."""
    content = _make_content(_long_prose(3000) + "\n\nMore content here.")
    first_run = chunk_with_headings(content, platform="hubspot")
    second_run = chunk_with_headings(content, platform="hubspot")
    ids_a = [c.chunk_id for c in first_run]
    ids_b = [c.chunk_id for c in second_run]
    assert ids_a == ids_b, "chunk_ids are not deterministic"


# ─────────────────────────────────────────────────────────────────────────────
# 3.03 — tag_chunks tests
# ─────────────────────────────────────────────────────────────────────────────

def test_platform_tag():
    """All chunks returned by tag_chunks have the correct platform."""
    content = _make_content(_long_prose(3000))
    untagged = chunk_with_headings(content, platform="raw")
    for c in tag_chunks(untagged, platform="ghl"):
        assert c.platform == "ghl", f"Expected platform='ghl', got '{c.platform}'"


def test_customer_id_isolation():
    """customer_id is set on all chunks when provided."""
    cid = "customer-abc-123"
    content = _make_content(_long_prose(2000))
    base_chunks = chunk_with_headings(content, platform="hubspot")
    tagged = tag_chunks(base_chunks, platform="hubspot", customer_id=cid)
    assert all(c.customer_id == cid for c in tagged)


def test_global_kb():
    """customer_id=None represents shared/global knowledge base."""
    content = _make_content(_long_prose(2000))
    shared = tag_chunks(
        chunk_with_headings(content, platform="hubspot"),
        platform="hubspot",
        customer_id=None,
    )
    assert all(c.customer_id is None for c in shared)


def test_metadata_timestamp():
    """ingested_at is present in metadata and is a valid ISO 8601 timestamp."""
    content = _make_content(_long_prose(1500))
    tagged = tag_chunks(
        chunk_with_headings(content, platform="hubspot"),
        platform="hubspot",
    )
    for c in tagged:
        assert "ingested_at" in c.metadata, "ingested_at missing from metadata"
        # Must parse as ISO 8601 (e.g., "2026-02-26T12:34:56Z").
        datetime.strptime(c.metadata["ingested_at"], "%Y-%m-%dT%H:%M:%SZ")


# ─────────────────────────────────────────────────────────────────────────────
# 3.04 — chunk_batch tests
# ─────────────────────────────────────────────────────────────────────────────

def test_batch_3_contents():
    """chunk_batch with 3 non-empty contents returns N total chunks (N > 0)."""
    pages = []
    for i in range(3):
        page = _make_content(_long_prose(2000), url=f"https://example.com/page{i}")
        pages.append(page)
    produced = chunk_batch(pages, _make_config())
    assert len(produced) > 0, "Expected at least one chunk from 3 contents"


def test_batch_with_empty():
    """chunk_batch with 2 good + 1 empty content returns chunks from 2 only."""
    contents = [
        _make_content(_long_prose(2000), url="https://example.com/g1"),
        _make_content("", url="https://example.com/empty"),
        _make_content(_long_prose(1500), url="https://example.com/g2"),
    ]
    chunks = chunk_batch(contents, _make_config())
    source_urls = {c.source_url for c in chunks}
    assert "https://example.com/empty" not in source_urls, (
        "Empty content should not produce any chunks"
    )
    assert "https://example.com/g1" in source_urls
    assert "https://example.com/g2" in source_urls


# ─────────────────────────────────────────────────────────────────────────────
# Edge-case / invariant tests
# ─────────────────────────────────────────────────────────────────────────────

def test_no_empty_chunks():
    """Every chunk produced has non-empty text."""
    text = "".join(
        [
            "# Section One\n\n",
            _long_prose(4000),
            "\n\n## Section Two\n\n",
            _long_prose(2000),
        ]
    )
    content = _make_content(text)
    produced = chunk_with_headings(
        content, platform="hubspot", chunk_size=800, overlap=100
    )
    for idx, c in enumerate(produced):
        assert c.text.strip(), f"Chunk {idx} has empty text"


def test_chunk_reassembly():
    """
    Reassembled text from chunks approximates the original.

    Because of overlap, the original text is contained *within* the
    concatenation of all chunk texts — we verify that all non-overlapping
    words from the original appear in at least one chunk.
    """
    original = _long_prose(3000)
    chunks = chunk_with_headings(
        _make_content(original), platform="hubspot", chunk_size=800, overlap=100
    )
    # Every word from the original should appear in the combined text.
    combined_words = set(" ".join(c.text for c in chunks).split())
    original_words = set(original.split())
    missing = original_words - combined_words
    # Allow a small tolerance for whitespace normalisation at boundaries.
    miss_ratio = len(missing) / max(len(original_words), 1)
    assert miss_ratio < 0.05, (
        f"{len(missing)} words from original not found in reassembled text"
    )


def test_idempotent_chunking():
    """Same input always produces the same chunks with identical IDs."""
    parts = [
        "# Heading One\n\n",
        _long_prose(3000),
        "\n\n## Heading Two\n\n",
        _long_prose(1500),
    ]
    content = _make_content("".join(parts))

    run1 = chunk_with_headings(content, platform="hubspot")
    run2 = chunk_with_headings(content, platform="hubspot")

    assert len(run1) == len(run2), "Different number of chunks across runs"
    for a, b in zip(run1, run2):
        assert a.chunk_id == b.chunk_id, f"chunk_id mismatch: {a.chunk_id} != {b.chunk_id}"
        assert a.text == b.text, "chunk text differs across runs"
