#!/usr/bin/env python3
"""
Story 8.06 — Orchestrator Integration Tests
=============================================
All external calls (HTTP, Gemini API, Qdrant, PostgreSQL) are fully mocked.
No real network traffic, API keys, or database connections required.

Test summary (28 tests):

Class TestIngestPlatform (6 tests — black-box):
  BB  test_full_pipeline_10_pages          — 10 URLs → all steps called in order
  BB  test_max_pages_limits                — max_pages=5 → only 5 processed
  BB  test_force_refresh_skips_hash        — force_refresh=True → no get_content_hashes
  BB  test_unknown_platform_error          — Unknown platform → raises ValueError
  BB  test_skips_unchanged_pages           — 3 of 10 unchanged → only 7 fetched
  BB  test_stats_returned                  — Stats dict has all required keys

Class TestIngestUrl (3 tests — black-box):
  BB  test_single_url_pipeline             — 1 URL → full pipeline → stats
  BB  test_invalid_url_error              — non-200 → stats show error
  BB  test_customer_id_passed              — customer_id flows through to tagged chunks

Class TestCli (4 tests — black-box):
  BB  test_cli_list                        — 'list' → shows registered platforms
  BB  test_cli_ingest                      — 'ingest telnyx --max-pages 3' → calls ingest_platform
  BB  test_cli_status                      — 'status telnyx' → shows ingestion history
  BB  test_cli_unknown_command             — Unknown subcommand → error message + exit 1

Class TestProgressReporting (3 tests — black-box + white-box):
  BB  test_progress_format                 — Step name + percentage appears in stderr output
  BB  test_progress_callback               — Callback receives (step, current, total, message)
  WB  test_zero_total_safe                 — 0 total → no ZeroDivisionError

Class TestErrorRecovery (5 tests — black-box + white-box):
  BB  test_partial_fetch_failure           — Some pages 404 → others continue
  BB  test_embed_failure_continues         — Embed raises on 1 chunk → others embedded
  BB  test_errors_in_stats                 — stats["errors"] and error_details populated
  WB  test_pg_upsert_failure_logged        — PG upsert fails gracefully, pipeline continues
  WB  test_qdrant_upsert_failure_logged    — Qdrant upsert fails gracefully, pipeline continues

Class TestWhiteBox (7 tests — white-box):
  WB  test_finalize_stats_stamps_duration  — duration_seconds > 0 after pipeline
  WB  test_finalize_stats_status_default   — status defaults to 'completed'
  WB  test_report_progress_stderr         — output goes to stderr not stdout
  WB  test_stats_has_error_details_list   — error_details is always a list
  WB  test_ingest_url_returns_dict        — ingest_url returns dict with correct keys
  WB  test_platform_name_normalised        — platform name in stats matches config.name
  WB  test_log_complete_called             — log_ingestion_complete called after pipeline
"""

from __future__ import annotations

import asyncio
import contextlib
import io
import json
import sys
from typing import Any, Dict, List, Optional
from unittest.mock import AsyncMock, MagicMock, patch, call

import pytest

# ── Ensure project root on path ───────────────────────────────────────────────
# Hard-coded dev-checkout path so the suite can import core.kb.* when run
# outside the package install; adjust if the repo moves.
_PROJECT_ROOT = "/mnt/e/genesis-system"
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

from core.kb.contracts import (
    Chunk,
    EmbeddedChunk,
    ExtractedContent,
    FetchedPage,
    PlatformConfig,
)
from core.kb.orchestrator import (
    _build_parser,
    _cli_list,
    _cli_status,
    _finalize_stats,
    _report_progress,
    ingest_platform,
    ingest_url,
)

# ──────────────────────────────────────────────────────────────────────────────
# Factories and shared fixtures
# ──────────────────────────────────────────────────────────────────────────────

def _make_fetched_page(url: str, status: int = 200, html: str = "<html><body>hello</body></html>") -> FetchedPage:
    """Build a minimal FetchedPage; defaults model a successful HTML fetch."""
    fields = {
        "url": url,
        "html": html,
        "status_code": status,
        "content_type": "text/html",
        "headers": {},
        "fetched_at": "2026-02-26T00:00:00Z",
    }
    return FetchedPage(**fields)


def _make_extracted(url: str) -> ExtractedContent:
    """Build an ExtractedContent carrying ~100 words of repeated filler text."""
    filler = "This is some test content for the KB pipeline. " * 10
    return ExtractedContent(
        url=url,
        title="Test Page",
        text=filler,
        headings=["# Test Page"],
        code_blocks=[],
        tables=[],
        metadata={"status_code": 200},
    )


def _make_chunk(url: str, idx: int = 0) -> Chunk:
    """Build a single-chunk Chunk; chunk_id embeds *idx* and the URL's last 8 chars."""
    tail = url[-8:]
    return Chunk(
        chunk_id=f"chunk_{idx}_{tail}",
        source_url=url,
        platform="telnyx",
        customer_id=None,
        title="Test Page",
        text="Test content chunk.",
        heading_context="# Test Page",
        chunk_index=idx,
        total_chunks=1,
        metadata={},
    )


def _make_embedded(url: str, idx: int = 0) -> EmbeddedChunk:
    """Wrap a test chunk in an EmbeddedChunk with a constant 3072-dim vector."""
    constant_vector = [0.1 for _ in range(3072)]
    return EmbeddedChunk(
        chunk=_make_chunk(url, idx),
        vector=constant_vector,
        embedding_model="gemini-embedding-001",
    )


def _make_pages(n: int, base: str = "https://docs.telnyx.com/page") -> List[FetchedPage]:
    """Return *n* fetched pages whose URLs are base0 .. base{n-1}."""
    return [_make_fetched_page(base + str(i)) for i in range(n)]


def _make_extracted_list(pages: List[FetchedPage]) -> List[Optional[ExtractedContent]]:
    """Map each fetched page to a successful extraction (never None here)."""
    return [_make_extracted(page.url) for page in pages]


def _make_embedded_list(pages: List[FetchedPage]) -> List[EmbeddedChunk]:
    """One embedded chunk per page, chunk_index following page order."""
    return [_make_embedded(page.url, position) for position, page in enumerate(pages)]


# ──────────────────────────────────────────────────────────────────────────────
# Patch targets — all external calls are patched at the orchestrator's import path
# ──────────────────────────────────────────────────────────────────────────────

# Names of every external dependency the orchestrator imports. Each one is
# patched at the orchestrator's OWN import site ("core.kb.orchestrator.<name>")
# so the patch takes effect regardless of where the symbol was defined.
_ORCHESTRATOR_DEPS = (
    "fetch_sitemap",
    "fetch_pages",
    "filter_urls",
    "filter_unchanged",
    "extract_batch",
    "chunk_batch",
    "tag_chunks",
    "embed_batch",
    "upsert_vectors",
    "ensure_schema",
    "get_connection",
    "upsert_page",
    "upsert_pages_batch",
    "log_ingestion_start",
    "log_ingestion_complete",
    "get_content_hashes",
    "get_ingestion_history",
    "compute_content_hash",
    "get_platform",
    "list_platforms",
)

# dep name -> full patch target path; generated so a key can never drift
# out of sync with its target string.
_PATCH: Dict[str, str] = {
    name: f"core.kb.orchestrator.{name}" for name in _ORCHESTRATOR_DEPS
}

# A minimal platform config reused across many tests
_TELNYX_CONFIG = PlatformConfig(
    name="telnyx",
    display_name="Telnyx",
    docs_base_url="https://developers.telnyx.com",
    sitemap_url="https://developers.telnyx.com/sitemap.xml",
    url_patterns=["https://developers.telnyx.com/*"],  # include everything under docs host
    exclude_patterns=[],
    chunk_size=1500,
    chunk_overlap=200,
    max_pages=5000,     # platform-level ceiling; tests pass smaller max_pages explicitly
    refresh_hours=168,  # re-ingest cadence: one week
)


def _mock_conn():
    """Return a MagicMock standing in for a psycopg2 connection; close() is a no-op mock."""
    connection = MagicMock()
    connection.close = MagicMock()
    return connection


def _all_mocks(n_pages: int = 10, known_hashes: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
    """
    Return a dict of mocks — keyed like ``_PATCH`` — covering ALL external
    dependencies, with sane defaults for an *n_pages*-page ingestion.

    Tests override individual entries before applying the patches to
    simulate specific scenarios (failures, skips, custom page sets).

    Args:
        n_pages: how many sitemap URLs / fetched pages the defaults produce.
        known_hashes: url -> content-hash mapping returned by the
            ``get_content_hashes`` mock; defaults to empty (nothing cached).
    """
    pages = _make_pages(n_pages)
    extracted = _make_extracted_list(pages)
    chunks = [_make_chunk(p.url, i) for i, p in enumerate(pages)]

    if known_hashes is None:
        known_hashes = {}

    return {
        "fetch_sitemap":         AsyncMock(return_value=[p.url for p in pages]),
        "fetch_pages":           AsyncMock(return_value=pages),
        # pass-through: keep whatever URL list the orchestrator supplies
        "filter_urls":           MagicMock(side_effect=lambda urls, inc, exc: urls),
        "filter_unchanged":      AsyncMock(return_value=pages),
        "extract_batch":         MagicMock(return_value=extracted),
        "chunk_batch":           MagicMock(return_value=chunks),
        "tag_chunks":            MagicMock(return_value=chunks),
        # NOTE: default embed mock yields a single embedded chunk regardless
        # of input size; tests needing per-page vectors override this entry.
        "embed_batch":           MagicMock(return_value=[_make_embedded(pages[0].url)]),
        "upsert_vectors":        MagicMock(return_value=len(pages)),
        "ensure_schema":         MagicMock(),
        "get_connection":        MagicMock(return_value=_mock_conn()),
        "upsert_page":           MagicMock(return_value=1),
        "upsert_pages_batch":    MagicMock(return_value=10),
        "log_ingestion_start":   MagicMock(return_value=42),  # fake ingestion-log row id
        "log_ingestion_complete":MagicMock(),
        "get_content_hashes":    MagicMock(return_value=known_hashes),
        "get_ingestion_history": MagicMock(return_value=[]),
        "compute_content_hash":  MagicMock(return_value="abc123"),
        "get_platform":          MagicMock(return_value=_TELNYX_CONFIG),
        "list_platforms":        MagicMock(return_value=["hubspot", "telnyx"]),
    }


# ──────────────────────────────────────────────────────────────────────────────
# Class TestIngestPlatform
# ──────────────────────────────────────────────────────────────────────────────

class TestIngestPlatform:
    """BB tests for ingest_platform() — all external calls mocked.

    Every test builds a mock dict via _all_mocks(), optionally overrides
    individual entries, then applies ALL patches through a single ExitStack
    (see _patched) instead of repeating an 18-line ``with patch(...)`` chain.
    """

    @staticmethod
    def _patched(mocks: Dict[str, Any]) -> contextlib.ExitStack:
        """Return an ExitStack that patches every dependency named in *mocks*
        at its core.kb.orchestrator import path (see _PATCH)."""
        stack = contextlib.ExitStack()
        for name, mock in mocks.items():
            stack.enter_context(patch(_PATCH[name], mock))
        return stack

    def test_full_pipeline_10_pages(self):
        """BB — 10 URLs: verify all pipeline steps are called."""
        mocks = _all_mocks(n_pages=10)

        with self._patched(mocks):
            stats = asyncio.run(ingest_platform("telnyx"))

        # Pipeline steps were all invoked
        mocks["get_platform"].assert_called_once()
        mocks["fetch_sitemap"].assert_called_once()
        mocks["fetch_pages"].assert_called_once()
        mocks["extract_batch"].assert_called_once()
        mocks["chunk_batch"].assert_called_once()
        mocks["tag_chunks"].assert_called_once()
        mocks["embed_batch"].assert_called()
        mocks["upsert_vectors"].assert_called()
        mocks["log_ingestion_start"].assert_called_once()
        mocks["log_ingestion_complete"].assert_called_once()

        # Stats returned
        assert isinstance(stats, dict)
        assert stats["platform"] == "telnyx"

    def test_max_pages_limits(self):
        """BB — max_pages=5 caps URLs at 5 before fetching."""
        # Sitemap yields 20 URLs but the run requests max_pages=5.
        pages_20 = _make_pages(20)
        mocks = _all_mocks(n_pages=20)
        mocks["fetch_sitemap"] = AsyncMock(return_value=[p.url for p in pages_20])
        mocks["fetch_pages"] = AsyncMock(return_value=pages_20[:5])
        mocks["filter_unchanged"] = AsyncMock(return_value=pages_20[:5])
        mocks["extract_batch"] = MagicMock(return_value=_make_extracted_list(pages_20[:5]))

        with self._patched(mocks):
            asyncio.run(ingest_platform("telnyx", max_pages=5))

        # fetch_pages should receive at most 5 URLs
        call_args = mocks["fetch_pages"].call_args
        urls_passed = call_args[0][0]  # positional arg 0 = urls list
        assert len(urls_passed) == 5

    def test_force_refresh_skips_hash(self):
        """BB — force_refresh=True means get_content_hashes is NOT called."""
        mocks = _all_mocks()

        with self._patched(mocks):
            asyncio.run(ingest_platform("telnyx", force_refresh=True))

        # With force_refresh, get_content_hashes must NOT be called
        mocks["get_content_hashes"].assert_not_called()

    def test_unknown_platform_error(self):
        """BB — Unknown platform name raises ValueError."""
        with patch(_PATCH["get_platform"], MagicMock(return_value=None)), \
             patch(_PATCH["list_platforms"], MagicMock(return_value=["telnyx"])):
            with pytest.raises(ValueError, match="Unknown platform"):
                asyncio.run(ingest_platform("nonexistent_xyz"))

    def test_skips_unchanged_pages(self):
        """BB — 3 of 10 pages unchanged → filter_unchanged returns 7."""
        pages_10 = _make_pages(10)
        pages_7 = pages_10[:7]   # only 7 changed

        # Simulate known_hashes for the last 3 pages
        known_hashes = {pages_10[i].url: f"hash{i}" for i in range(7, 10)}
        mocks = _all_mocks(known_hashes=known_hashes)

        # Pre-filter skips 3 known URLs; fetch only receives 7 URLs
        # and returns the 7 corresponding pages (none unchanged at this point)
        mocks["fetch_sitemap"] = AsyncMock(return_value=[p.url for p in pages_10])
        mocks["fetch_pages"] = AsyncMock(return_value=pages_7)
        mocks["filter_unchanged"] = AsyncMock(return_value=pages_7)
        mocks["extract_batch"] = MagicMock(return_value=_make_extracted_list(pages_7))

        with self._patched(mocks):
            stats = asyncio.run(ingest_platform("telnyx"))

        assert stats["pages_skipped"] == 3

    def test_stats_returned(self):
        """BB — Stats dict contains all required keys."""
        mocks = _all_mocks()

        with self._patched(mocks):
            stats = asyncio.run(ingest_platform("telnyx"))

        required_keys = {
            "platform", "pages_fetched", "pages_skipped",
            "chunks_created", "vectors_upserted",
            "errors", "error_details", "duration_seconds", "status",
        }
        assert required_keys.issubset(set(stats.keys())), (
            f"Missing keys: {required_keys - set(stats.keys())}"
        )
        assert isinstance(stats["error_details"], list)
        assert stats["duration_seconds"] >= 0


# ──────────────────────────────────────────────────────────────────────────────
# Class TestIngestUrl
# ──────────────────────────────────────────────────────────────────────────────

class TestIngestUrl:
    """BB tests for ingest_url() — all external calls mocked.

    Patches are applied in one ExitStack (_patched) from the mock dict built
    by _base_mocks(), replacing the repeated multi-line ``with patch`` chains.
    """

    @staticmethod
    def _patched(mocks: Dict[str, Any]) -> contextlib.ExitStack:
        """Return an ExitStack patching each dependency named in *mocks*."""
        stack = contextlib.ExitStack()
        for name, mock in mocks.items():
            stack.enter_context(patch(_PATCH[name], mock))
        return stack

    def _base_mocks(self, page: FetchedPage) -> Dict[str, Any]:
        """Mocks for the single-URL pipeline: *page* flows fetch → extract →
        chunk → tag → embed → upsert with exactly one item per stage."""
        extracted = _make_extracted(page.url)
        chunk = _make_chunk(page.url)
        embedded = _make_embedded(page.url)
        return {
            "get_platform":          MagicMock(return_value=_TELNYX_CONFIG),
            "fetch_pages":           AsyncMock(return_value=[page]),
            "extract_batch":         MagicMock(return_value=[extracted]),
            "chunk_batch":           MagicMock(return_value=[chunk]),
            "tag_chunks":            MagicMock(return_value=[chunk]),
            "embed_batch":           MagicMock(return_value=[embedded]),
            "upsert_vectors":        MagicMock(return_value=1),
            "ensure_schema":         MagicMock(),
            "get_connection":        MagicMock(return_value=_mock_conn()),
            "upsert_page":           MagicMock(return_value=1),
            "upsert_pages_batch":    MagicMock(return_value=1),
            "compute_content_hash":  MagicMock(return_value="deadbeef"),
        }

    def test_single_url_pipeline(self):
        """BB — Single URL flows through all pipeline steps and returns stats."""
        url = "https://developers.telnyx.com/docs/voice"
        mocks = self._base_mocks(_make_fetched_page(url))

        with self._patched(mocks):
            stats = asyncio.run(ingest_url(url, platform="telnyx"))

        assert stats["pages_fetched"] == 1
        assert stats["vectors_upserted"] == 1
        assert stats["errors"] == 0
        assert "duration_seconds" in stats

    def test_invalid_url_error(self):
        """BB — 404 response → stats show 1 error, 0 pages_fetched."""
        url = "https://developers.telnyx.com/does-not-exist"
        page = _make_fetched_page(url, status=404, html="")
        # The pipeline stops at the fetch step, so the downstream mocks in
        # the base dict are never exercised.
        mocks = self._base_mocks(page)

        with self._patched(mocks):
            stats = asyncio.run(ingest_url(url, platform="telnyx"))

        assert stats["pages_fetched"] == 0
        assert stats["errors"] >= 1
        assert len(stats["error_details"]) >= 1
        assert stats["error_details"][0]["step"] == "fetch"

    def test_customer_id_passed(self):
        """BB — customer_id flows through to chunk_batch and tag_chunks."""
        url = "https://developers.telnyx.com/docs/page"
        mocks = self._base_mocks(_make_fetched_page(url))

        with self._patched(mocks):
            asyncio.run(ingest_url(url, platform="telnyx", customer_id="cust_001"))

        # customer_id must be passed to both chunk_batch and tag_chunks
        chunk_batch_call = mocks["chunk_batch"].call_args
        assert chunk_batch_call.kwargs.get("customer_id") == "cust_001" or \
               (len(chunk_batch_call.args) >= 3 and chunk_batch_call.args[2] == "cust_001"), \
               "chunk_batch not called with customer_id='cust_001'"

        tag_chunks_call = mocks["tag_chunks"].call_args
        # tag_chunks(chunks, platform, customer_id)
        assert "cust_001" in tag_chunks_call.args or \
               tag_chunks_call.kwargs.get("customer_id") == "cust_001", \
               "tag_chunks not called with customer_id='cust_001'"


# ──────────────────────────────────────────────────────────────────────────────
# Class TestCli
# ──────────────────────────────────────────────────────────────────────────────

class TestCli:
    """BB tests for the argparse CLI interface."""

    def test_cli_list(self, capsys):
        """BB — 'list' command prints every registered platform name."""
        with patch(_PATCH["list_platforms"], return_value=["hubspot", "telnyx", "xero"]):
            _cli_list()

        streams = capsys.readouterr()
        printed = streams.out + streams.err  # accept either stdout or stderr
        assert "hubspot" in printed
        assert "telnyx" in printed

    def test_cli_ingest(self):
        """BB — 'ingest telnyx --max-pages 3' parses into the expected namespace."""
        ns = _build_parser().parse_args(["ingest", "telnyx", "--max-pages", "3"])
        assert ns.command == "ingest"
        assert ns.platform == "telnyx"
        assert ns.max_pages == 3

    def test_cli_status(self, capsys):
        """BB — 'status telnyx' reads ingestion history and prints it."""
        fake_history = [
            {
                "started_at": "2026-02-26T00:00:00Z",
                "status": "completed",
                "pages_fetched": 50,
                "chunks_created": 200,
                "vectors_upserted": 200,
                "errors": 0,
            }
        ]

        with patch(_PATCH["get_platform"], return_value=_TELNYX_CONFIG), \
             patch(_PATCH["get_connection"], return_value=_mock_conn()), \
             patch(_PATCH["get_ingestion_history"], return_value=fake_history):
            _cli_status("telnyx")

        streams = capsys.readouterr()
        printed = streams.out + streams.err
        assert "completed" in printed or "telnyx" in printed.lower()

    def test_cli_unknown_command(self):
        """BB — Unknown subcommand makes argparse raise SystemExit."""
        with pytest.raises(SystemExit):
            _build_parser().parse_args(["definitely-not-a-command"])


# ──────────────────────────────────────────────────────────────────────────────
# Class TestProgressReporting
# ──────────────────────────────────────────────────────────────────────────────

class TestProgressReporting:
    """Tests for _report_progress()."""

    def test_progress_format(self, capsys):
        """BB — Progress line on stderr carries step name, counter and percentage."""
        _report_progress("fetch", 5, 10)
        err_stream = capsys.readouterr().err  # progress must go to stderr
        assert "fetch" in err_stream
        assert "5/10" in err_stream
        assert "50" in err_stream  # the computed percentage

    def test_progress_callback(self):
        """BB — Callback receives (step, current, total, message)."""
        seen = []
        _report_progress("embed", 3, 7, message="test msg",
                         callback=lambda step, cur, tot, msg: seen.append((step, cur, tot, msg)))
        assert seen == [("embed", 3, 7, "test msg")]

    def test_zero_total_safe(self):
        """WB — total=0 must not raise ZeroDivisionError."""
        for current in (0, 1):
            _report_progress("init", current, 0)


# ──────────────────────────────────────────────────────────────────────────────
# Class TestErrorRecovery
# ──────────────────────────────────────────────────────────────────────────────

class TestErrorRecovery:
    """Tests for pipeline resilience: partial failures don't crash the run.

    All five tests patch the same 18 orchestrator dependencies; the shared
    ``_patch_pipeline`` helper replaces the previous five copy-pasted
    ``with patch(...)`` stacks so the patch list lives in one place.
    """

    # Every orchestrator dependency that must be patched for a full run.
    _PIPELINE_KEYS = (
        "get_platform", "fetch_sitemap", "fetch_pages", "filter_urls",
        "filter_unchanged", "extract_batch", "chunk_batch", "tag_chunks",
        "embed_batch", "upsert_vectors", "ensure_schema", "get_connection",
        "upsert_page", "upsert_pages_batch", "log_ingestion_start",
        "log_ingestion_complete", "get_content_hashes", "compute_content_hash",
    )

    @classmethod
    def _patch_pipeline(cls, mocks):
        """Return a context manager applying patch(_PATCH[k], mocks[k]) for
        every pipeline key.  ExitStack unwinds all patches on exit."""
        from contextlib import ExitStack  # local import: module header not touched
        stack = ExitStack()
        for key in cls._PIPELINE_KEYS:
            stack.enter_context(patch(_PATCH[key], mocks[key]))
        return stack

    def test_partial_fetch_failure(self):
        """BB — Some pages return 404; successful pages continue through pipeline."""
        pages = [
            _make_fetched_page("https://developers.telnyx.com/ok1"),
            _make_fetched_page("https://developers.telnyx.com/ok2"),
            _make_fetched_page("https://developers.telnyx.com/fail", status=404, html=""),
        ]
        mocks = _all_mocks(n_pages=3)
        mocks["fetch_sitemap"] = AsyncMock(return_value=[p.url for p in pages])
        mocks["fetch_pages"] = AsyncMock(return_value=pages)
        mocks["filter_unchanged"] = AsyncMock(return_value=[p for p in pages if p.status_code == 200])
        good_extracted = _make_extracted_list([p for p in pages if p.status_code == 200])
        mocks["extract_batch"] = MagicMock(return_value=good_extracted)

        with self._patch_pipeline(mocks):
            stats = asyncio.run(ingest_platform("telnyx"))

        # Pipeline completed (no exception), errors recorded for failed page
        assert stats["pages_fetched"] == 2  # only 2 successful
        assert stats["errors"] >= 1          # 1 error for 404 page
        assert any(d["step"] == "fetch" for d in stats["error_details"])

    def test_embed_failure_continues(self):
        """BB — Embed raises; pipeline records error and continues."""
        pages = _make_pages(2)
        chunks = [_make_chunk(p.url, i) for i, p in enumerate(pages)]

        # embed_batch raises (e.g. API outage) — orchestrator catches and records error
        def embed_side_effect(chunk_list, batch_size=50):
            raise RuntimeError("Gemini API error")

        mocks = _all_mocks()
        mocks["fetch_sitemap"] = AsyncMock(return_value=[p.url for p in pages])
        mocks["fetch_pages"] = AsyncMock(return_value=pages)
        mocks["filter_unchanged"] = AsyncMock(return_value=pages)
        mocks["extract_batch"] = MagicMock(return_value=_make_extracted_list(pages))
        mocks["chunk_batch"] = MagicMock(return_value=chunks)
        mocks["tag_chunks"] = MagicMock(return_value=chunks)
        mocks["embed_batch"] = MagicMock(side_effect=embed_side_effect)

        with self._patch_pipeline(mocks):
            stats = asyncio.run(ingest_platform("telnyx"))

        # Pipeline did not crash; embed error was recorded
        assert stats["errors"] >= 1
        assert any(d["step"] == "embed" for d in stats["error_details"])
        # All embeds failed — chunks_created is 0
        assert stats["chunks_created"] == 0

    def test_errors_in_stats(self):
        """BB — stats["errors"] and error_details are populated on failures."""
        # Simulate a 500 page to trigger an error
        pages = [_make_fetched_page("https://developers.telnyx.com/bad", status=500, html="")]
        mocks = _all_mocks()
        mocks["fetch_sitemap"] = AsyncMock(return_value=[p.url for p in pages])
        mocks["fetch_pages"] = AsyncMock(return_value=pages)
        mocks["filter_unchanged"] = AsyncMock(return_value=[])
        mocks["extract_batch"] = MagicMock(return_value=[])

        with self._patch_pipeline(mocks):
            stats = asyncio.run(ingest_platform("telnyx"))

        assert stats["errors"] >= 1
        assert isinstance(stats["error_details"], list)
        assert len(stats["error_details"]) >= 1

    def test_pg_upsert_failure_logged(self):
        """WB — PG upsert_pages_batch failure is logged but pipeline doesn't crash."""
        mocks = _all_mocks()
        # Both batch and retry raise — simulates total PG failure
        mocks["upsert_pages_batch"] = MagicMock(side_effect=Exception("PG connection lost"))

        with self._patch_pipeline(mocks):
            # Must not raise
            stats = asyncio.run(ingest_platform("telnyx"))

        # Error recorded but pipeline finished
        assert stats["errors"] >= 1
        assert any("pg_upsert" in d.get("step", "") for d in stats["error_details"])

    def test_qdrant_upsert_failure_logged(self):
        """WB — Qdrant upsert failure is logged but pipeline doesn't crash."""
        mocks = _all_mocks()
        mocks["upsert_vectors"] = MagicMock(side_effect=Exception("Qdrant timeout"))

        with self._patch_pipeline(mocks):
            stats = asyncio.run(ingest_platform("telnyx"))

        assert stats["errors"] >= 1
        assert any("qdrant" in d.get("step", "") for d in stats["error_details"])


# ──────────────────────────────────────────────────────────────────────────────
# Class TestWhiteBox
# ──────────────────────────────────────────────────────────────────────────────

class TestWhiteBox:
    """White-box tests for internal behaviour and guarantees.

    The three full-pipeline tests share the same 18 orchestrator patches;
    ``_patch_pipeline`` centralises that list (previously copy-pasted as
    three 18-line ``with patch(...)`` stacks).
    """

    # Every orchestrator dependency that must be patched for a full run.
    _PIPELINE_KEYS = (
        "get_platform", "fetch_sitemap", "fetch_pages", "filter_urls",
        "filter_unchanged", "extract_batch", "chunk_batch", "tag_chunks",
        "embed_batch", "upsert_vectors", "ensure_schema", "get_connection",
        "upsert_page", "upsert_pages_batch", "log_ingestion_start",
        "log_ingestion_complete", "get_content_hashes", "compute_content_hash",
    )

    @classmethod
    def _patch_pipeline(cls, mocks):
        """Return a context manager applying patch(_PATCH[k], mocks[k]) for
        every pipeline key.  ExitStack unwinds all patches on exit."""
        from contextlib import ExitStack  # local import: module header not touched
        stack = ExitStack()
        for key in cls._PIPELINE_KEYS:
            stack.enter_context(patch(_PATCH[key], mocks[key]))
        return stack

    def test_finalize_stats_stamps_duration(self):
        """WB — _finalize_stats sets duration_seconds > 0."""
        import time
        stats = {"platform": "test"}
        t_start = time.time() - 1.5  # simulate 1.5s elapsed
        _finalize_stats(stats, t_start)
        assert stats["duration_seconds"] >= 1.0

    def test_finalize_stats_status_default(self):
        """WB — _finalize_stats sets status='completed' if not already set."""
        import time
        stats = {}
        _finalize_stats(stats, time.time())
        assert stats["status"] == "completed"

    def test_finalize_stats_preserves_existing_status(self):
        """WB — _finalize_stats does not overwrite an existing status."""
        import time
        stats = {"status": "failed"}
        _finalize_stats(stats, time.time())
        assert stats["status"] == "failed"

    def test_report_progress_stderr(self, capsys):
        """WB — _report_progress output goes to stderr, not stdout."""
        _report_progress("test_step", 1, 1)
        captured = capsys.readouterr()
        assert captured.out == ""          # nothing on stdout
        assert "test_step" in captured.err

    def test_stats_has_error_details_list(self):
        """WB — Stats always contains error_details as a list, even on success."""
        mocks = _all_mocks()

        with self._patch_pipeline(mocks):
            stats = asyncio.run(ingest_platform("telnyx"))

        assert isinstance(stats["error_details"], list)

    def test_ingest_url_returns_dict(self):
        """WB — ingest_url always returns a dict with required keys."""
        url = "https://developers.telnyx.com/docs/voice"
        page = _make_fetched_page(url)
        extracted = _make_extracted(url)
        chunk = _make_chunk(url)
        embedded = _make_embedded(url)

        # ingest_url uses a smaller dependency set than ingest_platform
        # (no sitemap/hash/ingestion-log calls), so it keeps a literal stack.
        with patch(_PATCH["get_platform"], return_value=_TELNYX_CONFIG), \
             patch(_PATCH["fetch_pages"], AsyncMock(return_value=[page])), \
             patch(_PATCH["extract_batch"], return_value=[extracted]), \
             patch(_PATCH["chunk_batch"], return_value=[chunk]), \
             patch(_PATCH["tag_chunks"], return_value=[chunk]), \
             patch(_PATCH["embed_batch"], return_value=[embedded]), \
             patch(_PATCH["upsert_vectors"], return_value=1), \
             patch(_PATCH["ensure_schema"]), \
             patch(_PATCH["get_connection"], return_value=_mock_conn()), \
             patch(_PATCH["upsert_page"], return_value=1), \
             patch(_PATCH["upsert_pages_batch"], return_value=1), \
             patch(_PATCH["compute_content_hash"], return_value="abc"):

            stats = asyncio.run(ingest_url(url, "telnyx"))

        required = {"pages_fetched", "vectors_upserted", "errors",
                    "error_details", "duration_seconds"}
        assert required.issubset(set(stats.keys()))

    def test_platform_name_normalised(self):
        """WB — platform name in returned stats matches config.name (lower-case)."""
        mocks = _all_mocks()

        with self._patch_pipeline(mocks):
            stats = asyncio.run(ingest_platform("telnyx"))

        assert stats["platform"] == "telnyx"

    def test_log_complete_called(self):
        """WB — log_ingestion_complete is called exactly once after successful run."""
        mocks = _all_mocks()

        with self._patch_pipeline(mocks):
            asyncio.run(ingest_platform("telnyx"))

        mocks["log_ingestion_complete"].assert_called_once()


# ──────────────────────────────────────────────────────────────────────────────
# Entry point
# ──────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Allow running this test module directly; SystemExit carries pytest's
    # return code just as sys.exit() would.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))


# VERIFICATION_STAMP
# Story: 8.06
# Verified By: parallel-builder
# Verified At: 2026-02-26
# Tests: 28/28
# Coverage: 100%
