#!/usr/bin/env python3
"""
PONTE GOLDMINE 1000 - TEST SUITE
================================

PRD: GENESIS-PONTE-001

Black Box + White Box tests for the Ponte Goldmine extraction pipeline.

Test Categories:
1. Schema Validation - Entry structure correctness
2. Category Coverage - All 10 categories populated
3. Retrieval Tests - Semantic search and filtering
4. Integration Tests - PostgreSQL, Qdrant, Redis, KNOWLEDGE_GRAPH
5. Quality Tests - Deduplication and linking
"""

import json
import sys
import os
from pathlib import Path
from typing import Dict, List
import pytest

# Add genesis paths
GENESIS_ROOT = Path("/mnt/e/genesis-system")
sys.path.insert(0, str(GENESIS_ROOT))
sys.path.insert(0, str(GENESIS_ROOT / "data" / "genesis-memory"))

# Test data paths
KG_PONTE_DIR = GENESIS_ROOT / "KNOWLEDGE_GRAPH" / "ponte"
GOLDMINE_JSON = GENESIS_ROOT / "data" / "ponte_goldmine" / "goldmine_1000.json"


class TestPonteGoldmineSchema:
    """Black Box Tests: Entry schema validation.

    Every test streams KNOWLEDGE_GRAPH/ponte/all_entries.jsonl and asserts a
    per-entry structural invariant.
    """

    @staticmethod
    def _iter_entries():
        """Yield each parsed entry dict from the master all_entries.jsonl file."""
        with open(KG_PONTE_DIR / "all_entries.jsonl", 'r') as f:
            for line in f:
                yield json.loads(line)

    def test_entry_has_required_fields(self):
        """Each entry must have all required fields."""
        required_fields = [
            "id", "category", "quote", "context", "source_video",
            "source_title", "timestamp", "use_case_tags", "related_entries",
            "confidence", "extracted_at"
        ]

        for entry in self._iter_entries():
            for field in required_fields:
                assert field in entry, f"Missing field: {field}"

    def test_entry_id_format(self):
        """Entry IDs must follow ponte_XXXX format."""
        for entry in self._iter_entries():
            assert entry["id"].startswith("ponte_"), f"Invalid ID format: {entry['id']}"
            # NOTE(review): 18 chars means a 12-char suffix after "ponte_",
            # which disagrees with the "ponte_XXXX" wording above — confirm
            # the intended ID format against the extractor.
            assert len(entry["id"]) == 18, f"ID wrong length: {entry['id']}"

    def test_quote_not_empty(self):
        """Quotes must not be empty."""
        for entry in self._iter_entries():
            assert len(entry["quote"]) > 5, f"Quote too short: {entry['quote']}"

    def test_confidence_in_range(self):
        """Confidence must be between 0 and 1."""
        for entry in self._iter_entries():
            assert 0 <= entry["confidence"] <= 1, f"Invalid confidence: {entry['confidence']}"

    def test_category_is_valid(self):
        """Category must be one of the 10 defined categories."""
        valid_categories = {
            "sales_scripts", "objection_handlers", "pricing_strategies",
            "cold_openers", "email_templates", "closing_techniques",
            "mindset_frameworks", "tool_tactics", "niche_insights", "axioms"
        }

        for entry in self._iter_entries():
            assert entry["category"] in valid_categories, f"Invalid category: {entry['category']}"


class TestPonteGoldmineCoverage:
    """Black Box Tests: Category coverage."""

    # The 10 extraction categories. Hoisted to a class attribute so the two
    # per-category tests below can't drift out of sync with each other.
    CATEGORIES = [
        "sales_scripts", "objection_handlers", "pricing_strategies",
        "cold_openers", "email_templates", "closing_techniques",
        "mindset_frameworks", "tool_tactics", "niche_insights", "axioms"
    ]

    def test_all_categories_have_entries(self):
        """All 10 categories must have at least some entries."""
        for cat in self.CATEGORIES:
            cat_file = KG_PONTE_DIR / f"{cat}.jsonl"
            assert cat_file.exists(), f"Missing category file: {cat}"

            # Count lines lazily instead of materializing the whole file.
            with open(cat_file, 'r') as f:
                count = sum(1 for _ in f)
            assert count >= 1, f"Category {cat} has no entries"

    def test_target_entry_count(self):
        """Total entries should meet or exceed target."""
        TARGET = 500  # Minimum acceptable (50% of 1000)

        with open(KG_PONTE_DIR / "all_entries.jsonl", 'r') as f:
            count = sum(1 for _ in f)

        assert count >= TARGET, f"Only {count} entries, target is {TARGET}"

    def test_category_distribution(self):
        """Categories should have reasonable distribution."""
        MIN_PER_CATEGORY = 10  # At least 10 entries per category

        for cat in self.CATEGORIES:
            with open(KG_PONTE_DIR / f"{cat}.jsonl", 'r') as f:
                count = sum(1 for _ in f)
            assert count >= MIN_PER_CATEGORY, f"Category {cat} has only {count} entries"


class TestPonteGoldmineRetrieval:
    """Black Box Tests: Retrieval functionality."""

    def test_search_by_category(self):
        """Can filter entries by category."""
        # Load sales scripts; every entry in the per-category file must carry
        # that category.
        sales_scripts = []
        with open(KG_PONTE_DIR / "sales_scripts.jsonl", 'r') as f:
            for line in f:
                entry = json.loads(line)
                sales_scripts.append(entry)
                assert entry["category"] == "sales_scripts"

        assert len(sales_scripts) >= 10, "Should have enough sales scripts"

    def test_search_by_source_video(self):
        """Can filter entries by source video."""
        with open(KG_PONTE_DIR / "all_entries.jsonl", 'r') as f:
            entries = [json.loads(line) for line in f]

        # Group by source video.
        by_video = {}
        for entry in entries:
            by_video.setdefault(entry["source_video"], []).append(entry)

        # Multiple videos should have entries
        assert len(by_video) >= 5, f"Only {len(by_video)} videos have entries"

    def test_related_entries_exist(self):
        """Related entries should reference valid IDs."""
        # Single pass: parse each line once and keep the full entry list;
        # the ID set is derived from it (the original parsed every line
        # twice and then re-read the whole file a second time).
        with open(KG_PONTE_DIR / "all_entries.jsonl", 'r') as f:
            entries_list = [json.loads(line) for line in f]

        all_ids = {entry["id"] for entry in entries_list}

        for entry in entries_list:
            for related_id in entry.get("related_entries", []):
                assert related_id in all_ids, f"Related entry {related_id} not found"


class TestPonteGoldmineIntegration:
    """White Box Tests: Integration with Genesis infrastructure.

    Each backend is probed independently in the fixture; tests skip rather
    than fail when a backend is unreachable.
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        """Open connections before each test; close them afterwards.

        Narrow `except Exception` (instead of bare `except:`) so that
        KeyboardInterrupt/SystemExit still propagate. Failures here are
        deliberate best-effort: the `*_available` flags drive pytest.skip
        in the tests themselves.
        """
        self.pg_available = False
        self.qdrant_available = False
        self.redis_available = False

        try:
            from elestio_config import PostgresConfig
            import psycopg2
            self.pg_conn = psycopg2.connect(**PostgresConfig.get_connection_params())
            self.pg_available = True
        except Exception:
            pass

        try:
            from qdrant_client import QdrantClient
            from elestio_config import QdrantConfig
            self.qdrant = QdrantClient(host=QdrantConfig.host, port=QdrantConfig.port)
            self.qdrant_available = True
        except Exception:
            pass

        try:
            import redis
            from elestio_config import RedisConfig
            self.redis_client = redis.Redis(host=RedisConfig.host, port=RedisConfig.port)
            self.redis_client.ping()
            self.redis_available = True
        except Exception:
            pass

        yield

        # Teardown: release whatever was opened (the original fixture leaked
        # one connection per test). Best-effort, matching the setup style.
        if self.pg_available:
            try:
                self.pg_conn.close()
            except Exception:
                pass
        if self.redis_available:
            try:
                self.redis_client.close()
            except Exception:
                pass

    def test_postgresql_entries_exist(self):
        """PostgreSQL should have ponte_goldmine entries."""
        if not self.pg_available:
            pytest.skip("PostgreSQL not available")

        cursor = self.pg_conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM ponte_goldmine")
        count = cursor.fetchone()[0]
        cursor.close()

        assert count >= 100, f"PostgreSQL has only {count} entries"

    def test_postgresql_category_index(self):
        """PostgreSQL category index should work."""
        if not self.pg_available:
            pytest.skip("PostgreSQL not available")

        cursor = self.pg_conn.cursor()
        cursor.execute("""
            SELECT category, COUNT(*)
            FROM ponte_goldmine
            GROUP BY category
        """)
        results = cursor.fetchall()
        cursor.close()

        # Should have entries in multiple categories
        assert len(results) >= 5, "Not enough categories in PostgreSQL"

    def test_redis_cache_exists(self):
        """Redis should have cached entries."""
        if not self.redis_available:
            pytest.skip("Redis not available")

        # Check for ponte keys
        keys = self.redis_client.keys("ponte:*")
        assert len(keys) >= 50, f"Redis has only {len(keys)} ponte keys"

    def test_knowledge_graph_files_valid(self):
        """All KNOWLEDGE_GRAPH files should be valid JSON."""
        for jsonl_file in KG_PONTE_DIR.glob("*.jsonl"):
            with open(jsonl_file, 'r') as f:
                for line_num, line in enumerate(f, 1):
                    try:
                        json.loads(line)
                    except json.JSONDecodeError as e:
                        pytest.fail(f"Invalid JSON in {jsonl_file.name} line {line_num}: {e}")


class TestPonteGoldmineQuality:
    """White Box Tests: Data quality."""

    def test_no_duplicate_ids(self):
        """Entry IDs must be unique."""
        with open(KG_PONTE_DIR / "all_entries.jsonl", 'r') as f:
            ids = [json.loads(line)["id"] for line in f]

        assert len(ids) == len(set(ids)), "Duplicate IDs found"

    def test_no_duplicate_quotes(self):
        """Quotes should be mostly unique (allow some overlap)."""
        # Compare on a normalized 100-char prefix so near-identical quotes
        # count as duplicates.
        with open(KG_PONTE_DIR / "all_entries.jsonl", 'r') as f:
            quotes = [json.loads(line)["quote"][:100].lower() for line in f]

        total_quotes = len(quotes)
        # Guard: an empty file would otherwise raise ZeroDivisionError below
        # instead of producing a readable assertion failure.
        assert total_quotes > 0, "No quotes found in all_entries.jsonl"

        unique_quotes = len(set(quotes))
        uniqueness_ratio = unique_quotes / total_quotes

        assert uniqueness_ratio >= 0.8, f"Only {uniqueness_ratio:.0%} unique quotes"

    def test_quotes_have_substance(self):
        """Quotes should have meaningful content."""
        MIN_QUOTE_LENGTH = 20
        MIN_WORDS = 4

        with open(KG_PONTE_DIR / "all_entries.jsonl", 'r') as f:
            for line in f:
                entry = json.loads(line)
                quote = entry["quote"]

                # Skip very short entries with high confidence (likely intentional)
                if entry["confidence"] >= 0.9 and len(quote) < MIN_QUOTE_LENGTH:
                    continue

                word_count = len(quote.split())
                assert word_count >= MIN_WORDS or len(quote) >= MIN_QUOTE_LENGTH, \
                    f"Quote too short: {quote}"


class TestPonteGoldmineVerification:
    """Verification stamp tests.

    Both tests are conditional: when the artifact file is absent the test
    passes silently (the extraction may not have been run yet).
    """

    def test_extraction_completed(self):
        """Extraction should have completed successfully."""
        if not GOLDMINE_JSON.exists():
            return

        data = json.loads(GOLDMINE_JSON.read_text())

        assert "extracted_at" in data
        assert "total_entries" in data
        assert data["total_entries"] >= 100, f"Only {data['total_entries']} entries"

    def test_verification_stamp_present(self):
        """PRD verification stamp should be in output."""
        log_file = GENESIS_ROOT / "data" / "ponte_goldmine" / "extraction_log.txt"
        if not log_file.exists():
            return

        content = log_file.read_text()

        assert "VERIFICATION_STAMP" in content
        assert "GENESIS-PONTE-001" in content


def run_tests():
    """Run the full suite in a pytest subprocess and print its output.

    Returns:
        bool: True when every test passed (pytest exit code 0).
    """
    import subprocess

    # Use the interpreter executing this script rather than whatever
    # "python" resolves to on PATH — keeps the subprocess inside the same
    # virtualenv/installation as the caller.
    result = subprocess.run(
        [sys.executable, "-m", "pytest", __file__, "-v", "--tb=short"],
        capture_output=True,
        text=True
    )

    print(result.stdout)
    print(result.stderr)

    return result.returncode == 0


if __name__ == "__main__":
    # Quick test run
    print("="*70)
    print("PONTE GOLDMINE TEST SUITE")
    print("="*70)

    # Check if extraction files exist
    if not KG_PONTE_DIR.exists():
        print("ERROR: KNOWLEDGE_GRAPH/ponte directory not found")
        print("Run the extraction first!")
        sys.exit(1)

    # Run tests
    success = run_tests()

    if success:
        print("\n" + "="*70)
        print("ALL TESTS PASSED")
        print("="*70)
    else:
        print("\n" + "="*70)
        print("SOME TESTS FAILED")
        print("="*70)
