"""
Comprehensive test suite for EntityExtractor (Story KG-002)

This test suite covers:
- Black-box tests: Test from outside without implementation knowledge
- White-box tests: Test internal methods and code paths
- Edge cases: Empty input, malformed text, unicode, etc.
- Multi-source extraction: markdown, code, conversations
- Confidence scoring validation
- NER vs regex fallback behavior

Author: Genesis System
Date: 2026-01-25
Story: KG-002
"""

import unittest
import sys
from pathlib import Path

# Add the directory above this file's folder to sys.path so the `core` package import below resolves
sys.path.append(str(Path(__file__).parent.parent))

from core.knowledge.entity_extractor import EntityExtractor


class TestEntityExtractorBlackBox(unittest.TestCase):
    """
    BLACK-BOX TESTS
    Exercise the public EntityExtractor methods purely through their inputs
    and outputs. Each test maps to an acceptance criterion or an input edge
    case; no private methods are called from this class.
    """

    def setUp(self):
        """Build an extractor configured with known system and API names."""
        self.extractor = EntityExtractor(
            system_names=["Genesis", "AIVA", "Claude"],
            api_names=["create_user", "delete_user", "extract_entities"]
        )

    def test_extract_basic_entities(self):
        """Test basic entity extraction from simple text."""
        text = "The Genesis system uses the create_user API."
        entities = self.extractor.extract_entities(text)

        # Should extract at least system and API names
        self.assertGreater(len(entities), 0, "Should extract entities from text")

        # Both a configured system name and a configured API name appear in
        # the text, so both entity types are expected.
        types = [e['type'] for e in entities]
        self.assertIn('SYSTEM', types, "Should extract SYSTEM entities")
        self.assertIn('API', types, "Should extract API entities")

    def test_extract_from_markdown_document(self):
        """Acceptance Criteria: Extract entities from markdown documents."""
        markdown = """
# Genesis System Documentation

The [Genesis](https://genesis.ai) system was created by Kinan.
Contact: info@genesis.ai

## Features
- Entity extraction
- Knowledge graph building
        """
        entities = self.extractor.extract_from_markdown(markdown)

        # Should extract various entity types from markdown
        self.assertGreater(len(entities), 0, "Should extract entities from markdown")

        types = [e['type'] for e in entities]
        self.assertIn('MARKDOWN_HEADER', types, "Should extract markdown headers")

    def test_extract_from_code_comments(self):
        """Acceptance Criteria: Extract entities from code comments."""
        code = """
# Genesis entity extraction module
# Author: Kinan
# API: extract_entities

def extract_entities(text):
    '''Extract entities from text using NER.'''
    pass

class EntityExtractor:
    pass
        """
        entities = self.extractor.extract_from_code(code, language="python")

        # Should extract from comments and code structure
        self.assertGreater(len(entities), 0, "Should extract entities from code")

        types = [e['type'] for e in entities]
        # Either structural entity type is accepted here; the white-box
        # suite checks functions and classes individually.
        self.assertTrue(
            'FUNCTION' in types or 'CLASS' in types,
            "Should extract code structure entities"
        )

    def test_extract_from_conversation_logs(self):
        """Acceptance Criteria: Extract entities from conversation logs."""
        conversation = """
User: Can you help me with Genesis?
Assistant: Yes, the Genesis system is located at /mnt/e/genesis-system.
Claude: I can help with that. Contact Kinan for more info.
        """
        entities = self.extractor.extract_from_conversation(conversation)

        # Should extract entities from conversation
        self.assertGreater(len(entities), 0, "Should extract entities from conversation")

        types = [e['type'] for e in entities]
        self.assertIn('SPEAKER', types, "Should identify speakers in conversation")

    def test_confidence_scoring_exists(self):
        """Acceptance Criteria: Confidence scoring for extracted entities."""
        text = "Genesis system at info@genesis.ai"
        entities = self.extractor.extract_entities(text)

        # Guard against a vacuous pass: with an empty result the loop below
        # would run zero times and the test would succeed without checking
        # a single confidence value.
        self.assertGreater(len(entities), 0, "Should extract entities to validate confidence")

        # All entities must have confidence scores
        for entity in entities:
            self.assertIn('confidence', entity, "Entity should have confidence field")
            self.assertIsInstance(entity['confidence'], (int, float), "Confidence should be numeric")
            self.assertGreaterEqual(entity['confidence'], 0.0, "Confidence should be >= 0.0")
            self.assertLessEqual(entity['confidence'], 1.0, "Confidence should be <= 1.0")

    def test_standardized_output_format(self):
        """Test that all entities have standardized fields."""
        text = "Genesis uses create_user API. Contact: info@genesis.ai"
        entities = self.extractor.extract_entities(text)

        required_fields = ['type', 'name', 'confidence', 'source', 'location', 'method']

        # Guard against a vacuous pass: an empty result would skip every
        # field check below.
        self.assertGreater(len(entities), 0, "Should extract entities to validate format")

        for entity in entities:
            for field in required_fields:
                self.assertIn(field, entity, f"Entity should have '{field}' field")

    def test_empty_input_handling(self):
        """Edge case: Handle empty input gracefully."""
        entities = self.extractor.extract_entities("")
        self.assertEqual(len(entities), 0, "Empty input should return empty list")

    def test_malformed_text_handling(self):
        """Edge case: Handle malformed/garbage text."""
        malformed = "!@#$%^&*()_+{}|:<>?~`"
        entities = self.extractor.extract_entities(malformed)
        # Should not crash, may return empty or minimal results
        self.assertIsInstance(entities, list, "Should return list even for malformed input")

    def test_unicode_text_handling(self):
        """Edge case: Handle unicode characters."""
        unicode_text = "Genesis système créé par Kinan 日本語"
        entities = self.extractor.extract_entities(unicode_text)
        # Should not crash with unicode
        self.assertIsInstance(entities, list, "Should handle unicode text")

    def test_very_long_text_handling(self):
        """Edge case: Handle very long text."""
        long_text = "Genesis system. " * 10000  # 160,000 characters
        entities = self.extractor.extract_entities(long_text)
        # Should extract entities even from long text
        self.assertGreater(len(entities), 0, "Should handle long text")


class TestEntityExtractorWhiteBox(unittest.TestCase):
    """
    WHITE-BOX TESTS
    Test internal implementation details and code paths.
    Calls private helpers (_extract_with_regex, _extract_with_spacy,
    _deduplicate_entities) directly and checks per-pattern behaviour.
    """

    def setUp(self):
        """Build an extractor that requests spaCy alongside regex extraction."""
        self.extractor = EntityExtractor(
            system_names=["Genesis"],
            api_names=["test_api"],
            use_spacy=True  # Try to use spaCy
        )

    def test_spacy_fallback_mechanism(self):
        """Test spaCy fallback to regex when unavailable."""
        # Create extractor with spaCy disabled
        extractor_no_spacy = EntityExtractor(
            system_names=["Genesis"],
            use_spacy=False
        )

        self.assertFalse(extractor_no_spacy.use_spacy, "spaCy should be disabled")
        self.assertIsNone(extractor_no_spacy.nlp, "NLP model should be None")

        # Should still extract using regex
        text = "Genesis system"
        entities = extractor_no_spacy.extract_entities(text)
        self.assertGreater(len(entities), 0, "Should extract with regex fallback")

    def test_extract_with_regex_method(self):
        """White-box: Test _extract_with_regex method directly."""
        text = "Genesis system at /mnt/e/genesis-system. Email: test@example.com"
        entities = self.extractor._extract_with_regex(text, source="test")

        # Check that regex extraction returns entities
        self.assertGreater(len(entities), 0, "Regex should extract entities")

        # Verify all entities have required fields
        for entity in entities:
            self.assertIn('type', entity)
            self.assertIn('name', entity)
            self.assertIn('confidence', entity)
            self.assertIn('method', entity)
            self.assertEqual(entity['method'], 'regex', "Method should be 'regex'")

    def test_spacy_extraction_method(self):
        """White-box: Test _extract_with_spacy method if available."""
        # NOTE(review): relies on the extractor leaving use_spacy falsy when
        # spaCy could not be loaded -- confirm against EntityExtractor.
        if not self.extractor.use_spacy:
            self.skipTest("spaCy not available")

        text = "John Doe works at Microsoft in Seattle."
        entities = self.extractor._extract_with_spacy(text, source="test")

        # The sentence contains a person, an organisation and a place, so an
        # empty result means the spaCy path is broken. Without this guard
        # the per-entity checks below would be skipped and the test would
        # pass without verifying anything.
        self.assertGreater(len(entities), 0, "spaCy should extract named entities")

        for entity in entities:
            self.assertEqual(entity['method'], 'spacy', "Method should be 'spacy'")
            self.assertGreater(entity['confidence'], 0.0, "Confidence should be > 0")

    def test_deduplication_logic(self):
        """White-box: Test _deduplicate_entities method."""
        # Same entity in three casings with different confidence values
        entities = [
            {'type': 'SYSTEM', 'name': 'Genesis', 'confidence': 0.9, 'location': '0:7', 'source': 'test', 'method': 'regex'},
            {'type': 'SYSTEM', 'name': 'genesis', 'confidence': 0.7, 'location': '0:7', 'source': 'test', 'method': 'regex'},
            {'type': 'SYSTEM', 'name': 'GENESIS', 'confidence': 0.8, 'location': '0:7', 'source': 'test', 'method': 'regex'},
        ]

        deduplicated = self.extractor._deduplicate_entities(entities)

        # Should keep only one (highest confidence)
        self.assertEqual(len(deduplicated), 1, "Should remove duplicates")
        self.assertEqual(deduplicated[0]['confidence'], 0.9, "Should keep highest confidence")

    def test_confidence_scoring_ranges(self):
        """White-box: Validate confidence scoring for different entity types."""
        text = "Genesis system API: create_user. File: /test/file.py. Password: secret123. John Doe."

        entities = self.extractor._extract_with_regex(text, source="test")

        # Group every confidence value under its entity type
        confidence_by_type = {}
        for entity in entities:
            confidence_by_type.setdefault(entity['type'], []).append(entity['confidence'])

        # "Genesis" is a configured system name and appears in the text, so a
        # missing SYSTEM entity is a failure rather than a reason to skip the
        # confidence check.
        self.assertIn('SYSTEM', confidence_by_type, "Should extract SYSTEM for configured name")

        # Known entities (SYSTEM, API) should have high confidence (0.95).
        # Every match is checked, not only the first one of each type.
        for confidence in confidence_by_type['SYSTEM']:
            self.assertAlmostEqual(confidence, 0.95, msg="SYSTEM should have 0.95 confidence")

        # "create_user" is not in this fixture's api_names, so API entities
        # are only validated when the extractor reports any.
        for confidence in confidence_by_type.get('API', []):
            self.assertAlmostEqual(confidence, 0.95, msg="API should have 0.95 confidence")

    def test_regex_patterns_for_emails(self):
        """White-box: Test email extraction pattern."""
        text = "Contact: user@example.com, admin@test.org"
        entities = self.extractor._extract_with_regex(text, source="test")

        email_entities = [e for e in entities if e['type'] == 'EMAIL']
        self.assertGreaterEqual(len(email_entities), 2, "Should extract 2 emails")

    def test_regex_patterns_for_urls(self):
        """White-box: Test URL extraction pattern."""
        text = "Visit https://genesis.ai or http://www.example.com"
        entities = self.extractor._extract_with_regex(text, source="test")

        url_entities = [e for e in entities if e['type'] == 'URL']
        self.assertGreaterEqual(len(url_entities), 2, "Should extract 2 URLs")

    def test_regex_patterns_for_files(self):
        """White-box: Test file path extraction pattern."""
        # One POSIX-style path and one Windows-style path
        text = "/mnt/e/genesis-system/test.py and C:\\Users\\test\\file.txt"
        entities = self.extractor._extract_with_regex(text, source="test")

        file_entities = [e for e in entities if e['type'] == 'FILE']
        self.assertGreaterEqual(len(file_entities), 2, "Should extract 2 file paths")

    def test_code_extraction_functions(self):
        """White-box: Test function extraction from code."""
        code = """
def test_function():
    pass

def another_function(param):
    return param
        """
        entities = self.extractor.extract_from_code(code, language="python")

        function_entities = [e for e in entities if e['type'] == 'FUNCTION']
        self.assertEqual(len(function_entities), 2, "Should extract 2 functions")

        function_names = [e['name'] for e in function_entities]
        self.assertIn('test_function', function_names, "Should extract test_function")
        self.assertIn('another_function', function_names, "Should extract another_function")

    def test_code_extraction_classes(self):
        """White-box: Test class extraction from code."""
        code = """
class TestClass:
    pass

class AnotherClass(BaseClass):
    def method(self):
        pass
        """
        entities = self.extractor.extract_from_code(code, language="python")

        class_entities = [e for e in entities if e['type'] == 'CLASS']
        self.assertEqual(len(class_entities), 2, "Should extract 2 classes")

        class_names = [e['name'] for e in class_entities]
        self.assertIn('TestClass', class_names, "Should extract TestClass")
        self.assertIn('AnotherClass', class_names, "Should extract AnotherClass")

    def test_markdown_link_extraction(self):
        """White-box: Test markdown link extraction."""
        markdown = "[Genesis](https://genesis.ai) and [Test](http://test.com)"
        entities = self.extractor.extract_from_markdown(markdown)

        link_entities = [e for e in entities if e['type'] == 'MARKDOWN_LINK']
        self.assertEqual(len(link_entities), 2, "Should extract 2 markdown links")

    def test_markdown_header_extraction(self):
        """White-box: Test markdown header extraction."""
        markdown = """
# Header 1
## Header 2
### Header 3
        """
        entities = self.extractor.extract_from_markdown(markdown)

        header_entities = [e for e in entities if e['type'] == 'MARKDOWN_HEADER']
        self.assertEqual(len(header_entities), 3, "Should extract 3 headers")

    def test_conversation_speaker_extraction(self):
        """White-box: Test speaker identification in conversations."""
        conversation = """
User: Hello
Assistant: Hi there
Claude: How can I help?
        """
        entities = self.extractor.extract_from_conversation(conversation)

        speaker_entities = [e for e in entities if e['type'] == 'SPEAKER']
        self.assertEqual(len(speaker_entities), 3, "Should extract 3 speakers")

        speaker_names = [e['name'] for e in speaker_entities]
        self.assertIn('User', speaker_names, "Should extract User speaker")
        self.assertIn('Assistant', speaker_names, "Should extract Assistant speaker")
        self.assertIn('Claude', speaker_names, "Should extract Claude speaker")


class TestEntityExtractorIntegration(unittest.TestCase):
    """
    INTEGRATION TESTS
    Drive whole extraction workflows end to end, each test building its own
    extractor and feeding it a realistic document.
    """

    def test_end_to_end_markdown_extraction(self):
        """Integration: Run a full markdown document through the extractor."""
        document = """
# Genesis Knowledge Graph System

The Genesis system uses the extract_entities API to process documents.
Created by Kinan, contact at info@genesis.ai.

## Files
- /mnt/e/genesis-system/core/knowledge/entity_extractor.py

Visit [Genesis](https://genesis.ai) for more info.
        """

        pipeline = EntityExtractor(
            api_names=["extract_entities"],
            system_names=["Genesis", "AIVA"],
        )
        found = pipeline.extract_from_markdown(document)

        # More than three distinct entity types are expected from this document
        distinct_types = {item['type'] for item in found}
        self.assertGreater(len(distinct_types), 3, "Should extract multiple entity types")

        # Every entity carries a confidence within [0.0, 1.0]
        for item in found:
            self.assertIn('confidence', item, "All entities should have confidence")
            score = item['confidence']
            self.assertGreaterEqual(score, 0.0)
            self.assertLessEqual(score, 1.0)

    def test_end_to_end_code_extraction(self):
        """Integration: Run a Python source snippet through the extractor."""
        snippet = """
# Genesis entity extraction module
# Uses the Genesis system for NER

def extract_entities(text):
    '''
    Extract entities from text.

    Args:
        text: Input text to process

    Returns:
        List of entities
    '''
    return []

class EntityExtractor:
    '''Main entity extraction class.'''
    pass
        """

        pipeline = EntityExtractor(system_names=["Genesis"])
        found = pipeline.extract_from_code(snippet, language="python")

        # At least one structural entity type (function or class) must appear
        distinct_types = {item['type'] for item in found}
        has_structure = not distinct_types.isdisjoint({'FUNCTION', 'CLASS'})
        self.assertTrue(has_structure, "Should extract code structure")

    def test_mixed_extraction_methods(self):
        """Integration: Check that regex extraction works with spaCy requested."""
        pipeline = EntityExtractor(use_spacy=True, system_names=["Genesis"])

        sentence = "Genesis system created by John Doe at info@genesis.ai"
        found = pipeline.extract_entities(sentence)

        # Collect the extraction methods that contributed entities
        methods_used = {item['method'] for item in found}

        # Regex is the baseline and must always contribute. A 'spacy' method
        # is deliberately not asserted because it depends on the environment.
        self.assertIn('regex', methods_used, "Should have regex extraction")


class TestEntityExtractorEdgeCases(unittest.TestCase):
    """
    EDGE CASE TESTS
    Test boundary conditions and unusual inputs against an extractor built
    with default arguments (individual tests build their own when they need
    configured system names).
    """

    def setUp(self):
        """Build an extractor with default configuration."""
        self.extractor = EntityExtractor()

    def test_none_input_handling(self):
        """Edge case: None as input must be rejected with an input error."""
        # A blanket assertRaises(Exception) would also pass on unrelated bugs
        # inside the extractor (a NameError from a typo, for instance), so
        # only the errors that rejecting a non-string input can produce are
        # accepted.
        with self.assertRaises((TypeError, AttributeError, ValueError)):
            self.extractor.extract_entities(None)

    def test_numeric_only_input(self):
        """Edge case: Only numbers."""
        entities = self.extractor.extract_entities("123456789")
        self.assertIsInstance(entities, list, "Should return list")

    def test_special_characters_only(self):
        """Edge case: Only special characters."""
        entities = self.extractor.extract_entities("!@#$%^&*()")
        self.assertIsInstance(entities, list, "Should return list")

    def test_whitespace_only_input(self):
        """Edge case: Only whitespace."""
        entities = self.extractor.extract_entities("   \n\t\r  ")
        self.assertEqual(len(entities), 0, "Whitespace should yield no entities")

    def test_single_character_input(self):
        """Edge case: Single character."""
        entities = self.extractor.extract_entities("a")
        self.assertIsInstance(entities, list, "Should return list")

    def test_repeated_entities(self):
        """Edge case: Same entity repeated many times."""
        extractor = EntityExtractor(system_names=["Genesis"])
        text = "Genesis " * 100
        entities = extractor.extract_entities(text)

        # 100 occurrences in the text must collapse to a single entity
        genesis_entities = [e for e in entities if e['name'] == 'Genesis']
        self.assertEqual(len(genesis_entities), 1, "Should deduplicate repeated entities")

    def test_overlapping_patterns(self):
        """Edge case: Overlapping regex patterns."""
        text = "user@example.com is at https://example.com"
        entities = self.extractor.extract_entities(text)

        # Should extract both email and URL even though they share domain
        types = [e['type'] for e in entities]
        self.assertIn('EMAIL', types, "Should extract email")
        self.assertIn('URL', types, "Should extract URL")

    def test_case_sensitivity(self):
        """Edge case: Different cases of same entity."""
        extractor = EntityExtractor(system_names=["Genesis"])
        text = "Genesis GENESIS genesis GeNeSiS"
        entities = extractor.extract_entities(text)

        # Compare names case-insensitively so any casing counts as a match
        genesis_entities = [
            e for e in entities
            if e['type'] == 'SYSTEM' and e['name'].lower() == 'genesis'
        ]
        # After deduplication, should have 1
        self.assertEqual(len(genesis_entities), 1, "Should handle case-insensitive matching")


def run_tests():
    """Load every test case class in this module, run them verbosely, and
    return the resulting unittest result object."""
    # Order here is the order in which the classes are executed
    case_classes = (
        TestEntityExtractorBlackBox,
        TestEntityExtractorWhiteBox,
        TestEntityExtractorIntegration,
        TestEntityExtractorEdgeCases,
    )

    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    for case_class in case_classes:
        suite.addTests(loader.loadTestsFromTestCase(case_class))

    # verbosity=2 prints one line per test
    return unittest.TextTestRunner(verbosity=2).run(suite)


if __name__ == '__main__':
    # Script entry point: run the full suite, print a summary, and exit with
    # a non-zero status when the run was not successful.
    print("\n" + "="*80)
    print("ENTITY EXTRACTOR TEST SUITE - Story KG-002")
    print("="*80 + "\n")

    result = run_tests()

    failed = len(result.failures)
    errored = len(result.errors)
    # testsRun includes skipped tests (e.g. the spaCy test when spaCy is not
    # available); they verified nothing, so they are not counted as successes.
    skipped = len(result.skipped)
    executed = result.testsRun - skipped
    passed = executed - failed - errored
    # Guard the division so an empty or fully skipped run reports 0.0%
    # instead of raising ZeroDivisionError.
    success_rate = (passed / executed * 100) if executed else 0.0

    print("\n" + "="*80)
    print("TEST SUMMARY")
    print("="*80)
    print(f"Tests run: {result.testsRun}")
    print(f"Successes: {passed}")
    print(f"Failures: {failed}")
    print(f"Errors: {errored}")
    print(f"Skipped: {skipped}")
    print(f"Success Rate: {success_rate:.1f}%")
    print("="*80 + "\n")

    # Exit with appropriate code
    sys.exit(0 if result.wasSuccessful() else 1)