"""
AIVA Knowledge Ingestion Pipeline - PM-029

Ingests new knowledge into AIVA memory.
Extracts entities, axioms, and relationships from documents.
"""

import json
import logging
import os
import re
import zlib
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


@dataclass
class Entity:
    """A named entity extracted from a document (e.g. a system, API, or file path)."""
    entity_id: str  # short derived identifier of the form "e_<number>"
    entity_type: str  # one of the EntityExtractor.PATTERNS keys (SYSTEM, API, FILE, ...)
    name: str  # surface form exactly as matched in the text
    properties: Dict[str, Any] = field(default_factory=dict)  # optional extra attributes
    source_document: str = ""  # name of the document the entity came from
    confidence: float = 1.0  # extraction confidence (pattern-based matches use 0.8)


@dataclass
class Axiom:
    """A subject-predicate-object fact extracted from text (e.g. "Genesis is a system")."""
    axiom_id: str  # short derived identifier of the form "a_<number>"
    subject: str  # capitalized phrase matched as the sentence subject
    predicate: str  # relation keyword: "is", "uses", or "can" (see AxiomExtractor)
    object: str  # object phrase; NOTE: field name shadows builtin `object`, kept for API compatibility
    confidence: float = 1.0  # extraction confidence
    source_document: str = ""  # name of the document the axiom came from


@dataclass
class Relationship:
    """A typed link between two extracted entities, referenced by their entity_id."""
    source_entity: str  # entity_id of the first entity
    target_entity: str  # entity_id of the second entity
    relationship_type: str  # e.g. "mentioned_with" for sentence co-occurrence
    properties: Dict[str, Any] = field(default_factory=dict)  # e.g. {"context": <sentence snippet>}


@dataclass
class IngestionResult:
    """Result of ingesting one document or text snippet.

    Captures everything extracted plus success/error status. `ingested_at`
    is a timezone-aware UTC ISO-8601 timestamp recorded at creation time.
    """
    document_id: str  # short derived identifier ("doc_<n>" or "text_<n>")
    document_path: str  # source file path, or the source name for direct text
    entities: List[Entity]
    axioms: List[Axiom]
    relationships: List[Relationship]
    # datetime.utcnow() is deprecated and yields a naive timestamp; use an
    # explicitly aware UTC timestamp instead.
    ingested_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    success: bool = True  # False when ingestion raised; see `error`
    error: Optional[str] = None  # error message when success is False


class EntityExtractor:
    """Extracts entities from text using regex patterns.

    Patterns are grouped by entity type and compiled once at class-definition
    time. Each match becomes an Entity whose id is derived with zlib.crc32,
    which is deterministic across processes — the builtin hash() is randomized
    per run (PYTHONHASHSEED) and would make entity ids unstable.
    """

    # Common entity patterns (public; kept as raw strings for compatibility).
    PATTERNS = {
        "SYSTEM": [r'\b(Genesis|AIVA|Claude|Gemini|Redis|PostgreSQL|Qdrant)\b'],
        "API": [r'\b(\w+(?:_\w+)*(?:API|api|Skill|skill))\b'],
        "FILE": [r'([A-Za-z]:\\|/)[\w/\\.-]+\.\w+'],
        "CONFIG": [r'\b(\w+(?:_KEY|_TOKEN|_SECRET|_CONFIG))\b'],
        "PERSON": [r'\b([A-Z][a-z]+\s[A-Z][a-z]+)\b'],
        "DOMAIN": [r'\b(\w+\.(?:com|io|ai|dev|org))\b'],
    }

    # Compiled once so extract() does not recompile on every call.
    _COMPILED = {
        entity_type: [re.compile(p) for p in patterns]
        for entity_type, patterns in PATTERNS.items()
    }

    def extract(self, text: str, source: str = "") -> List[Entity]:
        """
        Extract entities from text.

        Args:
            text: Text to extract from
            source: Source document name

        Returns:
            List of extracted Entity objects, de-duplicated case-insensitively
            (the first matching entity type wins for a given name).
        """
        entities = []
        seen = set()

        for entity_type, patterns in self._COMPILED.items():
            for pattern in patterns:
                for match in pattern.finditer(text):
                    # Prefer the first capture group when the pattern has one.
                    name = match.group(1) if match.lastindex else match.group(0)
                    key = name.lower()
                    if key in seen:
                        continue
                    seen.add(key)
                    entities.append(Entity(
                        # crc32 keeps ids stable across runs, unlike hash().
                        entity_id=f"e_{zlib.crc32(name.encode('utf-8')) % 100000}",
                        entity_type=entity_type,
                        name=name,
                        source_document=source,
                        confidence=0.8
                    ))

        return entities


class AxiomExtractor:
    """Extracts subject-predicate-object axioms from text.

    The original implementation repeated the same extraction loop three
    times (for "is", "uses", "can"); the predicates are now data-driven.
    Axiom ids use zlib.crc32 so they are deterministic across processes —
    the builtin hash() is randomized per run (PYTHONHASHSEED).
    """

    # Subject: a capitalized word sequence ("Genesis", "Claude Code", ...).
    _SUBJECT = r'(\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*)'

    # (predicate, compiled pattern) pairs; order matters — output preserves
    # the original ordering: all "is" axioms, then "uses", then "can".
    _PREDICATE_PATTERNS = [
        ("is", re.compile(_SUBJECT + r'\s+is\s+(?:a\s+)?([^.]+)\.')),
        ("uses", re.compile(_SUBJECT + r'\s+uses\s+([^.]+)\.')),
        ("can", re.compile(_SUBJECT + r'\s+can\s+([^.]+)\.')),
    ]

    def extract(self, text: str, source: str = "") -> List[Axiom]:
        """
        Extract axioms from text.

        Args:
            text: Text to extract from
            source: Source document name

        Returns:
            List of extracted Axiom objects
        """
        axioms = []
        for predicate, pattern in self._PREDICATE_PATTERNS:
            for match in pattern.finditer(text):
                axioms.append(Axiom(
                    # crc32 keeps ids stable across runs, unlike hash().
                    axiom_id=f"a_{zlib.crc32(match.group(0).encode('utf-8')) % 100000}",
                    subject=match.group(1),
                    predicate=predicate,
                    object=match.group(2).strip(),
                    source_document=source
                ))
        return axioms


class RelationshipMapper:
    """Derives relationships between entities from sentence co-occurrence."""

    def map_relationships(self, entities: List[Entity], text: str) -> List[Relationship]:
        """
        Map relationships between entities.

        Two entities are linked with a "mentioned_with" relationship whenever
        both of their names appear (case-insensitively, as substrings) within
        the same sentence of the source text.

        Args:
            entities: Extracted entities
            text: Source text

        Returns:
            List of Relationship objects
        """
        # Index entities by lowercased name for case-insensitive lookup.
        by_name = {entity.name.lower(): entity for entity in entities}
        links: List[Relationship] = []

        # Split the text into rough sentences and scan each one.
        for sentence in re.split(r'[.!?]', text):
            lowered = sentence.lower()
            present = [ent for key, ent in by_name.items() if key in lowered]

            # Pair up every distinct couple of co-occurring entities.
            for idx, first in enumerate(present):
                for second in present[idx + 1:]:
                    links.append(Relationship(
                        source_entity=first.entity_id,
                        target_entity=second.entity_id,
                        relationship_type="mentioned_with",
                        properties={"context": sentence[:100]}
                    ))

        return links


class KnowledgeIngestionPipeline:
    """
    Main knowledge ingestion pipeline.

    Coordinates entity/axiom extraction and relationship mapping, keeps an
    in-process log of results, and optionally persists extracted knowledge
    through a MemoryBridge.

    Usage:
        pipeline = KnowledgeIngestionPipeline(memory_bridge)
        result = pipeline.ingest("/path/to/document.md")
    """

    def __init__(self, memory_bridge=None):
        """
        Initialize the pipeline.

        Args:
            memory_bridge: MemoryBridge for storing knowledge (optional; when
                None, results are only kept in the in-process ingestion log)
        """
        self.memory_bridge = memory_bridge
        self.entity_extractor = EntityExtractor()
        self.axiom_extractor = AxiomExtractor()
        self.relationship_mapper = RelationshipMapper()
        self.ingestion_log: List[IngestionResult] = []
        logger.info("KnowledgeIngestionPipeline initialized")

    def ingest(self, document_path: str) -> IngestionResult:
        """
        Ingest a document into knowledge base.

        Args:
            document_path: Path to document

        Returns:
            IngestionResult with extracted knowledge. On failure a result
            with success=False and the error message is returned — this
            method never raises, so one bad document cannot abort a batch.
        """
        path = Path(document_path)
        # crc32 keeps document ids deterministic across runs; the builtin
        # hash() is randomized per process (PYTHONHASHSEED).
        document_id = f"doc_{zlib.crc32(str(path).encode('utf-8')) % 100000}"

        logger.info(f"Ingesting document: {path.name}")

        try:
            text = self._read_document(path)

            entities = self.entity_extractor.extract(text, path.name)
            logger.debug(f"Extracted {len(entities)} entities")

            axioms = self.axiom_extractor.extract(text, path.name)
            logger.debug(f"Extracted {len(axioms)} axioms")

            relationships = self.relationship_mapper.map_relationships(entities, text)
            logger.debug(f"Mapped {len(relationships)} relationships")

            result = IngestionResult(
                document_id=document_id,
                document_path=str(path),
                entities=entities,
                axioms=axioms,
                relationships=relationships
            )

            # Persist extracted knowledge when a bridge is configured.
            if self.memory_bridge:
                self._store_in_rlm(result)

            self.ingestion_log.append(result)
            logger.info(
                f"Ingested {path.name}: "
                f"{len(entities)} entities, {len(axioms)} axioms, {len(relationships)} relationships"
            )

            return result

        except Exception as e:
            # Boundary handler: record the failure as a result instead of
            # propagating; logger.exception includes the traceback.
            logger.exception(f"Ingestion failed: {e}")
            result = IngestionResult(
                document_id=document_id,
                document_path=str(path),
                entities=[],
                axioms=[],
                relationships=[],
                success=False,
                error=str(e)
            )
            self.ingestion_log.append(result)
            return result

    def ingest_text(self, text: str, source_name: str = "direct_input") -> IngestionResult:
        """
        Ingest text directly (not from file).

        Args:
            text: Text content to ingest
            source_name: Name for the source

        Returns:
            IngestionResult
        """
        # Id is derived from the first 100 characters only, so texts sharing
        # a prefix collide; crc32 keeps the id deterministic across runs.
        document_id = f"text_{zlib.crc32(text[:100].encode('utf-8')) % 100000}"

        entities = self.entity_extractor.extract(text, source_name)
        axioms = self.axiom_extractor.extract(text, source_name)
        relationships = self.relationship_mapper.map_relationships(entities, text)

        result = IngestionResult(
            document_id=document_id,
            document_path=source_name,
            entities=entities,
            axioms=axioms,
            relationships=relationships
        )

        if self.memory_bridge:
            self._store_in_rlm(result)

        self.ingestion_log.append(result)
        return result

    def _read_document(self, path: Path) -> str:
        """Read document content as text.

        PDF and Word documents are not yet supported and yield "" with a
        warning; unknown extensions are read as UTF-8 with errors ignored.
        """
        suffix = path.suffix.lower()

        if suffix in ['.txt', '.md', '.py', '.json', '.yaml', '.yml']:
            return path.read_text(encoding='utf-8')
        elif suffix == '.pdf':
            # Would use PyPDF2 or similar
            logger.warning(f"PDF reading not implemented, skipping {path}")
            return ""
        elif suffix in ['.docx', '.doc']:
            # Would use python-docx
            logger.warning(f"Word document reading not implemented, skipping {path}")
            return ""
        else:
            return path.read_text(encoding='utf-8', errors='ignore')

    def _store_in_rlm(self, result: IngestionResult) -> None:
        """Store ingestion result in RLM via the memory bridge (best effort)."""
        try:
            # Store entities
            for entity in result.entities:
                self.memory_bridge.store_memory(
                    content=asdict(entity),
                    memory_type="entity",
                    metadata={
                        "entity_type": entity.entity_type,
                        "name": entity.name,
                        "id": entity.entity_id
                    }
                )

            # NOTE(review): axioms are stored with memory_type="entity" and
            # entity_type="axiom" — confirm this matches the RLM schema.
            for axiom in result.axioms:
                self.memory_bridge.store_memory(
                    content=asdict(axiom),
                    memory_type="entity",
                    metadata={
                        "entity_type": "axiom",
                        "name": f"{axiom.subject} {axiom.predicate} {axiom.object}",
                        "id": axiom.axiom_id
                    }
                )

            logger.debug(f"Stored {len(result.entities)} entities and {len(result.axioms)} axioms in RLM")

        except Exception as e:
            # Storage is deliberately best-effort: failures are logged (with
            # traceback) but never abort ingestion.
            logger.exception(f"Failed to store in RLM: {e}")

    def get_stats(self) -> Dict:
        """Return aggregate counts over everything ingested so far."""
        total = len(self.ingestion_log)
        successful = sum(1 for r in self.ingestion_log if r.success)

        return {
            "documents_ingested": total,
            "successful": successful,
            "failed": total - successful,
            "total_entities": sum(len(r.entities) for r in self.ingestion_log),
            "total_axioms": sum(len(r.axioms) for r in self.ingestion_log),
            "total_relationships": sum(len(r.relationships) for r in self.ingestion_log)
        }


# Module-level singleton instance, created lazily by get_knowledge_pipeline().
_pipeline: Optional[KnowledgeIngestionPipeline] = None


def get_knowledge_pipeline(memory_bridge=None) -> KnowledgeIngestionPipeline:
    """Get or create the module-level singleton pipeline.

    NOTE: `memory_bridge` is only honored on the first call; subsequent
    calls return the existing instance and ignore the argument.
    """
    global _pipeline
    if _pipeline is None:
        _pipeline = KnowledgeIngestionPipeline(memory_bridge)
    return _pipeline


if __name__ == "__main__":
    # Demo: ingest a small text snippet and show what was extracted.
    demo_pipeline = KnowledgeIngestionPipeline()

    demo_text = """
    Genesis is an autonomous AI system. AIVA is the Queen AI of Genesis.
    AIVA uses Claude Code for coding tasks. Genesis uses PostgreSQL for storage.
    The system can process documents and extract knowledge automatically.
    Redis is used for real-time caching. Qdrant handles vector embeddings.
    """

    outcome = demo_pipeline.ingest_text(demo_text, "sample_document")

    print("\nIngestion Result:")
    print(f"  Document ID: {outcome.document_id}")
    print(f"  Success: {outcome.success}")

    # Show at most five of each extracted item kind.
    print(f"\nEntities ({len(outcome.entities)}):")
    for entity in outcome.entities[:5]:
        print(f"  [{entity.entity_type}] {entity.name}")

    print(f"\nAxioms ({len(outcome.axioms)}):")
    for axiom in outcome.axioms[:5]:
        print(f"  {axiom.subject} {axiom.predicate} {axiom.object}")

    print(f"\nRelationships ({len(outcome.relationships)}):")
    for rel in outcome.relationships[:5]:
        print(f"  {rel.source_entity} --{rel.relationship_type}--> {rel.target_entity}")

    print(f"\nStats: {demo_pipeline.get_stats()}")
