# knowledge_graph_engine.py
import json
import os
import re
from collections import defaultdict, deque
from pathlib import Path
from typing import Any, Dict, List, Optional

import networkx as nx

class KnowledgeGraphEngine:
    """
    Advanced Knowledge Graph Engine with deduplication, updates, and text extraction.

    Entities and relationships are persisted as append-only JSONL logs under
    ``<workspace>/knowledge_graph`` and replayed into an in-memory NetworkX
    graph on startup. Replaying applies records in order, so the last record
    written for an entity ID wins (this is how updates survive restarts).
    """

    def __init__(self, workspace_path: str = "aiva_kg"):
        """Creates the workspace directories and replays any persisted logs.

        Args:
            workspace_path: Root directory for the knowledge-graph workspace.
        """
        self.workspace = Path(workspace_path)
        self.kg_dir = self.workspace / "knowledge_graph"
        self.entities_path = self.kg_dir / "entities.jsonl"
        self.relationships_path = self.kg_dir / "relationships.jsonl"
        self.graph = nx.Graph()  # Or nx.DiGraph() for directed graph
        self.entity_ids = set()  # Known entity IDs, used for deduplication
        self.entity_dedupe_fields = ["type"]  # Fields compared when deduplicating entities
        self.kg_dir.mkdir(parents=True, exist_ok=True)
        self._load_graph()

    def _load_graph(self):
        """Replays the persisted JSONL logs into the NetworkX graph.

        BUG FIX: the original required BOTH files to exist before loading
        anything, so entities were silently dropped whenever the relationship
        log was missing. Each file is now loaded independently.
        """
        # Load Entities (load=True: replay only -- do not re-persist; the
        # last record for an ID wins because add_node merges attributes).
        if self.entities_path.exists():
            with open(self.entities_path, "r", encoding="utf-8") as f:
                for line in f:
                    if line.strip():  # tolerate blank lines in the log
                        self.add_entity(json.loads(line), load=True)

        # Load Relationships (load=True prevents re-appending to the log --
        # see add_relationship).
        if self.relationships_path.exists():
            with open(self.relationships_path, "r", encoding="utf-8") as f:
                for line in f:
                    if line.strip():
                        self.add_relationship(json.loads(line), load=True)

    def add_entity(self, entity: Dict[str, Any], load: bool = False):
        """Adds an entity to the graph, handling deduplication and updates.

        Args:
            entity: Dict with at least an ``"id"`` key; all other keys become
                node attributes.
            load: True when replaying the persisted log -- bypasses the dedupe
                check and skips re-appending to disk.
        """
        entity_id = entity["id"]

        if entity_id in self.entity_ids and not load:  # Deduplication check. Skip if loading
            existing_entity = self.graph.nodes[entity_id]
            # Entities matching on every dedupe field are treated as true
            # duplicates and dropped. NOTE(review): only entity_dedupe_fields
            # (default: ["type"]) are compared -- changes to other fields of a
            # "duplicate" are intentionally ignored here.
            if all(entity.get(field, None) == existing_entity.get(field, None) for field in self.entity_dedupe_fields):
                return  # Skip if same.
            # Otherwise merge the new attributes into the existing node and
            # persist the update so it survives a reload (BUG FIX: updates
            # were previously lost on restart).
            self.graph.nodes[entity_id].update(entity)
            self._append_jsonl(self.entities_path, entity)
            return  # Updated existing entity.

        # add_node merges attributes when the node already exists, so log
        # replay naturally applies updates in write order.
        self.graph.add_node(entity_id, **entity)
        self.entity_ids.add(entity_id)

        if not load:
            self._append_jsonl(self.entities_path, entity)

    def add_relationship(self, relationship: Dict[str, Any], load: bool = False):
        """Adds a relationship edge between two existing entities.

        Args:
            relationship: Dict with ``"from"`` and ``"to"`` entity IDs; all
                keys become edge attributes.
            load: True when replaying the persisted log -- skips re-appending
                to disk. (BUG FIX: loading previously re-wrote every
                relationship to the JSONL log on each startup, duplicating
                the file's contents every run.)
        """
        from_id = relationship["from"]
        to_id = relationship["to"]

        # Edges may only connect entities that are already in the graph.
        if from_id not in self.entity_ids or to_id not in self.entity_ids:
            print(f"Warning: Skipping relationship from {from_id} to {to_id} because one or both entities do not exist.")
            return

        self.graph.add_edge(from_id, to_id, **relationship)
        if not load:
            self._append_jsonl(self.relationships_path, relationship)

    def _append_jsonl(self, file_path: Path, data: Dict[str, Any]):
        """Appends a JSON object as one line to a .jsonl file."""
        with open(file_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(data) + "\n")

    def extract_entities_from_text(self, text: str) -> List[Dict[str, Any]]:
        """Placeholder for extracting entities from unstructured text using NLP.

        This is a simplified example; a real implementation would use NLP
        techniques. Returns minimal entity dicts suitable for add_entity.
        """
        entities = []
        # Example: Extract potential entity IDs based on a pattern.
        potential_ids = re.findall(r"[A-Z]+_[a-zA-Z0-9-]+", text)  # Match IDs like PROTOCOL_VERIFICATION_FIRST
        for entity_id in potential_ids:
            entities.append({"id": entity_id, "type": "unknown", "source": "text_extraction"})  # Minimal entity

        # Example: Extract tool names (very basic)
        tools = re.findall(r"(Gemini|ChatGPT|SORA|GoHighLevel|Telnyx|Instantly\.ai|Apify|Pomelli|Veo|n8n|PostgreSQL|Redis|Qdrant|ElevenLabs)", text)
        for tool in tools:
            entities.append({"id": f"TOOL_{tool}", "type": "tool", "name": tool, "source": "text_extraction"})

        return entities

    def query_paths(self, start_entity: str, end_entity: str) -> List[List[str]]:
        """Finds all simple paths between two entities in the graph.

        Returns an empty list when either node is missing or no path exists.
        """
        try:
            return list(nx.all_simple_paths(self.graph, source=start_entity, target=end_entity))
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            return []

    def get_entity(self, entity_id: str) -> Optional[Dict[str, Any]]:
        """Retrieves an entity's attribute dict by ID, or None if absent."""
        if entity_id in self.graph.nodes:
            return self.graph.nodes[entity_id]
        return None

    def multi_hop_search(self, start_entity: str, max_hops: int = 2) -> List[Dict]:
        """
        Performs a breadth-first multi-hop search to find deep structural
        relationships, returning one record per newly-discovered edge.
        """
        if start_entity not in self.graph:
            return []

        results = []
        visited = {start_entity}
        # deque gives O(1) popleft; list.pop(0) was O(n) per dequeue.
        queue = deque([(start_entity, 0)])

        while queue:
            node_id, hop = queue.popleft()
            if hop >= max_hops:
                continue

            for neighbor in self.graph.neighbors(node_id):
                if neighbor not in visited:
                    visited.add(neighbor)
                    edge_data = self.graph.get_edge_data(node_id, neighbor)
                    node_data = self.graph.nodes[neighbor]

                    results.append({
                        "from": node_id,
                        "to": neighbor,
                        "hop": hop + 1,
                        "relationship": edge_data.get("type", "related"),
                        "entity_type": node_data.get("type", "unknown")
                    })
                    queue.append((neighbor, hop + 1))

        return results

    def hybrid_retrieve(self, query: str, vector_results: List[str]) -> List[Dict]:
        """
        Combines vector search results with structural graph neighbours.
        This provides BOTH semantic similarity AND relational context.

        NOTE: ``query`` is currently unused; expansion is driven purely by
        the entity IDs in ``vector_results``.
        """
        hybrid_results = []
        for entity_id in vector_results:
            if entity_id in self.graph:
                # Add the entity itself
                hybrid_results.append({
                    "id": entity_id,
                    "data": self.graph.nodes[entity_id],
                    "source": "vector"
                })
                # Add immediate neighbors (1-hop) as context
                for neighbor in self.graph.neighbors(entity_id):
                    hybrid_results.append({
                        "id": neighbor,
                        "data": self.graph.nodes[neighbor],
                        "source": "graph_expansion",
                        "related_to": entity_id
                    })
        return hybrid_results

if __name__ == "__main__":
    # Demo: build a tiny knowledge graph and exercise each query path.
    kg_engine = KnowledgeGraphEngine()

    # Seed a handful of entities.
    person = {"id": "PERSON_1", "type": "Person", "name": "Alice"}
    concept = {"id": "CONCEPT_1", "type": "Concept", "name": "Knowledge Graph"}
    tool = {"id": "TOOL_1", "type": "Tool", "name": "NetworkX"}
    for seed in (person, concept, tool):
        kg_engine.add_entity(seed)

    # Wire the entities together.
    kg_engine.add_relationship({"from": "PERSON_1", "to": "CONCEPT_1", "type": "knows_about"})
    kg_engine.add_relationship({"from": "PERSON_1", "to": "TOOL_1", "type": "uses"})

    # Path query between two seeded nodes.
    paths = kg_engine.query_paths("PERSON_1", "CONCEPT_1")
    print(f"Paths from PERSON_1 to CONCEPT_1: {paths}")

    # Pull entities out of free text and fold them into the graph.
    sample = "Alice uses NetworkX and is interested in Knowledge Graph.  Also, PROTOCOL_VERIFICATION_FIRST is important."
    extracted_entities = kg_engine.extract_entities_from_text(sample)
    print(f"Extracted Entities: {extracted_entities}")
    for found in extracted_entities:
        kg_engine.add_entity(found)  # Add extracted entities to the KG

    # Deduplication check: re-add the same entity, then an updated variant.
    kg_engine.add_entity(person)
    kg_engine.add_entity({"id": "PERSON_1", "type": "Person", "name": "Alice Smith", "age": 30})
    print(f"Entity Person_1: {kg_engine.get_entity('PERSON_1')}")

    # Multi-hop traversal from the person node.
    print(f"Multi-hop search from PERSON_1: {json.dumps(kg_engine.multi_hop_search('PERSON_1'), indent=2)}")

    # Hybrid retrieval: vector hits plus their 1-hop graph context.
    vector_results = ["PERSON_1", "CONCEPT_1"]
    print(f"Hybrid Retrieve with vector results {vector_results}: {json.dumps(kg_engine.hybrid_retrieve('some query', vector_results), indent=2)}")

    print(f"Graph loaded with {kg_engine.graph.number_of_nodes()} nodes and {kg_engine.graph.number_of_edges()} edges.")