import hashlib
import json
import logging
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

# Root of the Genesis system tree; all crawling and output paths hang off this.
# NOTE(review): hard-coded Windows drive path — confirm this matches deployment.
GENESIS_ROOT = Path("e:/genesis-system")
AIVA_DIR = GENESIS_ROOT / "AIVA"

import importlib.util

# Best-effort load of secrets into the environment before the RLM worker is
# created. A missing python-dotenv only warns — the key may already be set in
# the process environment.
try:
    import dotenv
    dotenv.load_dotenv(GENESIS_ROOT / "config" / "secrets.env")
except ImportError:
    print("Warning: python-dotenv not installed. Key might be missing.")

# Import RLMWorker directly from its file path (AIVA is not an installed
# package). The module is registered in sys.modules under "rlm_worker" so that
# code expecting that module name resolves to this instance.
# NOTE(review): if rlm_worker.py is absent, spec_from_file_location returns
# None and the resulting AttributeError is caught below — the script exits
# with an explanatory message instead of continuing.
try:
    spec = importlib.util.spec_from_file_location("rlm_worker", str(AIVA_DIR / "rlm_worker.py"))
    rlm_worker_module = importlib.util.module_from_spec(spec)
    sys.modules["rlm_worker"] = rlm_worker_module
    spec.loader.exec_module(rlm_worker_module)
    RLMWorker = rlm_worker_module.RLMWorker
except Exception as e:
    print(f"Error: Could not import RLMWorker. Ensure AIVA/rlm_worker.py exists. {e}")
    sys.exit(1)

# Configure logging. The logs directory is created first: logging.FileHandler
# raises FileNotFoundError at import time if the parent directory is missing
# (and nothing else in this script creates it — "logs" is even excluded from
# the crawl).
(GENESIS_ROOT / "logs").mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(GENESIS_ROOT / "logs" / "aiva_full_ingest.log", mode="a")
    ]
)
logger = logging.getLogger("AIVA_Full_Ingest")

class GenesisCrawler:
    """Crawl the Genesis system tree and distill each file into knowledge-graph
    entities and relationships ("royal jelly") via an RLM worker.

    Results are appended as JSONL records under KNOWLEDGE_GRAPH/entities and
    KNOWLEDGE_GRAPH/relationships.
    """

    def __init__(self):
        self.root_dir = GENESIS_ROOT
        self.kg_entities_dir = GENESIS_ROOT / "KNOWLEDGE_GRAPH" / "entities"
        self.kg_relationships_dir = GENESIS_ROOT / "KNOWLEDGE_GRAPH" / "relationships"

        # Ensure output directories exist
        self.kg_entities_dir.mkdir(parents=True, exist_ok=True)
        self.kg_relationships_dir.mkdir(parents=True, exist_ok=True)

        self.entities_file = self.kg_entities_dir / "genesis_full_system_map.jsonl"
        self.relationships_file = self.kg_relationships_dir / "genesis_full_system_map.jsonl"

        # Exclude patterns. Plain names match a directory at any depth;
        # slash-separated entries match a path relative to the genesis root
        # (see _is_excluded_dir).
        self.exclude_dirs = {
            ".git", ".venv", "node_modules", "__pycache__", ".pytest_cache",
            "dist", "build", "logs", ".claude", ".gemini", "archives", "backups",
            "dragonfly-data", "swarms", "swarm-output",
            "AIVA/queen_outputs", "AIVA/sprint_outputs"
        }
        self.exclude_exts = {
            ".pyc", ".log", ".jsonl", ".sqlite", ".db", ".png", ".jpg", ".jpeg",
            ".gif", ".mp3", ".wav", ".mp4", ".pdf", ".docx", ".pkl", ".env"
        }

        # Initialize RLM Worker for extracting "royal jelly" (entities/axioms)
        logger.info("Initializing RLM Worker (Sonnet 3.5)...")
        self.worker = RLMWorker(worker_id="aiva_ingest_bot_01", specialty="knowledge_extraction")

    def _is_excluded_dir(self, rel_posix: str) -> bool:
        """Return True if the root-relative POSIX path falls under a slash-style
        exclusion such as "AIVA/queen_outputs".

        Bug fix: these entries previously matched nothing — they were compared
        against single directory names, which can never contain a slash.
        """
        for pattern in self.exclude_dirs:
            if "/" in pattern and (rel_posix == pattern or rel_posix.startswith(pattern + "/")):
                return True
        return False

    def should_process(self, file_path: Path) -> bool:
        """Check if file should be processed based on exclusions.

        Skips non-files, excluded extensions, files inside excluded
        directories, and files over 500KB (to avoid context-limit blowouts).
        """
        if not file_path.is_file():
            return False

        if file_path.suffix.lower() in self.exclude_exts:
            return False

        # Check if any ancestor directory name is in the exclude list
        for parent in file_path.parents:
            if parent.name in self.exclude_dirs:
                return False

        # Slash-style exclusions are matched against the root-relative path
        try:
            if self._is_excluded_dir(file_path.relative_to(self.root_dir).as_posix()):
                return False
        except ValueError:
            pass  # file lives outside root_dir; no relative exclusion applies

        # Skip files over 500KB to avoid context limit blowouts
        try:
            if file_path.stat().st_size > 500 * 1024:
                return False
        except OSError:
            return False

        return True

    def extract_royal_jelly(self, file_path: Path, content: str):
        """Use RLM Worker to distill file content into entities and relationships.

        Returns a dict with "entities" and "relationships" lists. On Anthropic
        billing errors a minimal fallback entity is produced so the file is
        still mapped; on any other failure both lists come back empty.
        """
        relative_path = str(file_path.relative_to(self.root_dir))

        task_prompt = f"""
        You are Queen AIVA's Royal RLM Knowledge Worker.
        Your task is to ingest the following file from the Genesis system and distill it into "royal jelly" — High-value, structured knowledge entities and relationships.
        
        File: {relative_path}
        
        Analyze the content and extract:
        1. Core Entities: What components, systems, plans, or concepts are defined here?
        2. Relationships: How does this connect to other Genesis systems (e.g., AIVA, OpenClaw, RLM Gateway, Telnyx)?
        
        Output MUST BE STRICT VALID JSON in this exact format:
        {{
            "entities": [
                {{
                    "id": "snake_case_unique_id",
                    "type": "system_component|protocol|plan|agent|concept",
                    "name": "Human Readable Name",
                    "description": "Concise summary of what this is",
                    "properties": {{"key": "value"}}
                }}
            ],
            "relationships": [
                {{
                    "source_id": "entity_id_from_above_OR_external_genesis_entity",
                    "relation_type": "uses|implements|extends|manages|monitors|depends_on",
                    "target_id": "target_entity_id",
                    "description": "Why this relationship exists"
                }}
            ]
        }}
        
        Only include HIGH VALUE entities. Do not extract generic Python concepts. Focus on Genesis-specific architecture, AIVA, and revenue generation.
        """

        try:
            # Truncate content if too large (keep first 10k chars and last 2k
            # chars to save tokens while preserving header and trailer context)
            if len(content) > 12000:
                content = content[:10000] + "\n...[TRUNCATED]...\n" + content[-2000:]

            output, _metadata = self.worker.execute_task(task_description=task_prompt, context=content)

            # Clean JSON from markdown blocks if present
            if "```json" in output:
                output = output.split("```json")[1].split("```")[0].strip()
            elif "```" in output:
                output = output.split("```")[1].split("```")[0].strip()

            return json.loads(output)
        except Exception as e:
            if "credit balance is too low" in str(e):
                logger.warning(f"Anthropic billing error on {relative_path}. Generating fallback entity.")
                # Normalize both path-separator styles so the fallback id is
                # stable regardless of OS (relative_path uses the native sep)
                safe_id = relative_path.replace("\\", "_").replace("/", "_").replace(".", "_")
                return {
                    "entities": [
                        {
                            "id": f"file_{safe_id}",
                            "type": "genesis_file",
                            "name": file_path.name,
                            "description": f"Raw genesis-system file: {relative_path} (Fallback extraction)",
                            "properties": {"path": relative_path}
                        }
                    ],
                    "relationships": []
                }
            logger.error(f"Failed to extract royal jelly from {relative_path}: {e}")
            return {"entities": [], "relationships": []}

    def _append_jsonl(self, path: Path, records, source_file: str):
        """Append records to a JSONL file, stamping provenance metadata
        (source document, UTC timestamp, fixed confidence) on each record."""
        if not records:
            return

        timestamp = datetime.now(timezone.utc).isoformat()
        with open(path, 'a', encoding='utf-8') as f:
            for record in records:
                record['source_document'] = source_file
                record['timestamp'] = timestamp
                record['confidence'] = 0.95
                f.write(json.dumps(record) + '\n')

    def write_entities(self, entities, source_file: str):
        """Append entities to JSONL."""
        self._append_jsonl(self.entities_file, entities, source_file)

    def write_relationships(self, relationships, source_file: str):
        """Append relationships to JSONL."""
        self._append_jsonl(self.relationships_file, relationships, source_file)

    def crawl(self, limit=None):
        """Run the full crawl.

        Args:
            limit: optional maximum number of files to ingest; falsy means
                no limit. The final summary is logged either way (bug fix:
                hitting the limit previously returned before the summary).
        """
        logger.info(f"Starting AIVA Full Genesis System Ingestion from {self.root_dir}")
        logger.info(f"Entities output: {self.entities_file}")
        logger.info(f"Relationships output: {self.relationships_file}")

        files_processed = 0
        total_entities = 0
        total_relationships = 0

        # Ensure fresh files
        self.entities_file.unlink(missing_ok=True)
        self.relationships_file.unlink(missing_ok=True)

        limit_reached = False
        for root, dirs, files in os.walk(self.root_dir):
            rel_root = Path(root).relative_to(self.root_dir).as_posix()
            # Prune excluded directories in-place to prevent os.walk from
            # entering them. Handles both plain names and root-relative
            # slash patterns (bug fix: slash entries previously pruned nothing).
            dirs[:] = [
                d for d in dirs
                if d not in self.exclude_dirs
                and not self._is_excluded_dir(d if rel_root == "." else f"{rel_root}/{d}")
            ]

            for file in files:
                if limit and files_processed >= limit:
                    logger.info(f"Reached limit of {limit} files. Stopping crawl.")
                    limit_reached = True
                    break

                file_path = Path(root) / file

                if not self.should_process(file_path):
                    continue

                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                    if not content.strip():
                        continue

                    relative_path = str(file_path.relative_to(self.root_dir))
                    logger.info(f"Ingesting: {relative_path}")

                    royal_jelly = self.extract_royal_jelly(file_path, content)

                    entities = royal_jelly.get("entities", [])
                    relationships = royal_jelly.get("relationships", [])

                    self.write_entities(entities, relative_path)
                    self.write_relationships(relationships, relative_path)

                    files_processed += 1
                    total_entities += len(entities)
                    total_relationships += len(relationships)

                    logger.info(f"  Extracted {len(entities)} entities, {len(relationships)} relationships.")

                except Exception as e:
                    logger.error(f"Error processing {file_path}: {e}")

            if limit_reached:
                break

        logger.info("================================================")
        logger.info("INGESTION COMPLETE")
        logger.info(f"Files processed: {files_processed}")
        logger.info(f"Total entities extracted: {total_entities}")
        logger.info(f"Total relationships extracted: {total_relationships}")
        logger.info("================================================")

if __name__ == "__main__":
    crawler = GenesisCrawler()
    # Optional safety limit from the command line: `python <script> 50` ingests
    # at most 50 files. No argument keeps the original full-crawl behavior.
    # (Bug fix: the old comment claimed a default limit was applied, but none was.)
    limit = int(sys.argv[1]) if len(sys.argv) > 1 else None
    crawler.crawl(limit=limit)
