#!/usr/bin/env python3
"""
LightRAG Corpus Intelligence Layer
====================================
Indexes all Genesis deep_think files, KG axioms, and strategic documents.
Enables global synthesis queries over the full Genesis knowledge corpus.

Usage:
    # Test index (5 files):
    python3 core/lightrag_corpus.py --test

    # Full index (runs everything):
    python3 core/lightrag_corpus.py --full

    # Query:
    python3 core/lightrag_corpus.py --query "What are the dominant strategic patterns?"

    # List indexable files:
    python3 core/lightrag_corpus.py --list-files
"""

import os
import sys
import json
import asyncio
from pathlib import Path
from functools import partial

# Genesis root
GENESIS_ROOT = Path("/mnt/e/genesis-system")
LIGHTRAG_INDEX_DIR = GENESIS_ROOT / "KNOWLEDGE_GRAPH" / "lightrag_index"
# NOTE(review): directory creation runs at import time as a module side effect —
# importing this module on a machine without /mnt/e will raise; confirm intended.
LIGHTRAG_INDEX_DIR.mkdir(parents=True, exist_ok=True)

# Source directories for corpus indexing (priority order)
CORPUS_DIRS = [
    GENESIS_ROOT / "KNOWLEDGE_GRAPH",
    GENESIS_ROOT / "plans",
    GENESIS_ROOT / "deep_think",
    GENESIS_ROOT / "deep_think_results",
    GENESIS_ROOT / "core",
    GENESIS_ROOT / "Sunaiva",
    GENESIS_ROOT / "docs",
    GENESIS_ROOT / "Conversations",
    GENESIS_ROOT / "RECEPTIONISTAI" / "strategy",
    GENESIS_ROOT / "RECEPTIONISTAI" / "research",
    GENESIS_ROOT / "RECEPTIONISTAI" / "verticals",
    GENESIS_ROOT / "RECEPTIONISTAI" / "swarm_generated",
    GENESIS_ROOT / "TRADIES",
    GENESIS_ROOT / "prompts",
    GENESIS_ROOT / "n8n",
    GENESIS_ROOT / "hive" / "swarm_results",
    GENESIS_ROOT / "data" / "agents" / "forks",
    GENESIS_ROOT / "mcp-servers",
    GENESIS_ROOT / "loop" / "agents",
    GENESIS_ROOT / "Mission Briefs",
]

# Directories to skip during rglob traversal
# NOTE: "generated" excludes docs/plans/generated which contains ~40k auto-generated PRD stubs
# Matching is by exact path-component name (see get_corpus_files), so any directory
# with one of these names is skipped at any depth.
SKIP_DIRS = {
    "node_modules", "__pycache__", ".venv", "venv",
    "dist", "build", ".git", "lightrag_index", "generated",
}

# File extensions to include
INCLUDE_EXTENSIONS = {".md", ".txt", ".py", ".json"}


def get_corpus_files(max_files: "int | None" = None) -> "list[Path]":
    """Gather all indexable corpus files from CORPUS_DIRS plus root-level .md files.

    Args:
        max_files: Optional cap on the number of files returned. ``None``
            (or ``0``, kept for backward compatibility) means no limit.

    Returns:
        De-duplicated list of Paths: CORPUS_DIRS contents in priority order,
        followed by ``*.md`` files directly under GENESIS_ROOT (non-recursive).
    """
    files: "list[Path]" = []
    seen: "set[str]" = set()

    def _limit_reached() -> bool:
        # Truthiness check on purpose: None or 0 disables the cap,
        # preserving the original calling convention.
        return bool(max_files) and len(files) >= max_files

    def _add(fp: Path) -> None:
        """Add a file to the list if it passes all filters."""
        # Belt-and-braces: catches index paths even when "lightrag_index" is
        # embedded in a file name rather than a directory component.
        if "lightrag_index" in str(fp):
            return
        # Skip hidden files/directories anywhere in the path.
        if any(part.startswith(".") for part in fp.parts):
            return
        # Skip vendored / generated / build directories.
        if any(part in SKIP_DIRS for part in fp.parts):
            return
        abs_path = str(fp.resolve())
        if abs_path in seen:  # de-dup across overlapping CORPUS_DIRS
            return
        seen.add(abs_path)
        files.append(fp)

    # Recursively collect from each corpus directory, in priority order.
    for d in CORPUS_DIRS:
        if not d.exists():
            continue
        for ext in INCLUDE_EXTENSIONS:
            for fp in d.rglob(f"*{ext}"):
                _add(fp)
                if _limit_reached():
                    return files

    # Also collect root-level .md files directly in GENESIS_ROOT (non-recursive)
    for fp in GENESIS_ROOT.glob("*.md"):
        _add(fp)
        if _limit_reached():
            return files

    return files


def read_file_safe(fp: Path) -> str:
    """Best-effort text read: return the file's contents, or "" on any error.

    Decoding problems are ignored (errors="ignore"), so binary junk inside an
    otherwise-readable file never aborts indexing.
    """
    try:
        text = fp.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return ""
    return text


def build_rag():
    """Build and return a (LightRAG, QueryParam) pair pointed at our index directory.

    Returns:
        tuple: ``(rag, QueryParam)`` — ``rag`` is a LightRAG instance whose
        storages are NOT yet initialized (callers must await
        ``initialize_storages()`` before use); ``QueryParam`` is lightrag's
        query-parameter class, returned so callers need not re-import it.

    Exits the process (sys.exit(1)) if lightrag-hku is not importable.
    """
    # Lazy import so the CLI can still run --help / --list-files without lightrag.
    try:
        from lightrag import LightRAG, QueryParam
        from lightrag.llm.gemini import gemini_complete_if_cache
    except ImportError as e:
        print(f"ERROR: lightrag-hku not importable: {e}")
        print("Install with: /mnt/e/genesis-system/.venv/bin/pip install lightrag-hku")
        sys.exit(1)

    # First key found wins; the order encodes preference.
    api_key = (
        os.environ.get("GEMINI_API_KEY_NEW")
        or os.environ.get("GEMINI_API_KEY")
        or os.environ.get("GOOGLE_API_KEY")
    )
    if not api_key:
        # NOTE(review): execution continues with api_key=None; downstream Gemini
        # calls will presumably fail at request time — confirm this is intended.
        print("WARNING: No Gemini API key found. Set GEMINI_API_KEY_NEW or GEMINI_API_KEY.")

    # Wrap gemini_complete_if_cache as the LightRAG llm_model_func
    async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        """Adapter matching LightRAG's expected llm_model_func signature."""
        return await gemini_complete_if_cache(
            model="gemini-2.0-flash",
            prompt=prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            api_key=api_key,
            **kwargs,
        )

    # Custom embedding using google.genai (gemini-embedding-001, 3072d)
    # Bypasses lightrag's built-in gemini_embed which has a batch-doubling bug
    from lightrag.utils import EmbeddingFunc
    import numpy as np
    from google import genai as _google_genai
    _gclient = _google_genai.Client(api_key=api_key)

    async def embedding_func_with_key(texts: list) -> np.ndarray:
        """Embed each text; returns a (len(texts), 3072) float32 array.

        NOTE(review): embed_content looks like a synchronous call inside an
        async function (blocks the event loop) and texts are embedded one at a
        time with no batching — likely fine for offline indexing, but worth
        confirming for query-time latency.
        """
        vectors = []
        for text in texts:
            result = _gclient.models.embed_content(
                model="gemini-embedding-001",
                contents=text,
            )
            vectors.append(list(result.embeddings[0].values))
        return np.array(vectors, dtype=np.float32)

    embed_func = EmbeddingFunc(
        embedding_dim=3072,  # gemini-embedding-001 output dimensionality
        max_token_size=2048,
        func=embedding_func_with_key,
    )

    rag = LightRAG(
        working_dir=str(LIGHTRAG_INDEX_DIR),
        llm_model_func=llm_model_func,
        embedding_func=embed_func,
    )
    return rag, QueryParam


async def init_rag(rag):
    """Await storage initialization on *rag* (required by LightRAG v1.4+), then hand it back."""
    await rag.initialize_storages()
    return rag


async def index_files(files: list, rag) -> dict:
    """Insert files into LightRAG index. Returns stats.

    Stats dict keys: "indexed" (count), "skipped" (near-empty files),
    "errors" (list of "name: exception" strings).
    """
    # LightRAG v1.4+ requires explicit storage initialization before inserts.
    await rag.initialize_storages()
    stats = {"indexed": 0, "skipped": 0, "errors": []}
    for path in files:
        text = read_file_safe(path)
        # Ignore empty / near-empty files (fewer than 50 meaningful chars).
        if len(text.strip()) < 50:
            stats["skipped"] += 1
            continue
        # Prefix the document with its path so provenance survives chunking.
        document = f"[FILE: {path}]\n\n{text}"
        try:
            await rag.ainsert(document)
        except Exception as err:
            stats["errors"].append(f"{path.name}: {err}")
            print(f"  ERROR indexing {path.name}: {err}")
        else:
            stats["indexed"] += 1
            print(f"  Indexed: {path.name} ({len(text):,} chars)")
    return stats


async def query_corpus(query: str, mode: str = "global") -> str:
    """
    Run a synthesis query over the indexed corpus and return the answer text.

    Supported modes:
        global  - Synthesizes across entire corpus (best for patterns/strategy)
        local   - Focuses on most relevant entities
        hybrid  - Combines both
        naive   - Simple similarity search
    """
    rag, QueryParam = build_rag()
    await rag.initialize_storages()
    return await rag.aquery(query, param=QueryParam(mode=mode))


async def run_test_index():
    """Index the first five corpus files, then sanity-check with a naive query."""
    print("=== LightRAG Test Index (5 files) ===")
    sample = get_corpus_files(max_files=5)
    print(f"Selected {len(sample)} files:")
    for path in sample:
        print(f"  - {path}")

    rag, QueryParam = build_rag()
    stats = await index_files(sample, rag)
    print(f"\nIndex stats: {json.dumps(stats, indent=2)}")

    # Only bother querying if at least one document made it into the index.
    if stats["indexed"] > 0:
        question = "What is Genesis System and what does it do?"
        print("\n=== Test Query ===")
        print(f"Query: {question}")
        try:
            answer = await rag.aquery(question, param=QueryParam(mode="naive"))
        except Exception as e:
            print(f"Query error: {e}")
        else:
            print(f"Result:\n{answer}")

    return stats


async def run_full_index():
    """Index all corpus files and write an index manifest.

    Returns:
        dict: stats with "indexed", "skipped" and "errors" keys (see index_files).
    """
    print("=== LightRAG Full Corpus Index ===")
    files = get_corpus_files()
    print(f"Total files to index: {len(files)}")

    # QueryParam is not needed for indexing; bind the unused slot to `_`.
    rag, _ = build_rag()
    stats = await index_files(files, rag)

    # Persist a summary next to the index so later runs can see what was indexed.
    manifest = {
        "total_files": len(files),
        "indexed": stats["indexed"],
        "skipped": stats["skipped"],
        "errors": stats["errors"],
        "index_dir": str(LIGHTRAG_INDEX_DIR),
    }
    manifest_path = LIGHTRAG_INDEX_DIR / "index_manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2))
    print(f"\nManifest saved: {manifest_path}")
    print(f"Final stats: {json.dumps(stats, indent=2)}")
    return stats


def main():
    """CLI entry point: dispatch on --list-files / --test / --full / --query."""
    import argparse
    parser = argparse.ArgumentParser(description="LightRAG Corpus Intelligence Layer")
    parser.add_argument("--test", action="store_true", help="Index 5 files and run test query")
    parser.add_argument("--full", action="store_true", help="Full corpus index")
    parser.add_argument("--query", type=str, help="Run a synthesis query")
    parser.add_argument("--mode", type=str, default="global",
                        choices=["global", "local", "hybrid", "naive"],
                        help="Query mode (default: global)")
    parser.add_argument("--list-files", action="store_true", help="List all corpus files")
    args = parser.parse_args()

    # Single dispatch chain; --list-files wins, then --test, --full, --query.
    if args.list_files:
        corpus = get_corpus_files()
        print(f"Total corpus files: {len(corpus)}")
        for path in corpus:
            print(f"  {path}")
    elif args.test:
        asyncio.run(run_test_index())
    elif args.full:
        asyncio.run(run_full_index())
    elif args.query:
        answer = asyncio.run(query_corpus(args.query, mode=args.mode))
        print(f"\n[{args.mode.upper()} QUERY]: {args.query}\n")
        print(answer)
    else:
        parser.print_help()


if __name__ == "__main__":  # script entry point; no-op when imported as a module
    main()
