#!/usr/bin/env python3
"""
Deep Think Results → Knowledge Graph Ingestion Pipeline
Reads all files in deep_think_results/ and commits entities + axioms to KG.

Usage:
  python3 scripts/deep_think_kg_ingest.py                  # Full ingest
  python3 scripts/deep_think_kg_ingest.py --verify         # Check KG counts
  python3 scripts/deep_think_kg_ingest.py --query "..."    # Semantic search
  python3 scripts/deep_think_kg_ingest.py --dry-run        # Preview only
  python3 scripts/deep_think_kg_ingest.py --file DT5       # Single file
"""

import os
import sys
import json
import hashlib
import argparse
import asyncio
from pathlib import Path
from datetime import datetime
from typing import Optional

# Paths — all I/O is rooted at the Genesis project mount.
BASE_DIR = Path("/mnt/e/genesis-system")  # project root (WSL mount path)
DT_DIR = BASE_DIR / "deep_think_results"  # primary input corpus
HYPER_DIR = BASE_DIR / "hyperdrive_results"  # secondary input corpus
KG_DIR = BASE_DIR / "KNOWLEDGE_GRAPH"  # knowledge-graph output root
ENTITIES_OUT = KG_DIR / "entities" / "deep_think_entities.jsonl"  # entity records (JSONL, append-only)
AXIOMS_OUT = KG_DIR / "axioms" / "deep_think_axioms.jsonl"  # axiom records (JSONL, append-only)
REPORT_OUT = KG_DIR / "deep_think_ingest_report.md"  # per-run markdown report

# Extraction prompt for Gemini Flash
# Extraction prompt template. Placeholders: {filename}, {chunk}, {slug}, {n}.
# Both extractors call .format(filename=..., chunk=..., slug=..., n=...);
# the "Document:" line previously contained a literal "(unknown)" artifact
# where the {filename} placeholder belonged. Double braces are literal JSON.
EXTRACTION_PROMPT = """You are extracting structured knowledge from a Genesis strategic document.

Document: {filename}
Content chunk:
---
{chunk}
---

Extract and return ONLY valid JSON with this structure:
{{
  "entities": [
    {{
      "id": "dt_entity_{slug}_{n}",
      "type": "DECISION|STRATEGY|SYSTEM|PATTERN|ACTION|INSIGHT",
      "name": "concise name (max 60 chars)",
      "content": "full description of the entity",
      "confidence": 0.0-1.0,
      "status": "LOCKED|RECOMMENDED|PROPOSED|FUTURE",
      "connections": ["other entity IDs if mentioned"]
    }}
  ],
  "axioms": [
    {{
      "id": "dt_axiom_{slug}_{n}",
      "statement": "single validated truth statement",
      "domain": "revenue|memory|voice|agent|compliance|product|operations",
      "confidence": 0.0-1.0,
      "auto_inject": true/false
    }}
  ],
  "next_actions": [
    "concrete next action extracted from document"
  ]
}}

Rules:
- Only extract genuinely important, reusable knowledge
- Axioms must be timeless truths, not one-time facts
- Entities must be architectural decisions, strategies, or systems
- Skip administrative/procedural content
- Return empty arrays if nothing significant found
"""

def load_secrets():
    """Parse KEY=VALUE pairs from config/secrets.env into a dict.

    Blank lines, '#' comments, and lines without '=' are ignored; values
    have surrounding double then single quotes stripped. Returns an empty
    dict when the file is missing.
    """
    env_file = BASE_DIR / "config" / "secrets.env"
    pairs = {}
    if env_file.exists():
        for raw in env_file.read_text().splitlines():
            raw = raw.strip()
            if not raw or raw.startswith('#') or '=' not in raw:
                continue
            key, _, value = raw.partition('=')
            pairs[key.strip()] = value.strip().strip('"').strip("'")
    return pairs

def load_existing_hashes() -> set:
    """Collect SHA-256 digests of text already present in the KG outputs.

    Hashes the 'content' (entities) or 'statement' (axioms) field of every
    parseable JSONL line so the ingest loop can skip duplicates.
    """
    seen: set = set()
    for out_path in (ENTITIES_OUT, AXIOMS_OUT):
        if not out_path.exists():
            continue
        with open(out_path) as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    record = json.loads(raw)
                except json.JSONDecodeError:
                    continue  # tolerate corrupt lines in append-only files
                text = record.get('content') or record.get('statement') or ''
                seen.add(hashlib.sha256(text.encode()).hexdigest())
    return seen

def chunk_document(content: str, max_tokens: int = 1800) -> list[str]:
    """Split a markdown document into chunks of at most ~max_tokens tokens.

    Splits on H2/H3 headers first, then packs consecutive sections into
    chunks under the budget. Fix over the previous version: a single section
    larger than the budget is now sliced into budget-sized pieces instead of
    being emitted whole (which defeated the token cap for headerless docs).

    Token count is estimated at ~4 characters per token.
    """
    import re
    max_chars = max_tokens * 4

    # Split on markdown H2/H3 headers (lookahead keeps the header with its body).
    sections = re.split(r'\n(?=#{2,3}\s)', content)

    # Slice any oversized section so no single piece can exceed the budget.
    pieces: list[str] = []
    for section in sections:
        if len(section) > max_chars:
            pieces.extend(section[i:i + max_chars]
                          for i in range(0, len(section), max_chars))
        else:
            pieces.append(section)

    # Greedily pack pieces into chunks under the character budget.
    chunks: list[str] = []
    current = ""
    for piece in pieces:
        if len(current) + len(piece) > max_chars:
            if current.strip():
                chunks.append(current.strip())
            current = piece
        else:
            current += "\n" + piece
    if current.strip():
        chunks.append(current.strip())

    # Fallback for effectively-empty input: return one (possibly empty) chunk.
    return chunks or [content[:max_chars]]

async def extract_with_gemini(chunk: str, filename: str, slug: str, secrets: dict) -> dict:
    """Use Gemini Flash to extract entities and axioms from a chunk.

    Falls back to OpenRouter when no Google API key is configured or when
    the SDK import, API call, or JSON parse fails for any reason.

    Returns the parsed extraction dict ({"entities": [...], "axioms": [...],
    "next_actions": [...]}) from whichever backend succeeds.
    """
    try:
        from google import genai as new_genai
        api_key = secrets.get('GEMINI_API_KEY') or secrets.get('GOOGLE_API_KEY')
        if not api_key:
            return await extract_with_openrouter(chunk, filename, slug, secrets)

        client = new_genai.Client(api_key=api_key)

        # Chunk text is capped at 6000 chars to stay inside the prompt budget.
        prompt = EXTRACTION_PROMPT.format(
            filename=filename,
            chunk=chunk[:6000],
            slug=slug[:20],
            n=0
        )

        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=prompt
        )
        text = response.text.strip()

        # Extract JSON from response — models often wrap output in code fences.
        if '```json' in text:
            text = text.split('```json')[1].split('```')[0].strip()
        elif '```' in text:
            text = text.split('```')[1].split('```')[0].strip()

        return json.loads(text)
    except Exception as e:
        # Fix: previously printed the literal "(unknown)" instead of the
        # file actually being processed.
        print(f"  ⚠️  Gemini error for {filename}: {e} — trying OpenRouter")
        return await extract_with_openrouter(chunk, filename, slug, secrets)

async def extract_with_openrouter(chunk: str, filename: str, slug: str, secrets: dict) -> dict:
    """Fallback extractor: call Gemini Flash through the OpenRouter API.

    Returns empty entity/axiom/action lists when no OPENROUTER_API_KEY is
    configured or when the request or JSON parse fails.
    """
    try:
        import httpx

        api_key = secrets.get('OPENROUTER_API_KEY')
        if not api_key:
            return {"entities": [], "axioms": [], "next_actions": []}

        prompt = EXTRACTION_PROMPT.format(
            filename=filename,
            chunk=chunk[:6000],
            slug=slug[:20],
            n=0
        )

        payload = {
            "model": "google/gemini-2.0-flash",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1
        }
        async with httpx.AsyncClient(timeout=60) as client:
            resp = await client.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={"Authorization": f"Bearer {api_key}"},
                json=payload
            )
            resp.raise_for_status()
            text = resp.json()['choices'][0]['message']['content'].strip()

        # Strip markdown code fences the model may wrap around its JSON.
        if '```json' in text:
            text = text.split('```json')[1].split('```')[0].strip()
        elif '```' in text:
            text = text.split('```')[1].split('```')[0].strip()

        return json.loads(text)
    except Exception as e:
        print(f"  ⚠️  OpenRouter extraction error: {e}")
        return {"entities": [], "axioms": [], "next_actions": []}

def write_entity(f, entity: dict, source_file: str, existing_hashes: set) -> bool:
    """Append one entity record to the JSONL stream unless it is a duplicate.

    The dedup key is the SHA-256 of the entity's 'content' field; on a hit
    nothing is written and False is returned. On a write, provenance fields
    are appended and the digest is registered in *existing_hashes*.
    """
    digest = hashlib.sha256(entity.get('content', '').encode()).hexdigest()
    if digest in existing_hashes:
        return False

    enriched = dict(entity)
    enriched.update(
        source_file=source_file,
        date=datetime.now().strftime("%Y-%m-%d"),
        provenance="deep_think_ingest_v1",
        access_count=0,
    )
    f.write(json.dumps(enriched) + "\n")
    existing_hashes.add(digest)
    return True

def write_axiom(f, axiom: dict, source_file: str, existing_hashes: set) -> bool:
    """Append one axiom record to the JSONL stream unless it is a duplicate.

    The dedup key is the SHA-256 of the axiom's 'statement' field. An axiom
    is marked validated when its confidence is at least 0.85. Returns True
    only when a record was actually written.
    """
    digest = hashlib.sha256(axiom.get('statement', '').encode()).hexdigest()
    if digest in existing_hashes:
        return False

    enriched = dict(axiom)
    enriched.update(
        source_file=source_file,
        date=datetime.now().strftime("%Y-%m-%d"),
        validated=axiom.get('confidence', 0) >= 0.85,
        provenance="deep_think_ingest_v1",
        created_at=datetime.now().isoformat(),
        access_count=0,
    )
    f.write(json.dumps(enriched) + "\n")
    existing_hashes.add(digest)
    return True

def get_dt_files(filter_name: Optional[str] = None) -> list[Path]:
    """Collect candidate markdown and JSON files from the result directories.

    All *.md files qualify; *.json files qualify only when the name marks
    them as a result payload ("_result" or "RESPONSE"). When *filter_name*
    is given, only files whose name contains it (case-insensitive) are kept.
    Results are returned sorted.
    """
    needle = filter_name.lower() if filter_name is not None else None

    def matches(p: Path) -> bool:
        # Case-insensitive substring match; no filter means everything passes.
        return needle is None or needle in p.name.lower()

    collected: list[Path] = []
    for root in (DT_DIR, HYPER_DIR):
        if not root.exists():
            continue
        collected.extend(p for p in root.rglob("*.md") if matches(p))
        collected.extend(
            p for p in root.rglob("*.json")
            if matches(p) and ("_result" in p.name or "RESPONSE" in p.name)
        )
    return sorted(collected)

def verify_kg():
    """Print record counts for the KG output files plus a sample of input files.

    Purely informational — reads the JSONL outputs and lists up to ten of
    the deep-think source files with their sizes.
    """
    print("\n📊 KNOWLEDGE GRAPH STATISTICS")
    print("=" * 50)

    for label, path in [("Entities (all)", KG_DIR / "entities.jsonl"),
                         ("Deep Think Entities", ENTITIES_OUT),
                         ("Axioms (all)", KG_DIR / "axioms" / "opus_46_axioms.jsonl"),
                         ("Deep Think Axioms", AXIOMS_OUT)]:
        if path.exists():
            # Count non-blank lines. Fix: the previous bare open() inside
            # sum() leaked the file handle until garbage collection.
            with open(path) as fh:
                count = sum(1 for line in fh if line.strip())
            print(f"  {label}: {count} records in {path.name}")
        else:
            print(f"  {label}: NOT FOUND ({path})")

    print("\n📁 Deep Think Results Files:")
    files = get_dt_files()
    print(f"  Total files: {len(files)}")
    for f in files[:10]:
        size = f.stat().st_size
        print(f"  {f.name} ({size:,} bytes)")
    if len(files) > 10:
        print(f"  ... and {len(files) - 10} more")

def semantic_query(query: str, secrets: dict):
    """Query the KG via Qdrant, falling back to a local JSONL substring scan.

    The fallback runs on any Qdrant failure (missing client, bad URL, auth)
    and prints up to five matching records.
    """
    try:
        # Fix: dropped the unused `Distance, VectorParams` import.
        from qdrant_client import QdrantClient

        qdrant_url = secrets.get('QDRANT_URL', 'http://qdrant-b3knu-u50607.vm.elestio.app:6333')
        qdrant_key = secrets.get('QDRANT_API_KEY', '')

        client = QdrantClient(url=qdrant_url, api_key=qdrant_key if qdrant_key else None)

        # NOTE(review): scroll() ignores `query` entirely — it just returns
        # the first 5 points. True semantic search needs an embedding plus
        # client.search(); keeping the existing best-effort behavior here.
        results = client.scroll(
            collection_name="genesis_memories",
            limit=5,
            with_payload=True
        )

        print(f"\n🔍 Query: '{query}'")
        print("=" * 50)
        if results[0]:
            for point in results[0]:
                payload = point.payload or {}
                print(f"\n  [{payload.get('type', 'unknown')}] {payload.get('name', 'N/A')}")
                print(f"  {payload.get('content', payload.get('statement', ''))[:200]}")
        else:
            print("  No results found")

    except Exception:
        # Fallback: grep the JSONL files
        print(f"\n🔍 Query: '{query}' (local JSONL search)")
        print("=" * 50)
        query_lower = query.lower()
        count = 0
        for path in [ENTITIES_OUT, AXIOMS_OUT]:
            if path.exists():
                with open(path) as f:
                    for line in f:
                        if query_lower in line.lower():
                            # Fix: was a bare `except:` which also swallowed
                            # KeyboardInterrupt/SystemExit; only malformed
                            # JSON lines should be skipped.
                            try:
                                obj = json.loads(line)
                            except json.JSONDecodeError:
                                continue
                            print(f"\n  [{obj.get('type', 'axiom')}] {obj.get('name', obj.get('id', ''))}")
                            print(f"  {obj.get('content', obj.get('statement', ''))[:200]}")
                            count += 1
                            if count >= 5:
                                return

async def run_ingest(args) -> None:
    """Main ingestion pipeline.

    Chunks every deep-think result file, extracts entities/axioms via
    Gemini (with OpenRouter fallback), appends deduplicated records to the
    JSONL outputs, and writes a markdown run report to REPORT_OUT.

    Args:
        args: parsed CLI namespace; only .file and .dry_run are consumed
              here (--batch-size / --vectorise are accepted by the CLI but
              not used in this function).
    """
    secrets = load_secrets()

    # Ensure output directories exist
    ENTITIES_OUT.parent.mkdir(parents=True, exist_ok=True)
    AXIOMS_OUT.parent.mkdir(parents=True, exist_ok=True)

    # Load existing hashes for deduplication across runs
    existing_hashes = load_existing_hashes()
    print(f"🔍 Loaded {len(existing_hashes)} existing KG entries for dedup")

    # Get files to process (argparse always defines .file; the hasattr guard
    # only matters when called with a hand-built namespace)
    files = get_dt_files(args.file if hasattr(args, 'file') else None)
    print(f"📁 Found {len(files)} files to process")

    if not files:
        print("❌ No files found in deep_think_results/")
        return

    # Stats accumulated across all files for the final report
    stats = {
        "files_processed": 0,
        "files_skipped": 0,
        "entities_added": 0,
        "axioms_added": 0,
        "next_actions": [],
        "errors": []
    }

    # Open output files in append mode when they already exist
    # ('a' would create the file too, so the 'w' branch is just explicit)
    entity_mode = 'a' if ENTITIES_OUT.exists() else 'w'
    axiom_mode = 'a' if AXIOMS_OUT.exists() else 'w'

    with open(ENTITIES_OUT, entity_mode) as ef, open(AXIOMS_OUT, axiom_mode) as af:
        for i, file_path in enumerate(files):
            print(f"\n[{i+1}/{len(files)}] Processing: {file_path.name}")

            try:
                content = file_path.read_text(encoding='utf-8', errors='ignore')
                # Skip trivially small files — nothing worth extracting
                if len(content) < 100:
                    print(f"  ⏭️  Skipping (too short: {len(content)} chars)")
                    stats["files_skipped"] += 1
                    continue

                # Slug feeds the generated entity/axiom IDs
                slug = file_path.stem.lower().replace(' ', '_')[:20]
                chunks = chunk_document(content)
                print(f"  📄 {len(chunks)} chunks · {len(content):,} chars")

                file_entities = 0
                file_axioms = 0

                for j, chunk in enumerate(chunks):
                    if args.dry_run:
                        print(f"  [DRY RUN] Would process chunk {j+1}/{len(chunks)}")
                        continue

                    result = await extract_with_gemini(chunk, file_path.name, slug, secrets)

                    # Write entities
                    for entity in result.get('entities', []):
                        # Auto-index chunk number into ID.
                        # NOTE(review): the counter only advances on a
                        # successful (non-duplicate) write, so a deduped
                        # entity lets the next one reuse the same ID suffix
                        # — confirm whether that collision is intended.
                        entity['id'] = f"dt_entity_{slug}_{j}_{file_entities}"
                        if write_entity(ef, entity, file_path.name, existing_hashes):
                            file_entities += 1
                            stats["entities_added"] += 1

                    # Write axioms (same counter caveat as entities above)
                    for axiom in result.get('axioms', []):
                        axiom['id'] = f"dt_axiom_{slug}_{j}_{file_axioms}"
                        if write_axiom(af, axiom, file_path.name, existing_hashes):
                            file_axioms += 1
                            stats["axioms_added"] += 1

                    # Collect next actions for the report
                    stats["next_actions"].extend(result.get('next_actions', []))

                    # Small delay to avoid rate limiting
                    await asyncio.sleep(0.5)

                print(f"  ✅ +{file_entities} entities, +{file_axioms} axioms")
                stats["files_processed"] += 1

            except Exception as e:
                # A failing file is recorded and the loop moves on
                print(f"  ❌ Error: {e}")
                stats["errors"].append(f"{file_path.name}: {str(e)}")

    # Write report (chr(10) is '\n' — f-string expressions could not contain
    # backslashes before Python 3.12)
    report = f"""# Deep Think KG Ingestion Report
**Date**: {datetime.now().strftime("%Y-%m-%d %H:%M")}
**Mode**: {"DRY RUN" if args.dry_run else "LIVE"}

## Summary
- Files processed: {stats["files_processed"]}
- Files skipped: {stats["files_skipped"]}
- Entities added: {stats["entities_added"]}
- Axioms added: {stats["axioms_added"]}
- Errors: {len(stats["errors"])}

## Next Actions Extracted
{chr(10).join(f"- {a}" for a in stats["next_actions"][:20])}

## Errors
{chr(10).join(f"- {e}" for e in stats["errors"]) or "None"}

## Output Files
- Entities: {ENTITIES_OUT}
- Axioms: {AXIOMS_OUT}
"""
    REPORT_OUT.write_text(report)

    print(f"\n{'='*50}")
    print(f"✅ INGESTION COMPLETE")
    print(f"   Files: {stats['files_processed']} processed, {stats['files_skipped']} skipped")
    print(f"   Entities: +{stats['entities_added']} added")
    print(f"   Axioms: +{stats['axioms_added']} added")
    print(f"   Report: {REPORT_OUT}")

    # Suggest the downstream step only when a live run actually added records
    if not args.dry_run and stats["entities_added"] + stats["axioms_added"] > 0:
        print(f"\n🚀 NEXT STEP: Run RLM Bloodstream on new axioms:")
        print(f"   python3 core/rlm_bloodstream_pipeline.py --input {AXIOMS_OUT}")

def main():
    """CLI entry point: dispatch to verify, query, or the async ingest run."""
    parser = argparse.ArgumentParser(description="Deep Think Results → Knowledge Graph Ingestion")
    parser.add_argument('--verify', action='store_true', help='Show KG statistics')
    parser.add_argument('--query', type=str, help='Semantic search query')
    parser.add_argument('--dry-run', action='store_true', help='Preview without writing')
    parser.add_argument('--file', type=str, help='Process only files matching this name')
    parser.add_argument('--batch-size', type=int, default=5, help='Batch size for processing')
    parser.add_argument('--vectorise', action='store_true', help='Also push to Qdrant after ingest')
    args = parser.parse_args()

    # Read-only modes short-circuit before the ingest banner.
    if args.verify:
        verify_kg()
    elif args.query:
        semantic_query(args.query, load_secrets())
    else:
        print("🧠 Deep Think → Knowledge Graph Ingestion Pipeline")
        print(f"   Source: {DT_DIR}")
        print(f"   Output: {KG_DIR}")
        print(f"   Mode: {'DRY RUN' if args.dry_run else 'LIVE WRITE'}")
        print()

        asyncio.run(run_ingest(args))


if __name__ == "__main__":
    main()
