#!/usr/bin/env python3
"""
Mass DT Response Ingestion → RLM Knowledge Graph
Ingests the deep_think_results/ files listed in DT_METADATA into KNOWLEDGE_GRAPH/entities + axioms
This is the DT Intel dump Genesis has been missing.

Run: python3 scripts/ingest_all_dt_responses.py
"""

import os
import json
import hashlib
from datetime import datetime
from pathlib import Path

# Filesystem layout: every directory below is derived from BASE.
BASE = Path("E:/genesis-system")
DT_DIR = BASE.joinpath("deep_think_results")
ENTITIES_DIR = BASE.joinpath("KNOWLEDGE_GRAPH", "entities")
AXIOMS_DIR = BASE.joinpath("KNOWLEDGE_GRAPH", "axioms")
# Date stamp (YYYY_MM_DD), evaluated once at import time and embedded in
# the names of the output files.
OUTPUT_DATE = f"{datetime.now():%Y_%m_%d}"

# File names that were already ingested and must not be duplicated.
ALREADY_INGESTED = {
    "REAL_DT_GENESIS_STRATEGY.md",
    "REAL_DEEP_THINK_TEST_001.md",
}

# DT file name → hand-written metadata (domain, key_insight, actionability, tags).
# Only files listed here are ingested.
DT_METADATA = {
    "DT1_revenue_pathway_analysis.md": {
        "domain": "revenue_strategy",
        "key_insight": "Single product focus — Talking Website Widget via agency channel = fastest $1K MRR path",
        "actionability": "HIGH",
        "tags": ["revenue", "agency_channel", "talking_widget", "product_focus"]
    },
    "DT2_autonomous_revenue_engine.md": {
        "domain": "autonomous_acquisition",
        "key_insight": "Autonomous acquisition loop: Opus→OpenClaw scrape→RLM analyse→OpenClaw WhatsApp+email→Voice→Stripe. Zero human involvement.",
        "actionability": "HIGH",
        "tags": ["autonomous_sales", "openclaw", "acquisition_loop", "revenue_engine"]
    },
    "DT3_radar_audit_engine.md": {
        "domain": "lead_intelligence",
        "key_insight": "P4 Radar: real-time digital health scoring for lead targets. Tech stack, PageSpeed, SEO, social signals.",
        "actionability": "HIGH",
        "tags": ["p4_radar", "lead_scoring", "audit_engine", "competitive_intelligence"]
    },
    "DT4_browser_agents_marketplace.md": {
        "domain": "browser_automation",
        "key_insight": "Vision-first browser interaction = fleet immunity. Agent A's UI fix propagates via Redis Pub/Sub to all 500 agents instantly.",
        "actionability": "HIGH",
        "tags": ["browser_agents", "vision_first", "fleet_immunity", "redis_propagation"]
    },
    "DT5_memory_mcp_lockin.md": {
        "domain": "memory_strategy",
        "key_insight": "Memory MCP lock-in: switching cost exceeds ACV at month 9-12. CLV = 5-7x at 13-24 months. Digital Lobotomy effect.",
        "actionability": "HIGH",
        "tags": ["memory_moat", "lock_in", "clv", "competitive_moat"]
    },
    "DT5_openclaw_permission_paradox.md": {
        "domain": "openclaw_security",
        "key_insight": "OpenClaw Lethal Trifecta: expansive access + external communication + prompt injection vulnerability. Genesis Patent Portfolio = only solution.",
        "actionability": "HIGH",
        "tags": ["openclaw", "security", "prompt_injection", "patent_portfolio", "secureclaw"]
    },
    "DT6_agent_marketplace.md": {
        "domain": "agent_marketplace",
        "key_insight": "L1-L4 agent tiers as product: L1 Basic $497-697, L2 Digital Employee $497-697, L3 Domain Expert $997-1497, L4 Executive Enterprise",
        "actionability": "MEDIUM",
        "tags": ["agent_marketplace", "l1_l4_tiers", "product_tiers", "pricing"]
    },
    "DT7_SESSION_CONTINUITY_TAPESTRY_SUMMARY.md": {
        "domain": "session_continuity",
        "key_insight": "Session DNA YAML (<1000 tokens): mission+agents+decisions+blockers+next_actions. Delta inheritance = only mutations from N-1 (~500 bytes).",
        "actionability": "HIGH",
        "tags": ["session_dna", "context_continuity", "yaml_encoding", "delta_inheritance"]
    },
    "DT8_neurological_session_continuity.md": {
        "domain": "memory_architecture",
        "key_insight": "Neurological model: episodic+semantic+working memory tiers. Surprise-gated storage (score > 0.7 = EPISODIC, 0.3-0.7 = WORKING, <0.3 = DISCARD).",
        "actionability": "HIGH",
        "tags": ["memory_tiers", "surprise_engine", "episodic_memory", "working_memory"]
    },
    "DT9_distributed_systems_continuity.md": {
        "domain": "distributed_memory",
        "key_insight": "Distributed memory across 5 backends: PostgreSQL (episodic), Qdrant (semantic), Redis (working), FalkorDB (graph), file KG (JSONL).",
        "actionability": "HIGH",
        "tags": ["distributed_memory", "postgresql", "qdrant", "redis", "falkordb"]
    },
    "DT10_genetic_session_dna.md": {
        "domain": "session_continuity",
        "key_insight": "Genetic session DNA: epigenetic context tagging, gene activation based on mission. Context pointers avoid full re-read on respawn.",
        "actionability": "HIGH",
        "tags": ["genetic_dna", "epigenetic_context", "session_respawn", "context_pointers"]
    },
    "DT17_audio_device_diagnosis_engine.md": {
        "domain": "voice_infrastructure",
        "key_insight": "Audio device diagnosis engine: full stack from OS audio drivers → WebRTC → Telnyx WebSocket → STT pipeline. 15-point diagnostic tree.",
        "actionability": "MEDIUM",
        "tags": ["audio_diagnosis", "webrtc", "telnyx", "voice_pipeline", "stt"]
    },
    "DT18_Gemini3_Flash_Synergy_RESPONSE.md": {
        "domain": "voice_ai_architecture",
        "key_insight": "Gemini 3 Flash Live WebSocket API = native audio-to-audio (300-400ms). Replaces STT→LLM→TTS cascade (800ms). Single highest-leverage voice integration.",
        "actionability": "HIGH",
        "tags": ["gemini_flash", "audio_to_audio", "latency", "voice_quality", "websocket"]
    },
    "DT19_Gemini3_Self_Integration_RESPONSE.md": {
        "domain": "orchestration_architecture",
        "key_insight": "Fractal Swarm: Claude Opus = CEO, Gemini 3 Flash = Motor Cortex, Kimi/MiniMax = Data Grinders, Gemini 3 Pro = Supreme Court. NEVER Deep Think on voice calls.",
        "actionability": "HIGH",
        "tags": ["fractal_swarm", "model_routing", "compute_tiers", "orchestration", "deep_think"]
    },
    "DT19b_RLM_Titan_Epoch3_RESPONSE.md": {
        "domain": "rlm_architecture",
        "key_insight": "RLMs = O(n) vs O(n^2) standard attention. Data mounted as Python REPL variable, Root (Gemini 3 Pro) writes Map-Reduce, Flash workers process in parallel.",
        "actionability": "HIGH",
        "tags": ["rlm", "map_reduce", "infinite_context", "gemini_flash", "python_repl"]
    },
    "DT20_Patent_Titan_IP_Analysis_RESPONSE.md": {
        "domain": "ip_strategy",
        "key_insight": "9-patent AI security portfolio. SecureClaw = only mathematical solution to OpenClaw's Fortune 500 security crisis. Enterprise pitch to State Farm/Zepto.",
        "actionability": "MEDIUM",
        "tags": ["patent_portfolio", "secureclaw", "enterprise_security", "ip_moat"]
    },
    "DT21_OpenClaw_Integration_Strategy_RESPONSE.md": {
        "domain": "openclaw_integration",
        "key_insight": "OpenClaw = Motor Cortex. NEVER reason with it. 3 roles: CDP Actuator, WhatsApp/Telegram gateway, Enterprise Security Arbitrage (SecureClaw). MCP bridge required.",
        "actionability": "HIGH",
        "tags": ["openclaw", "motor_cortex", "cdp", "whatsapp", "secureclaw", "mcp_bridge"]
    },
    "DT22_OpenClaw_RLM_Swarm_Topology_RESPONSE.md": {
        "domain": "openclaw_rlm",
        "key_insight": "4-Phase Engine: OpenClaw IN (ingest) → Patent Airgap → RLM Swarm THINK → OpenClaw OUT (execute). Blind actuator + Paralyzed brain = symbiotic power.",
        "actionability": "HIGH",
        "tags": ["openclaw", "rlm_swarm", "4_phase_engine", "titan_enclave", "patent_firewall"]
    },
    "DT23_Sovereign_Revenue_Organism_RESPONSE.md": {
        "domain": "enterprise_revenue",
        "key_insight": "SecureClaw Enterprise: 21 clients × $4,997/mo = $104,937 MRR at 94% gross margin. COGS only $300/client. JITSG self-proliferating software factory.",
        "actionability": "HIGH",
        "tags": ["enterprise_revenue", "secureclaw", "jitsg", "mrr", "sovereign_revenue"]
    },
    "DT25_RESPONSE_WEBSITE_POACH_DOMINATOR.md": {
        "domain": "gtm_strategy",
        "key_insight": "Website Poach: 3,886 tradie leads with LocalSearch sites (PageSpeed <40). Clone via 10Web, inject Widget, price-match invoice. 5 clients = $2,495 MRR = default alive.",
        "actionability": "HIGH",
        "tags": ["website_poach", "tradie_leads", "10web", "gtm", "default_alive"]
    },
    "DT25v2_RESPONSE_COMPOSABLE_PRICING_DOMINATOR.md": {
        "domain": "pricing_strategy",
        "key_insight": "Composable pricing: $497/$997/$1,497 AUD/mo. 6 modular upsells. Bundle discounts: 1=0%, 2=15%, 3=25%, 4+=35%. Target ARPC $550/mo. Path to $100K MRR = 182 clients.",
        "actionability": "HIGH",
        "tags": ["composable_pricing", "arpc", "upsells", "bundle_discounts", "mrr_path"]
    },
    "DT26_PHONE_NUMBER_STRATEGY_PROMPT.md": {
        "domain": "voice_infrastructure",
        "key_insight": "Phone number strategy: local area codes (07 for QLD/Cairns) critical for tradie trust. Telnyx provisioning for Australian numbers.",
        "actionability": "HIGH",
        "tags": ["phone_numbers", "area_codes", "telnyx", "australian_numbers", "tradie_trust"]
    },
    "STANDING_AGENT_SUMMARY.md": {
        "domain": "agent_architecture",
        "key_insight": "5 standing agent prompts: Mazza persona load, lead enrichment fan-out, call attribution moat, automated Deep Think scheduling, upsell pain-event triggers.",
        "actionability": "HIGH",
        "tags": ["standing_agents", "mazza_persona", "lead_enrichment", "call_attribution", "deep_think_cron"]
    },
    "DT_VAST_DATA_AGENTIC_MEMORY_10_PROMPTS.md": {
        "domain": "memory_architecture",
        "key_insight": "10 production Deep Think prompts: TitanSurpriseEngine, 3-layer dedup, Ebbinghaus decay daemon, Redis Memory Bus, ContextSelector, KaaS architecture.",
        "actionability": "HIGH",
        "tags": ["surprise_engine", "deduplication", "memory_decay", "memory_bus", "kaas", "context_selector"]
    },
    "DT_session_continuity_tapestry.md": {
        "domain": "session_continuity",
        "key_insight": "Session Continuity Tapestry: full cross-session DNA architecture. Epigenetic context + delta inheritance + pointer-based context loading.",
        "actionability": "HIGH",
        "tags": ["session_tapestry", "epigenetic", "delta_inheritance", "context_dna"]
    },
}

def generate_entity_id(filename, index):
    """Build a reproducible entity ID from a file name and a position.

    The ID has the form ``dt_bulk_<hash>_<index>``, where ``<hash>`` is the
    first six hex digits of the MD5 of ``filename`` and ``<index>`` is
    zero-padded to at least three digits.
    """
    full_digest = hashlib.md5(filename.encode()).hexdigest()
    short_hash = full_digest[:6]
    return f"dt_bulk_{short_hash}_{index:03d}"

def ingest_dt_file(filepath, metadata, entity_index):
    """Convert a DT file into KG entity + axiom entries.

    Args:
        filepath: ``pathlib.Path`` of the Deep Think result file.
        metadata: Mapping with the keys ``domain``, ``key_insight``,
            ``actionability`` and ``tags`` (one value of ``DT_METADATA``).
        entity_index: Integer used as the numeric suffix of the entity ID.

    Returns:
        An ``(entity, axiom)`` tuple of JSON-serialisable dicts, or
        ``(None, None)`` when the file name is listed in
        ``ALREADY_INGESTED``.

    Raises:
        KeyError: If ``metadata`` lacks one of the required keys.
    """
    filename = filepath.name

    # Skip already ingested
    if filename in ALREADY_INGESTED:
        return None, None

    # Use the start of the file as a single-line excerpt. Decoding errors
    # are ignored by read_text, so only filesystem failures can occur here;
    # in that case fall back to the curated insight instead of aborting.
    try:
        content = filepath.read_text(encoding='utf-8', errors='ignore')
    except OSError:
        excerpt = metadata["key_insight"]
    else:
        excerpt = content[:500].replace('\n', ' ').replace('"', "'").strip()

    entity_id = generate_entity_id(filename, entity_index)
    source = f"deep_think_results/{filename}"
    # One timestamp for both records so the entity and its axiom agree.
    ingested_at = datetime.now().isoformat()

    entity = {
        "id": entity_id,
        "name": filename.replace('.md', '').replace('_', ' '),
        "type": "deep_think_response",
        "domain": metadata["domain"],
        "source": source,
        "key_insight": metadata["key_insight"],
        "actionability": metadata["actionability"],
        "tags": metadata["tags"],
        "excerpt": excerpt[:300],
        "date_ingested": ingested_at,
        "source_date": "2026-02-16/2026-02-20",
        "model": "gemini_ultra_thinking",
        "confidence": 0.95
    }

    axiom = {
        "id": f"axiom_{entity_id}",
        "entity_ref": entity_id,
        "domain": metadata["domain"],
        "statement": metadata["key_insight"],
        "confidence": 0.95,
        "source": source,
        "tags": metadata["tags"],
        "date": ingested_at,
        # Only HIGH-actionability insights are flagged as actionable.
        "actionable": metadata["actionability"] == "HIGH"
    }

    return entity, axiom

def _write_jsonl(path, records):
    """Write *records* to *path* as JSON Lines, overwriting any existing file."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(record) + '\n' for record in records)


def main():
    """Ingest every file listed in ``DT_METADATA`` and write the KG files.

    For each mapped file that exists in ``DT_DIR`` an entity and an axiom
    record are built, then written to date-stamped JSONL files under
    ``ENTITIES_DIR`` and ``AXIOMS_DIR``. Entities tagged ``openclaw`` are
    additionally written to their own file. A summary is printed at the end.

    Returns:
        The number of files that were processed.
    """
    ENTITIES_DIR.mkdir(parents=True, exist_ok=True)
    AXIOMS_DIR.mkdir(parents=True, exist_ok=True)

    entities = []
    axioms = []
    skipped = []
    processed = []

    print(f"Scanning {DT_DIR}...")

    # The enumerate index is part of the entity ID, so it must follow the
    # position in DT_METADATA, not the number of files actually processed.
    for i, (filename, metadata) in enumerate(DT_METADATA.items()):
        filepath = DT_DIR / filename

        if not filepath.exists():
            print(f"  ⚠️  MISSING: {filename}")
            skipped.append(filename)
            continue

        entity, axiom = ingest_dt_file(filepath, metadata, i)

        if entity is None:
            print(f"  ⏭️  SKIP (already ingested): {filename}")
            skipped.append(filename)
            continue

        entities.append(entity)
        axioms.append(axiom)
        processed.append(filename)
        print(f"  ✅ {filename} → {entity['id']} [{metadata['domain']}]")

    entity_file = ENTITIES_DIR / f"dt_bulk_ingestion_{OUTPUT_DATE}.jsonl"
    axiom_file = AXIOMS_DIR / f"dt_bulk_axioms_{OUTPUT_DATE}.jsonl"

    # Opening with 'w' truncates, so only write when there is something to
    # write; otherwise a failed run would wipe today's earlier output.
    if entities:
        _write_jsonl(entity_file, entities)
        _write_jsonl(axiom_file, axioms)

        # OpenClaw-specific entities
        openclaw_entities = [e for e in entities if 'openclaw' in e.get('tags', [])]
        if openclaw_entities:
            openclaw_file = ENTITIES_DIR / f"openclaw_intelligence_{OUTPUT_DATE}.jsonl"
            _write_jsonl(openclaw_file, openclaw_entities)
            print(f"\n🔥 OpenClaw-specific entities: {openclaw_file}")
    else:
        print("\n⚠️  Nothing ingested; existing output files were left untouched.")

    print(f"\n{'='*60}")
    print("DT BULK INGESTION COMPLETE")
    print(f"{'='*60}")
    print(f"✅ Processed: {len(processed)} files")
    print(f"⏭️  Skipped:   {len(skipped)} files")
    if entities:
        print(f"📦 Entities:  {entity_file}")
        print(f"📦 Axioms:    {axiom_file}")
    print("\n📊 By domain:")
    domains = {}
    for e in entities:
        d = e['domain']
        domains[d] = domains.get(d, 0) + 1
    # Most frequent domain first; ties keep first-seen order (stable sort).
    for domain, count in sorted(domains.items(), key=lambda x: -x[1]):
        print(f"   {domain}: {count}")

    high_count = sum(1 for e in entities if e['actionability'] == 'HIGH')
    print(f"\n🔥 HIGH actionability: {high_count}")
    if entities:
        print("\nRLM workers will pick these up within 5 minutes (KGCrystallizerWorker).")

    return len(processed)

if __name__ == "__main__":
    # Exit status 0 when at least one file was processed, 1 otherwise.
    # SystemExit is raised directly because the bare ``exit`` helper is
    # installed by the ``site`` module and is missing under ``python -S``.
    count = main()
    raise SystemExit(0 if count > 0 else 1)
