#!/usr/bin/env python3
"""
Genesis Deep Think Knowledge Extractor
========================================
Fires MiniMax M2.5 agents via OpenRouter to extract structured knowledge
from Deep Think result markdown files into the RLM Bloodstream.

Extracts 8 knowledge types:
  ARCHITECTURAL_DECISION, PRODUCT_SPEC, STRATEGIC_SEQUENCE,
  COMPETITIVE_MOAT, UNIT_ECONOMICS, PSYCHOLOGICAL_TACTIC,
  INTEGRATION_POINT, IMPLEMENTATION_BLUEPRINT

Usage:
    python3 dt_knowledge_extractor.py                     # Process all DT files
    python3 dt_knowledge_extractor.py --dry-run            # Show what would be processed
    python3 dt_knowledge_extractor.py --max-concurrent 5   # Limit concurrency
    python3 dt_knowledge_extractor.py --file DT3           # Process specific file (partial match)
"""

import json
import os
import sys
import asyncio
import aiohttp
import argparse
import time
import re
from pathlib import Path
from datetime import datetime

# ── Paths ────────────────────────────────────────────────────────────────────
# Absolute WSL-mount paths: input dir of Deep Think markdown reports and
# output dir for extracted JSONL knowledge. Machine-specific —
# TODO(review): consider making these overridable via env vars or CLI flags.
DT_DIR = "/mnt/e/genesis-system/deep_think_results"
OUTPUT_DIR = "/mnt/e/genesis-system/data/dt_knowledge"
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

# ── Model config ─────────────────────────────────────────────────────────────
MODEL = "minimax/minimax-m2.5"
# SECURITY: hard-coded API credential committed in source. This key should be
# revoked and supplied exclusively via the OPENROUTER_API_KEY environment
# variable (get_api_key() already prefers the env var over this fallback).
OPENROUTER_API_KEY = "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"

MAX_CONCURRENT = 10   # default cap on simultaneous in-flight API calls
MAX_RETRIES = 3       # attempts per chunk before giving up
RETRY_DELAY = 3       # base delay in seconds (doubled per attempt on HTTP 429)

# ── Extraction prompt ────────────────────────────────────────────────────────
# Prepended verbatim to every document chunk; instructs the model to emit
# JSONL (one knowledge item per line) across the 8 knowledge types.
EXTRACTION_PROMPT = """You are a STRUCTURED KNOWLEDGE EXTRACTOR for the Genesis AI system. You are processing a Deep Think analysis report — a strategic/technical deep-dive produced by Gemini for the Genesis team.

Your job is to extract ACTIONABLE STRUCTURED KNOWLEDGE from this document. For each distinct piece of knowledge, output a single JSON object on its own line (JSONL format).

EXTRACT THESE 8 TYPES:

1. **ARCHITECTURAL_DECISION** — A specific technical architecture choice with clear rationale.
   Fields: type, title, content (the decision + rationale in 2-5 sentences), tags, confidence

2. **PRODUCT_SPEC** — A product definition, feature set, pricing, tier structure, or economic model.
   Fields: type, title, content (the spec in detail), tags, confidence

3. **STRATEGIC_SEQUENCE** — An ordered sequence of steps/phases/dominos with dependencies. The ORDER matters.
   Fields: type, title, content (the sequence described), steps (array of step strings), tags, confidence

4. **COMPETITIVE_MOAT** — A defensibility mechanism, unique advantage, or competitive barrier.
   Fields: type, title, content (what the moat is and why it works), tags, confidence

5. **UNIT_ECONOMICS** — A cost, price, margin, or financial calculation with numbers.
   Fields: type, title, content (the economics with specific numbers), tags, confidence

6. **PSYCHOLOGICAL_TACTIC** — A behavioral economics insight, conversion technique, or persuasion pattern.
   Fields: type, title, content (the tactic and why it works), tags, confidence

7. **INTEGRATION_POINT** — How two or more products/systems/services connect to each other.
   Fields: type, title, content (what connects, how, and why), tags, confidence

8. **IMPLEMENTATION_BLUEPRINT** — A specific how-to architecture, code pattern, or deployment recipe.
   Fields: type, title, content (the blueprint in detail), tags, confidence

OUTPUT FORMAT (one JSON per line, JSONL — NO markdown fences, NO commentary):
{"type": "PRODUCT_SPEC", "title": "Talking Widget 3-tier pricing", "content": "The Talking Widget uses 3 tiers: Starter at $197/mo...", "tags": ["pricing", "widget", "revenue"], "confidence": 0.95}
{"type": "UNIT_ECONOMICS", "title": "Widget cost-per-conversation", "content": "Each conversation costs approximately $0.087 via Telnyx...", "tags": ["economics", "telnyx", "margin"], "confidence": 0.9}

RULES:
- Extract EVERYTHING of value. Over-extract rather than miss knowledge.
- Preserve SPECIFIC NUMBERS — prices, costs, margins, percentages, timelines.
- Preserve SPECIFIC NAMES — product names, service names, API names, tool names.
- Tags should include relevant domains: revenue, memory, architecture, product, marketing, voice, rlm, aiva, deployment, strategy, pricing, widget, agency, sunaiva, receptionist, browser, mcp, session, security, scaling
- Confidence: 0.9+ for explicit specs/decisions, 0.7-0.9 for derived insights, 0.5-0.7 for speculative items.
- Output ONLY valid JSONL lines. No markdown, no commentary, no wrapping, no ```json blocks.

DOCUMENT TO ANALYZE:
"""


def get_api_key():
    """Return the OpenRouter API key, preferring the environment variable.

    Falls back to the module-level OPENROUTER_API_KEY constant when the
    OPENROUTER_API_KEY env var is unset or empty.
    """
    return os.environ.get('OPENROUTER_API_KEY') or OPENROUTER_API_KEY


def find_dt_files(dt_dir: str, file_filter: str = None) -> list[str]:
    """Find all Deep Think markdown files, excluding index/summary files."""
    # Meta-documents that describe the corpus rather than being DT reports.
    skip_names = ("MASTER_INDEX.md",)
    needle = file_filter.lower() if file_filter else None

    matches = []
    for candidate in sorted(Path(dt_dir).glob("*.md")):
        if candidate.name in skip_names:
            continue
        # Optional case-insensitive substring filter (e.g. "DT3").
        if needle is not None and needle not in candidate.name.lower():
            continue
        matches.append(str(candidate))
    return matches


def chunk_content(content: str, max_chars: int = 28000) -> list[str]:
    """Split large documents into chunks that fit within model context.

    Packs whole lines greedily up to ~max_chars per chunk (a single line
    longer than max_chars still becomes its own oversized chunk).
    MiniMax M2.5 handles large context but we chunk at ~28K chars to leave
    room for the prompt and ensure quality extraction.
    """
    if len(content) <= max_chars:
        return [content]

    pieces: list[str] = []
    buffer: list[str] = []
    used = 0

    for row in content.split('\n'):
        cost = len(row) + 1  # +1 accounts for the newline when re-joined
        # Flush the buffer before this line would overflow the budget.
        if buffer and used + cost > max_chars:
            pieces.append('\n'.join(buffer))
            buffer, used = [], 0
        buffer.append(row)
        used += cost

    if buffer:
        pieces.append('\n'.join(buffer))

    return pieces


async def extract_from_chunk(
    session: aiohttp.ClientSession,
    dt_name: str,
    chunk_idx: int,
    chunk_text: str,
    api_key: str,
    semaphore: asyncio.Semaphore,
    stats: dict,
) -> list[dict]:
    """Extract knowledge from a single chunk via MiniMax M2.5.

    Posts EXTRACTION_PROMPT + chunk_text to OpenRouter, parses the model's
    JSONL reply into dicts, and enriches each item with source metadata
    (source, source_chunk, extracted_at) plus defaults for missing
    confidence/tags fields.

    Retry policy (MAX_RETRIES attempts total):
      * HTTP 429 — exponential backoff (RETRY_DELAY * 2**attempt)
      * other HTTP errors, timeouts, exceptions — flat RETRY_DELAY

    Mutates ``stats`` in place: input_tokens, output_tokens, items,
    rate_limits, errors.  Returns [] when all attempts fail.
    """
    async with semaphore:  # bound concurrent in-flight API calls
        prompt = EXTRACTION_PROMPT + chunk_text

        payload = {
            "model": MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 8192,
            "temperature": 0.2,
        }

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            # OpenRouter attribution headers (optional; used for app rankings)
            "HTTP-Referer": "https://genesis-system.dev",
            "X-Title": "Genesis DT Knowledge Extractor",
        }

        for attempt in range(MAX_RETRIES):
            try:
                async with session.post(
                    OPENROUTER_URL,
                    json=payload,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=180),  # 3 min/call cap
                ) as resp:
                    if resp.status == 200:
                        data = await resp.json()
                        # Defensive chain: empty choices / missing message
                        # degrade to "" instead of raising.
                        content = (
                            data.get("choices", [{}])[0]
                            .get("message", {})
                            .get("content", "")
                        )

                        # Parse JSONL output — tolerate stray ``` fences and
                        # malformed lines by skipping them silently.
                        items = []
                        for line in content.strip().split('\n'):
                            line = line.strip()
                            if not line or line.startswith('```'):
                                continue
                            try:
                                item = json.loads(line)
                                # Enrich with source metadata
                                item["source"] = dt_name
                                item["source_chunk"] = chunk_idx
                                item["extracted_at"] = datetime.now().isoformat()
                                item.setdefault("confidence", 0.8)
                                item.setdefault("tags", [])
                                items.append(item)
                            except json.JSONDecodeError:
                                continue

                        # Track usage for the final cost estimate/report
                        usage = data.get("usage", {})
                        stats["input_tokens"] += usage.get("prompt_tokens", 0)
                        stats["output_tokens"] += usage.get("completion_tokens", 0)
                        stats["items"] += len(items)

                        return items

                    elif resp.status == 429:
                        # Rate limited: exponential backoff; counted
                        # separately from hard errors.
                        wait = RETRY_DELAY * (2 ** attempt)
                        stats["rate_limits"] += 1
                        print(f"    Rate limited on {dt_name} chunk {chunk_idx}, waiting {wait}s...")
                        await asyncio.sleep(wait)
                        continue

                    else:
                        error_text = await resp.text()
                        if attempt < MAX_RETRIES - 1:
                            await asyncio.sleep(RETRY_DELAY)
                            continue
                        # Final attempt failed — record and abandon chunk.
                        stats["errors"] += 1
                        print(f"    ERROR on {dt_name} chunk {chunk_idx}: HTTP {resp.status} — {error_text[:150]}")
                        return []

            except asyncio.TimeoutError:
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                    continue
                stats["errors"] += 1
                print(f"    TIMEOUT on {dt_name} chunk {chunk_idx}")
                return []

            except Exception as e:
                # Broad catch for network/parse failures; retried like other
                # errors, then recorded on the last attempt.
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                    continue
                stats["errors"] += 1
                print(f"    EXCEPTION on {dt_name} chunk {chunk_idx}: {e}")
                return []

        # Reached only if every attempt ended in a 429-backoff `continue`.
        return []


async def process_dt_file(
    session: aiohttp.ClientSession,
    file_path: str,
    api_key: str,
    semaphore: asyncio.Semaphore,
    stats: dict,
) -> list[dict]:
    """Process a single Deep Think file — may spawn multiple chunk tasks."""
    source = Path(file_path)
    dt_name = source.stem  # e.g. "DT3_radar_audit_engine"
    content = source.read_text(encoding='utf-8')

    # Ignore near-empty files; they carry no extractable knowledge.
    if len(content.strip()) < 200:
        stats["skipped"] += 1
        print(f"  SKIP {dt_name} (too short: {len(content)} chars)")
        return []

    chunks = chunk_content(content)
    print(f"  Processing {dt_name} ({len(content):,} chars, {len(chunks)} chunk(s))")

    # One extraction task per chunk; the shared semaphore caps concurrency.
    chunk_results = await asyncio.gather(*(
        extract_from_chunk(session, dt_name, idx, text, api_key, semaphore, stats)
        for idx, text in enumerate(chunks)
    ))
    all_items = [item for batch in chunk_results for item in batch]

    # Persist this file's knowledge as JSONL (one item per line).
    if all_items:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        out_path = os.path.join(OUTPUT_DIR, f"{dt_name}_knowledge.jsonl")
        with open(out_path, 'w', encoding='utf-8') as f:
            f.writelines(
                json.dumps(item, ensure_ascii=False) + '\n' for item in all_items
            )
        print(f"    -> {len(all_items)} items saved to {out_path}")

    stats["processed"] += 1
    return all_items

async def run_extraction(
    files: list[str],
    api_key: str,
    max_concurrent: int,
    dry_run: bool = False,
):
    """Run the extraction swarm across all DT files.

    Prints a run banner, then either lists files (dry run) or processes
    them and writes per-file JSONL outputs, a combined
    ALL_DT_KNOWLEDGE.jsonl, and an EXTRACTION_REPORT.json into OUTPUT_DIR.

    NOTE(review): files are awaited one at a time in the loop below, so
    concurrency (bounded by `max_concurrent`) only applies to chunks
    *within* a file — confirm this is intentional before "swarming" more.
    """
    print(f"\n{'=' * 70}")
    print(f"  GENESIS DEEP THINK KNOWLEDGE EXTRACTOR")
    print(f"{'=' * 70}")
    print(f"  Files to process : {len(files)}")
    print(f"  Model            : {MODEL}")
    print(f"  Max concurrent   : {max_concurrent}")
    print(f"  Output           : {OUTPUT_DIR}")
    print(f"{'=' * 70}\n")

    # Dry run: list candidate files with sizes, make no API calls.
    if dry_run:
        print("DRY RUN — no API calls will be made.\n")
        for f in files:
            name = Path(f).stem
            size = os.path.getsize(f)
            print(f"  Would process: {name} ({size:,} bytes)")
        return

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Shared mutable counters, updated in place by the chunk workers.
    stats = {
        "processed": 0,
        "skipped": 0,
        "errors": 0,
        "rate_limits": 0,
        "items": 0,
        "input_tokens": 0,
        "output_tokens": 0,
        "start_time": time.time(),
    }

    semaphore = asyncio.Semaphore(max_concurrent)
    all_items = []

    # One ClientSession reused for every request (connection pooling).
    async with aiohttp.ClientSession() as session:
        for file_path in files:
            items = await process_dt_file(session, file_path, api_key, semaphore, stats)
            all_items.extend(items)

    elapsed = time.time() - stats["start_time"]
    total_tokens = stats["input_tokens"] + stats["output_tokens"]
    # MiniMax M2.5 pricing: ~$0.30/MTok input, ~$0.30/MTok output (approximate)
    # Single blended rate applied to the combined token count.
    estimated_cost = total_tokens * 0.3 / 1_000_000

    # Save combined output (all files' items in one JSONL)
    if all_items:
        combined_path = os.path.join(OUTPUT_DIR, "ALL_DT_KNOWLEDGE.jsonl")
        with open(combined_path, 'w', encoding='utf-8') as f:
            for item in all_items:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

    # Type distribution — counts per knowledge `type` for the summary
    type_counts = {}
    for item in all_items:
        t = item.get("type", "UNKNOWN")
        type_counts[t] = type_counts.get(t, 0) + 1

    print(f"\n{'=' * 70}")
    print(f"  EXTRACTION COMPLETE")
    print(f"{'=' * 70}")
    print(f"  Files processed    : {stats['processed']}")
    print(f"  Files skipped      : {stats['skipped']}")
    print(f"  Errors             : {stats['errors']}")
    print(f"  Rate limits hit    : {stats['rate_limits']}")
    print(f"  Knowledge items    : {stats['items']}")
    print(f"  Total tokens       : {total_tokens:,}")
    print(f"  Estimated cost     : ${estimated_cost:.4f}")
    print(f"  Time               : {elapsed:.1f}s ({elapsed / 60:.1f} min)")
    print(f"\n  Type Distribution:")
    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"    {t:30s} : {count}")
    print(f"\n  Output: {OUTPUT_DIR}")
    if all_items:
        print(f"  Combined: {OUTPUT_DIR}/ALL_DT_KNOWLEDGE.jsonl")
    print(f"{'=' * 70}")

    # Save run report (machine-readable summary of this run)
    report = {
        "timestamp": datetime.now().isoformat(),
        "model": MODEL,
        "files_total": len(files),
        "stats": stats,
        "type_counts": type_counts,
        "elapsed_seconds": elapsed,
        "estimated_cost": f"${estimated_cost:.4f}",
        "total_tokens": total_tokens,
    }
    report_path = os.path.join(OUTPUT_DIR, "EXTRACTION_REPORT.json")
    with open(report_path, 'w', encoding='utf-8') as f:
        # default=str is a safety net for any non-JSON-serializable value
        json.dump(report, f, indent=2, default=str)
    print(f"  Report: {report_path}")

def main():
    """CLI entry point — parse arguments, resolve DT files, run extraction."""
    parser = argparse.ArgumentParser(
        description="Genesis Deep Think Knowledge Extractor"
    )
    # Argument table: (flags, keyword options), registered in order.
    arg_specs = (
        (("--dry-run",),
         {"action": "store_true", "help": "Show what would be processed"}),
        (("--max-concurrent",),
         {"type": int,
          "default": MAX_CONCURRENT,
          "help": f"Max concurrent API calls (default: {MAX_CONCURRENT})"}),
        (("--file",),
         {"help": "Process specific DT file (partial name match, e.g. 'DT3')"}),
    )
    for flags, options in arg_specs:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()

    # A key is only required for real runs; dry runs make no API calls.
    api_key = get_api_key()
    if not (api_key or args.dry_run):
        print("ERROR: No OpenRouter API key found.")
        print("Set OPENROUTER_API_KEY env var.")
        sys.exit(1)

    files = find_dt_files(DT_DIR, args.file)
    if not files:
        print(f"No Deep Think .md files found in {DT_DIR}")
        if args.file:
            print(f"  (filter: '{args.file}')")
        sys.exit(1)

    asyncio.run(run_extraction(files, api_key, args.max_concurrent, args.dry_run))


if __name__ == "__main__":
    main()
