#!/usr/bin/env python3
"""
Genesis Session Ingestion Swarm
================================
Fires MiniMax 2.5 agents via OpenRouter to extract reasoning chains,
strategic insights, and axioms from session transcript chunks.

Usage:
    python3 session_ingestion_swarm.py                    # Process all chunks
    python3 session_ingestion_swarm.py --dry-run          # Show what would be processed
    python3 session_ingestion_swarm.py --max-concurrent 50 # Limit concurrency
    python3 session_ingestion_swarm.py --session 4346fda6 # Process specific session
"""

import json
import os
import sys
import asyncio
import aiohttp
import argparse
import time
from pathlib import Path
from datetime import datetime

# Filesystem layout: chunked transcripts in, extracted axiom JSONL files out.
EXTRACTS_DIR = "/mnt/e/genesis-system/data/session_extracts"
OUTPUT_DIR = "/mnt/e/genesis-system/data/session_axioms"
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

# MiniMax 2.5 on OpenRouter
MODEL = "minimax/minimax-m2.5"

MAX_CONCURRENT = 30  # Conservative default, can push to 100+
MAX_RETRIES = 3  # HTTP attempts per chunk before giving up
RETRY_DELAY = 2  # Base back-off in seconds (scaled linearly on 429)

# Prompt prepended verbatim to each chunk's text (see process_chunk). The
# trailing "DIALOGUE TO ANALYZE:" line plus final newline matter: the chunk is
# concatenated directly after this string. Do not edit casually — the model's
# output is parsed as JSONL on the assumption it follows these instructions.
EXTRACTION_PROMPT = """You are a REASONING CHAIN EXTRACTOR for the Genesis AI system. You are processing a dialogue between Kinan (human strategist/founder) and Genesis (AI command centre).

Your job is to extract the REASONING TOPOLOGY — not just what was decided, but HOW insights connected, WHY certain conclusions emerged, and the STRUCTURE of understanding.

For each significant insight chain you find, output a JSON object (one per line, JSONL format).

EXTRACT THESE TYPES:

1. **STRATEGIC_INSIGHT** — A strategic decision or realization with its reasoning chain
2. **REASONING_CHAIN** — A multi-step reasoning sequence where A led to B led to C
3. **ARCHITECTURAL_DECISION** — A technical architecture choice with rationale
4. **PARADIGM_SHIFT** — A fundamental change in understanding or approach
5. **EMERGENT_CONNECTION** — Where two previously separate ideas suddenly connected
6. **DIRECTIVE** — A hardwired rule or mandate from Kinan
7. **PRODUCT_DECISION** — Pricing, feature, positioning, or go-to-market decision
8. **BLOCKER_IDENTIFIED** — A critical blocker or risk discovered
9. **METAPHOR** — A powerful metaphor that encapsulates a complex insight

OUTPUT FORMAT (one JSON per line, JSONL):
```
{"type": "STRATEGIC_INSIGHT", "title": "Short title", "insight": "The core insight in 1-2 sentences", "reasoning_chain": ["Step 1 that led to this", "Step 2 building on step 1", "Step 3 conclusion"], "confidence": 0.95, "tags": ["revenue", "strategy"], "source_context": "Brief quote or paraphrase from the dialogue that triggered this"}
```

RULES:
- Extract EVERYTHING of value. Better to over-extract than miss gold.
- Preserve the REASONING CHAIN — how did they get there? What connected?
- Include the emotional/intuitive dimension ("this feels right because...")
- Tag with relevant domains: revenue, memory, architecture, product, marketing, voice, rlm, aiva, deployment, strategy, patents, pricing
- Confidence: 0.9+ for explicit decisions, 0.7-0.9 for implied insights, 0.5-0.7 for tentative connections
- Output ONLY valid JSONL lines. No markdown, no commentary, no wrapping.

DIALOGUE TO ANALYZE:
"""


def get_openrouter_key():
    """Return the OpenRouter API key, or '' if none can be found.

    Lookup order:
      1. The ``OPENROUTER_API_KEY`` environment variable.
      2. Known credential files, scanning each line for either an
         ``OPENROUTER...=value`` assignment or a bare ``sk-or-...`` key.

    Returns:
        The key string, or '' when no source yields one.
    """
    # Environment variable wins.
    key = os.environ.get('OPENROUTER_API_KEY', '')
    if key:
        return key

    # Fallback: scan known credential files.
    cred_paths = [
        '/mnt/e/genesis-system/config/secrets.env',
        '/mnt/e/genesis-system/Credentials/Genesis Credentials additions.txt',
    ]

    for path in cred_paths:
        if not os.path.exists(path):
            continue
        try:
            # errors='replace' so a mis-encoded credentials file is still scannable.
            with open(path, 'r', encoding='utf-8', errors='replace') as f:
                lines = f.read().splitlines()
        except OSError:
            # Unreadable file (permissions, race with deletion) — try the next one.
            continue
        for line in lines:
            if 'OPENROUTER' in line.upper() and '=' in line:
                candidate = line.split('=', 1)[1].strip().strip('"').strip("'")
                # len > 20 guards against empty or placeholder values.
                if candidate and len(candidate) > 20:
                    return candidate
            # Also accept a bare key on its own line.
            if line.strip().startswith('sk-or-'):
                return line.strip()

    return ''


async def process_chunk(session: aiohttp.ClientSession, chunk_path: str,
                        output_dir: str, api_key: str, semaphore: asyncio.Semaphore,
                        stats: dict) -> dict:
    """Process a single chunk through MiniMax 2.5 and persist extracted axioms.

    Sends EXTRACTION_PROMPT + the chunk text to OpenRouter, parses the
    model's JSONL reply, and writes valid axiom objects to
    ``<output_dir>/<session_id>/<chunk>_axioms.jsonl``.

    Args:
        session: Shared aiohttp client session used by all workers.
        chunk_path: Path to the chunk .txt file to analyze.
        output_dir: Root directory for per-session axiom output.
        api_key: OpenRouter API key (sent as a Bearer token).
        semaphore: Concurrency limiter shared across the swarm.
        stats: Shared mutable counters dict; updated in place.

    Returns:
        A status dict, e.g. {'status': 'success', 'axioms': N, 'tokens': T},
        or {'status': 'skipped'|'error'|'timeout', ...} on failure.
    """
    async with semaphore:
        chunk_name = os.path.basename(chunk_path)
        # The session ID is the name of the chunk's parent directory.
        session_id = os.path.basename(os.path.dirname(chunk_path))

        try:
            with open(chunk_path, 'r', encoding='utf-8') as f:
                chunk_text = f.read()

            # Tiny chunks carry no extractable reasoning — skip without an API call.
            if len(chunk_text.strip()) < 100:
                stats['skipped'] += 1
                return {'status': 'skipped', 'reason': 'too_short'}

            prompt = EXTRACTION_PROMPT + chunk_text

            payload = {
                "model": MODEL,
                "messages": [
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": 4096,
                "temperature": 0.3,  # low temperature: extraction, not creativity
            }

            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
                "HTTP-Referer": "https://genesis-system.dev",
                "X-Title": "Genesis Session Ingestion"
            }

            for attempt in range(MAX_RETRIES):
                try:
                    async with session.post(OPENROUTER_URL, json=payload,
                                            headers=headers, timeout=aiohttp.ClientTimeout(total=120)) as resp:
                        if resp.status == 200:
                            data = await resp.json()
                            content = data.get('choices', [{}])[0].get('message', {}).get('content', '')

                            # Parse and validate JSONL output; markdown fences
                            # and unparseable lines are dropped silently.
                            axioms = []
                            for line in content.strip().split('\n'):
                                line = line.strip()
                                if not line or line.startswith('```'):
                                    continue
                                try:
                                    axiom = json.loads(line)
                                    # Stamp provenance so each axiom can be
                                    # traced back to its session and chunk.
                                    axiom['source_session'] = session_id
                                    axiom['source_chunk'] = chunk_name
                                    axiom['extracted_at'] = datetime.now().isoformat()
                                    axioms.append(axiom)
                                except json.JSONDecodeError:
                                    continue

                            # Persist one JSONL file per chunk under the session dir.
                            if axioms:
                                out_path = os.path.join(output_dir, session_id)
                                os.makedirs(out_path, exist_ok=True)
                                axiom_file = os.path.join(out_path, f'{chunk_name.replace(".txt", "")}_axioms.jsonl')
                                with open(axiom_file, 'w', encoding='utf-8') as f:
                                    for a in axioms:
                                        f.write(json.dumps(a) + '\n')

                            stats['processed'] += 1
                            stats['axioms'] += len(axioms)

                            usage = data.get('usage', {})
                            stats['input_tokens'] += usage.get('prompt_tokens', 0)
                            stats['output_tokens'] += usage.get('completion_tokens', 0)

                            return {
                                'status': 'success',
                                'axioms': len(axioms),
                                'tokens': usage.get('total_tokens', 0)
                            }

                        elif resp.status == 429:
                            # Rate limited: linear back-off, then retry.
                            wait = RETRY_DELAY * (attempt + 1)
                            stats['rate_limits'] += 1
                            await asyncio.sleep(wait)
                            continue

                        else:
                            error_text = await resp.text()
                            if attempt < MAX_RETRIES - 1:
                                await asyncio.sleep(RETRY_DELAY)
                                continue
                            stats['errors'] += 1
                            return {'status': 'error', 'code': resp.status, 'error': error_text[:200]}

                except asyncio.TimeoutError:
                    if attempt < MAX_RETRIES - 1:
                        await asyncio.sleep(RETRY_DELAY)
                        continue
                    stats['errors'] += 1
                    return {'status': 'timeout'}

                except Exception as e:
                    if attempt < MAX_RETRIES - 1:
                        await asyncio.sleep(RETRY_DELAY)
                        continue
                    stats['errors'] += 1
                    return {'status': 'error', 'error': str(e)[:200]}

            # Bug fix: if every attempt hit 429 (each iteration `continue`d),
            # the loop previously fell through and implicitly returned None
            # without touching the counters. Record it as an explicit error.
            stats['errors'] += 1
            return {'status': 'error', 'reason': 'retries_exhausted'}

        except Exception as e:
            # Covers file-read failures and anything else outside the retry loop.
            stats['errors'] += 1
            return {'status': 'error', 'error': str(e)[:200]}


async def run_swarm(chunks: list[str], api_key: str, max_concurrent: int, dry_run: bool = False):
    """Fan extraction out over every chunk and print a summary + JSON report.

    Args:
        chunks: Chunk file paths to process.
        api_key: OpenRouter API key passed to each worker.
        max_concurrent: Upper bound on in-flight requests.
        dry_run: When True, only list what would be processed — no API calls.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print("GENESIS SESSION INGESTION SWARM")
    print(banner)
    print(f"Chunks to process: {len(chunks)}")
    print(f"Model: {MODEL}")
    print(f"Max concurrent: {max_concurrent}")
    print(f"Output: {OUTPUT_DIR}")
    print(f"{banner}\n")

    if dry_run:
        print("DRY RUN — no API calls will be made.")
        remainder = len(chunks) - 10
        for path in chunks[:10]:
            print(f"  Would process: {path}")
        if remainder > 0:
            print(f"  ... and {remainder} more")
        return

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Shared counters, mutated in place by every worker.
    counters = {
        'processed': 0, 'skipped': 0, 'errors': 0, 'axioms': 0,
        'rate_limits': 0, 'input_tokens': 0, 'output_tokens': 0,
        'start_time': time.time()
    }

    gate = asyncio.Semaphore(max_concurrent)
    total = len(chunks)

    async with aiohttp.ClientSession() as http:
        workers = [
            process_chunk(http, path, OUTPUT_DIR, api_key, gate, counters)
            for path in chunks
        ]

        # Await in completion order, logging progress every 10 chunks.
        for done, fut in enumerate(asyncio.as_completed(workers), start=1):
            await fut
            if done % 10 == 0 or done == total:
                elapsed = time.time() - counters['start_time']
                rate = done / elapsed if elapsed > 0 else 0
                print(f"  [{done}/{total}] "
                      f"axioms={counters['axioms']} "
                      f"errors={counters['errors']} "
                      f"rate={rate:.1f}/s "
                      f"tokens={counters['input_tokens'] + counters['output_tokens']:,}")

    elapsed = time.time() - counters['start_time']
    total_tokens = counters['input_tokens'] + counters['output_tokens']
    estimated_cost = total_tokens * 0.3 / 1_000_000  # MiniMax pricing

    print(f"\n{banner}")
    print("SWARM COMPLETE")
    print(banner)
    print(f"Processed: {counters['processed']}")
    print(f"Skipped: {counters['skipped']}")
    print(f"Errors: {counters['errors']}")
    print(f"Rate limits hit: {counters['rate_limits']}")
    print(f"Total axioms extracted: {counters['axioms']}")
    print(f"Total tokens: {total_tokens:,}")
    print(f"Estimated cost: ${estimated_cost:.2f}")
    print(f"Time: {elapsed:.1f}s ({elapsed/60:.1f} min)")
    print(f"Output: {OUTPUT_DIR}")
    print(banner)

    # Persist a machine-readable run report next to the axiom output.
    report = {
        'timestamp': datetime.now().isoformat(),
        'model': MODEL,
        'chunks_total': total,
        **counters,
        'elapsed_seconds': elapsed,
        'estimated_cost': f"${estimated_cost:.2f}",
        'total_tokens': total_tokens
    }
    report_path = os.path.join(OUTPUT_DIR, 'INGESTION_REPORT.json')
    with open(report_path, 'w') as f:
        json.dump(report, f, indent=2, default=str)
    print(f"Report: {report_path}")


def find_all_chunks(extracts_dir: str, session_filter: str = None) -> list[str]:
    """Find all chunk files to process, sorted by session then chunk name.

    Args:
        extracts_dir: Root directory containing one subdirectory per session.
        session_filter: Optional substring; only sessions whose directory name
            contains it are included. None means all sessions.

    Returns:
        Sorted list of chunk file paths (as strings). Empty if the root
        directory does not exist.
    """
    root = Path(extracts_dir)
    if not root.is_dir():
        # Previously iterdir() raised an unhandled exception for a missing
        # root; returning [] lets the caller print its "no chunks" message.
        return []
    chunks: list[str] = []
    for session_dir in sorted(root.iterdir()):
        if not session_dir.is_dir():
            continue
        if session_filter and session_filter not in session_dir.name:
            continue
        chunks.extend(str(p) for p in sorted(session_dir.glob('chunk_*.txt')))
    return chunks


def main():
    """CLI entry point: parse args, resolve the API key, find chunks, run swarm."""
    global MODEL

    parser = argparse.ArgumentParser(description='Genesis Session Ingestion Swarm')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be processed')
    parser.add_argument('--max-concurrent', type=int, default=MAX_CONCURRENT)
    parser.add_argument('--session', help='Process specific session (UUID prefix)')
    parser.add_argument('--model', default=MODEL, help='Override model')
    args = parser.parse_args()

    # Bug fix: --model was parsed but never applied (the old code assigned an
    # unused local). process_chunk() and run_swarm() read the module-level
    # MODEL, so apply the override there.
    MODEL = args.model

    # Get API key (dry runs don't need one).
    api_key = get_openrouter_key()
    if not api_key and not args.dry_run:
        print("ERROR: No OpenRouter API key found.")
        print("Set OPENROUTER_API_KEY env var or add to config/secrets.env")
        sys.exit(1)

    # Find chunks
    chunks = find_all_chunks(EXTRACTS_DIR, args.session)
    if not chunks:
        print("No chunks found. Run session_transcript_extractor.py first.")
        sys.exit(1)

    # Run swarm
    asyncio.run(run_swarm(chunks, api_key, args.max_concurrent, args.dry_run))


# Script entry point: keeps the module importable without side effects.
if __name__ == '__main__':
    main()