#!/usr/bin/env python3
"""
Research Reports -> SubAIVA KnowledgeStore Ingestion
Reads ALL .md files from Research reports/ and ingests via swarm endpoint.
"""
import os
import json
import ssl
import sys
import urllib.request
import time

# Base URL of the SubAIVA Cloudflare worker exposing the swarm ingest/stats API.
SUBAIVA_URL = "https://subaiva.kinan-ae7.workers.dev"
# NOTE(review): bearer token hard-coded in source — consider loading from an
# environment variable before sharing/committing this script.
API_TOKEN = "dev-token-genesis-2026"
# Directory scanned recursively for .md research reports.
BASE_DIR = "/mnt/e/genesis-system/Research reports"
# Number of documents sent per ingest request.
BATCH_SIZE = 12
MAX_CONTENT_CHARS = 50000  # generous limit per doc

def find_md_files(base):
    """Walk *base* recursively and return a sorted list of .md file paths."""
    matches = []
    for dirpath, _subdirs, filenames in os.walk(base):
        matches.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith('.md')
        )
    return sorted(matches)

def read_file(path):
    """Return the file's text (UTF-8, undecodable bytes replaced).

    Content longer than MAX_CONTENT_CHARS is truncated with a marker noting
    the original length. Returns None (after logging to stderr) on any
    read/open failure.
    """
    try:
        with open(path, 'r', encoding='utf-8', errors='replace') as fh:
            text = fh.read()
    except Exception as exc:
        print(f"  ERROR reading {path}: {exc}", file=sys.stderr)
        return None
    # Truncation cannot raise, so it lives outside the try block.
    original_len = len(text)
    if original_len > MAX_CONTENT_CHARS:
        text = text[:MAX_CONTENT_CHARS] + "\n\n[TRUNCATED - original was " + str(original_len) + " chars]"
    return text

def send_batch(documents, batch_num, total_batches):
    """POST one batch of documents to the swarm ingest endpoint.

    Returns the endpoint's parsed JSON response on success, or a dict with
    an "error" key (and succeeded=0 / failed=len(documents)) on any failure.
    The batch_num/total_batches parameters are accepted for interface
    compatibility but are not used here.
    """
    # NOTE(review): TLS certificate verification is deliberately disabled
    # here — acceptable only for this dev endpoint; do not reuse elsewhere.
    tls_ctx = ssl.create_default_context()
    tls_ctx.check_hostname = False
    tls_ctx.verify_mode = ssl.CERT_NONE

    body = json.dumps({
        "documents": documents,
        "concurrency": 50,
    }).encode('utf-8')

    request = urllib.request.Request(
        f"{SUBAIVA_URL}/api/swarm/ingest",
        data=body,
        headers={
            "Authorization": f"Bearer {API_TOKEN}",
            "Content-Type": "application/json",
            "User-Agent": "Mozilla/5.0 Genesis/1.0 ResearchIngest",
        },
        method="POST",
    )

    try:
        with urllib.request.urlopen(request, context=tls_ctx, timeout=300) as response:
            return json.loads(response.read().decode())
    except urllib.error.HTTPError as http_err:
        detail = http_err.read().decode() if http_err.fp else "no body"
        return {"error": f"HTTP {http_err.code}: {detail}", "succeeded": 0, "failed": len(documents)}
    except Exception as exc:
        return {"error": str(exc), "succeeded": 0, "failed": len(documents)}

def _build_documents(file_paths):
    """Read each file into an ingest document dict; return (documents, skipped).

    Files that cannot be read or whose stripped content is under 50 chars
    are skipped (counted and logged).
    """
    documents = []
    skipped = 0
    for fpath in file_paths:
        content = read_file(fpath)
        if not content or len(content.strip()) < 50:
            skipped += 1
            print(f"  SKIP (too short/empty): {os.path.basename(fpath)}")
            continue

        # Source path is stored relative to the genesis-system root.
        rel = os.path.relpath(fpath, "/mnt/e/genesis-system")
        title = os.path.splitext(os.path.basename(fpath))[0]

        documents.append({
            "source": rel,
            "sourceType": "markdown",
            "title": title,
            "content": content,
            "category": "genesis",
        })
    return documents, skipped


def _ingest_batches(documents):
    """Send documents in BATCH_SIZE chunks via send_batch.

    Returns (total_success, total_fail, total_tokens, errors) where errors is
    a list of (batch_num, message, truncated_titles) tuples.
    """
    total_success = 0
    total_fail = 0
    total_tokens = 0
    errors = []

    total_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE

    for start in range(0, len(documents), BATCH_SIZE):
        batch = documents[start:start + BATCH_SIZE]
        batch_num = (start // BATCH_SIZE) + 1
        titles = [d["title"][:40] for d in batch]

        print(f"Batch {batch_num}/{total_batches} ({len(batch)} docs)...")
        result = send_batch(batch, batch_num, total_batches)

        if "error" in result:
            # Whole batch failed (transport/HTTP error).
            total_fail += len(batch)
            errors.append((batch_num, result["error"], titles))
            print(f"  FAILED: {result['error'][:200]}")
        else:
            succeeded = result.get("succeeded", 0)
            failed = result.get("failed", 0)
            tokens = result.get("totalTokens", 0)
            avg = result.get("avgProcessingMs", 0)
            total_success += succeeded
            total_fail += failed
            total_tokens += tokens
            print(f"  OK: {succeeded}/{len(batch)} succeeded | tokens: {tokens:,} | avg: {avg}ms")
            if failed > 0:
                errors.append((batch_num, f"partial: {failed} failed", titles))

        # Small delay between batches to be polite
        if batch_num < total_batches:
            time.sleep(1)

    return total_success, total_fail, total_tokens, errors


def _check_stats():
    """Fetch and pretty-print the swarm stats endpoint (best effort)."""
    print(f"\n--- Checking final stats ---")
    try:
        # NOTE(review): TLS verification disabled for this dev endpoint.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        req = urllib.request.Request(
            f"{SUBAIVA_URL}/api/swarm/stats",
            headers={
                "Authorization": f"Bearer {API_TOKEN}",
                "User-Agent": "Mozilla/5.0 Genesis/1.0",
            },
        )
        with urllib.request.urlopen(req, context=ctx, timeout=30) as resp:
            stats = json.loads(resp.read().decode())
            print(json.dumps(stats, indent=2))
    except Exception as e:
        # Best-effort check: a stats failure must not fail the whole run.
        print(f"Stats check failed: {e}")


def main():
    """Discover all .md research reports, ingest them in batches, summarize."""
    print("=== Research Reports -> SubAIVA KnowledgeStore Ingestion ===")
    print(f"Endpoint: {SUBAIVA_URL}/api/swarm/ingest")
    print(f"Batch size: {BATCH_SIZE}")
    print()

    # Find all files
    all_files = find_md_files(BASE_DIR)
    print(f"Found {len(all_files)} .md files")

    # Build document list
    documents, skipped = _build_documents(all_files)
    print(f"\nDocuments ready: {len(documents)} (skipped {skipped})")
    print()

    # Send in batches
    total_success, total_fail, total_tokens, errors = _ingest_batches(documents)

    print(f"\n{'='*60}")
    print("=== RESEARCH REPORTS INGESTION COMPLETE ===")
    print(f"{'='*60}")
    print(f"Total documents attempted: {len(documents)}")
    print(f"Succeeded: {total_success}")
    print(f"Failed:    {total_fail}")
    print(f"Tokens:    {total_tokens:,}")
    print("Category:  genesis")
    if errors:
        print(f"\nErrors ({len(errors)} batches had issues):")
        for bn, err, titles in errors:
            print(f"  Batch {bn}: {err[:150]}")

    # Now check stats
    _check_stats()

if __name__ == "__main__":
    # Guard so importing this module does not trigger an ingestion run.
    main()
