#!/usr/bin/env python3
"""
Ingest ALL Deep Think results into SubAIVA KnowledgeStore.
Reads .md and .json files from deep_think_results/ (including READABLE/ subdirs).
Batches of 20 documents per POST to swarm/ingest endpoint.
"""

import json
import os
import sys
import time
import requests
from pathlib import Path

# Worker endpoints: bulk document ingestion and store statistics.
ENDPOINT = "https://subaiva.kinan-ae7.workers.dev/api/swarm/ingest"
STATS_ENDPOINT = "https://subaiva.kinan-ae7.workers.dev/api/swarm/stats"
# NOTE(review): hardcoded bearer token checked into source — consider
# loading from an environment variable instead.
TOKEN = "dev-token-genesis-2026"
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {TOKEN}"
}
BATCH_SIZE = 20  # documents per POST to the ingest endpoint
CONCURRENCY = 50  # server-side concurrency hint sent with each batch
MAX_CONTENT_SIZE = 100_000  # 100KB max per document

# Root directory holding the Deep Think .md/.json output to ingest.
DT_ROOT = Path("/mnt/e/genesis-system/deep_think_results")


def collect_files():
    """Gather every .md/.json file under DT_ROOT, sorted by path.

    Anything under a path containing "screenshots" is skipped, as are
    files with any other extension (the extension check is
    case-insensitive).
    """
    wanted_suffixes = (".md", ".json")
    return [
        path
        for path in sorted(DT_ROOT.rglob("*"))
        if path.is_file()
        and "screenshots" not in str(path)
        and path.suffix.lower() in wanted_suffixes
    ]


def file_to_documents(filepath: Path) -> list:
    """Convert a file to one or more document dicts for the API."""
    try:
        content = filepath.read_text(encoding="utf-8", errors="replace")
    except Exception as e:
        print(f"  ERROR reading {filepath}: {e}")
        return []

    if not content.strip():
        return []

    rel_path = str(filepath.relative_to(DT_ROOT.parent))
    source_type = "json" if filepath.suffix == ".json" else "markdown"

    # Determine category based on filename/path
    name_lower = filepath.stem.lower()
    if "memory_upgrade" in name_lower:
        category = "deep_think_memory"
    elif "rlm" in name_lower:
        category = "deep_think_rlm"
    elif "revenue" in name_lower or "pricing" in name_lower:
        category = "deep_think_revenue"
    elif "openclaw" in name_lower:
        category = "deep_think_openclaw"
    elif "browser" in name_lower:
        category = "deep_think_browser"
    elif "session" in name_lower or "continuity" in name_lower:
        category = "deep_think_session"
    elif "gemini" in name_lower:
        category = "deep_think_gemini"
    elif "agent" in name_lower or "marketplace" in name_lower:
        category = "deep_think_agents"
    elif "patent" in name_lower:
        category = "deep_think_patent"
    elif "voice" in name_lower or "audio" in name_lower:
        category = "deep_think_voice"
    elif "website" in name_lower or "poach" in name_lower:
        category = "deep_think_website"
    elif "phone" in name_lower:
        category = "deep_think_phone"
    elif "key_decision" in name_lower:
        category = "deep_think_decisions"
    elif "refire" in str(filepath).lower():
        category = "deep_think_refire"
    elif "readable" in str(filepath).lower():
        category = "deep_think_readable"
    else:
        category = "deep_think"

    # Build human-readable title
    title = filepath.stem.replace("_", " ")
    # Add subdirectory context if applicable
    if filepath.parent != DT_ROOT:
        subdir = filepath.parent.name
        title = f"[{subdir}] {title}"

    # If content is too large, split into chunks
    if len(content) > MAX_CONTENT_SIZE:
        docs = []
        chunks = []
        current_chunk = ""
        for line in content.splitlines(keepends=True):
            if len(current_chunk) + len(line) > MAX_CONTENT_SIZE:
                chunks.append(current_chunk)
                current_chunk = line
            else:
                current_chunk += line
        if current_chunk:
            chunks.append(current_chunk)

        for i, chunk in enumerate(chunks):
            docs.append({
                "source": rel_path,
                "sourceType": source_type,
                "title": f"{title} (Part {i+1}/{len(chunks)})",
                "content": chunk,
                "category": category
            })
        return docs

    return [{
        "source": rel_path,
        "sourceType": source_type,
        "title": title,
        "content": content,
        "category": category
    }]


def post_batch(batch: list, batch_num: int, total_batches: int) -> dict:
    """POST one batch of documents to the ingest endpoint.

    Returns the server's parsed JSON result on success. On timeout, a
    non-2xx status, or any other failure (including an unparseable
    body), returns a synthetic all-failed result dict instead of
    raising, so the caller can keep processing remaining batches.
    """
    body = {
        "documents": batch,
        "concurrency": CONCURRENCY
    }
    label = f"Batch {batch_num}/{total_batches}"

    try:
        response = requests.post(ENDPOINT, headers=HEADERS, json=body, timeout=300)
        response.raise_for_status()
        outcome = response.json()
        ok = outcome.get("succeeded", 0)
        bad = outcome.get("failed", 0)
        tok = outcome.get("totalTokens", 0)
        avg = outcome.get("avgProcessingMs", 0)
        print(f"  {label}: {ok} ok, {bad} fail, {tok} tokens, {avg:.0f}ms avg")
        return outcome
    except requests.exceptions.Timeout:
        print(f"  {label}: TIMEOUT (300s)")
        return {"succeeded": 0, "failed": len(batch), "totalTokens": 0, "error": "timeout"}
    except Exception as e:
        print(f"  {label}: ERROR - {e}")
        return {"succeeded": 0, "failed": len(batch), "totalTokens": 0, "error": str(e)}


def get_stats():
    """Fetch current swarm stats from STATS_ENDPOINT.

    Returns the parsed JSON payload, or None when the request fails,
    the server answers with a non-2xx status, or the body is not valid
    JSON. Stats are informational only, so errors are printed, never
    raised.
    """
    try:
        response = requests.get(STATS_ENDPOINT, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print(f"Stats error: {e}")
        return None


def main() -> bool:
    """Run the full ingestion pipeline.

    Steps: report pre-ingestion stats, collect files, convert them into
    (possibly chunked) documents, POST them in batches of BATCH_SIZE
    with a 1s pause between batches, then print totals and
    post-ingestion stats.

    Returns True iff no document failed to ingest.
    """
    print("=" * 70)
    print("DEEP THINK RESULTS -> SubAIVA KnowledgeStore Ingestion")
    print("=" * 70)

    # Get pre-ingestion stats (best-effort; get_stats returns None on error)
    print("\nPre-ingestion stats:")
    pre_stats = get_stats()
    if pre_stats:
        print(f"  Total documents in store: {pre_stats.get('totalDocuments', 'N/A')}")
        print(f"  Total tokens used: {pre_stats.get('totalTokens', 'N/A')}")

    # Collect all files
    files = collect_files()
    print(f"\nFiles found: {len(files)}")

    # Show breakdown (display only — does not affect what gets ingested).
    # NOTE(review): these suffix comparisons are case-sensitive while
    # collect_files() lowercases suffixes, so e.g. "X.MD" would be
    # counted in neither bucket here — confirm whether that matters.
    md_files = [f for f in files if f.suffix == ".md"]
    json_files = [f for f in files if f.suffix == ".json"]
    root_files = [f for f in files if f.parent == DT_ROOT]
    readable_files = [f for f in files if "READABLE" in str(f) and "REFIRE" not in str(f)]
    refire_files = [f for f in files if "REFIRE" in str(f)]
    print(f"  .md files:        {len(md_files)}")
    print(f"  .json files:      {len(json_files)}")
    print(f"  Root level:       {len(root_files)}")
    print(f"  READABLE/:        {len(readable_files)}")
    print(f"  READABLE/REFIRE/: {len(refire_files)}")

    # Convert all files to documents (handles chunking of large files)
    all_docs = []
    for filepath in files:
        docs = file_to_documents(filepath)
        all_docs.extend(docs)

    print(f"\nTotal documents (after chunking): {len(all_docs)}")
    total_content_size = sum(len(d["content"]) for d in all_docs)
    print(f"Total content size: {total_content_size / 1024 / 1024:.2f} MB")

    # Show category breakdown, most populous category first
    categories = {}
    for d in all_docs:
        cat = d["category"]
        categories[cat] = categories.get(cat, 0) + 1
    print("\nCategory breakdown:")
    for cat, count in sorted(categories.items(), key=lambda x: -x[1]):
        print(f"  {cat}: {count}")

    # Batch into groups of BATCH_SIZE (last batch may be smaller)
    batches = []
    for i in range(0, len(all_docs), BATCH_SIZE):
        batches.append(all_docs[i:i + BATCH_SIZE])

    print(f"\nBatches: {len(batches)} (batch size: {BATCH_SIZE})")
    print()

    # Execute batches sequentially, accumulating result counters.
    # post_batch never raises, so one failed batch doesn't stop the run.
    total_succeeded = 0
    total_failed = 0
    total_tokens = 0

    start_time = time.time()

    for i, batch in enumerate(batches, 1):
        # Preview the first three titles so progress output stays readable
        batch_titles = [d["title"][:60] for d in batch[:3]]
        print(f"Batch {i}/{len(batches)}: {len(batch)} docs [{', '.join(batch_titles)}...]")

        result = post_batch(batch, i, len(batches))
        total_succeeded += result.get("succeeded", 0)
        total_failed += result.get("failed", 0)
        total_tokens += result.get("totalTokens", 0)

        # Small delay between batches (skipped after the last one)
        if i < len(batches):
            time.sleep(1)

    elapsed = time.time() - start_time

    print()
    print("=" * 70)
    print("INGESTION COMPLETE")
    print("=" * 70)
    print(f"Time elapsed:    {elapsed:.1f}s")
    print(f"Total documents: {len(all_docs)}")
    print(f"Succeeded:       {total_succeeded}")
    print(f"Failed:          {total_failed}")
    print(f"Total tokens:    {total_tokens}")

    # Get final stats (best-effort, printed raw as JSON)
    print()
    print("Post-ingestion stats:")
    post_stats = get_stats()
    if post_stats:
        print(json.dumps(post_stats, indent=2))

    return total_failed == 0


if __name__ == "__main__":
    # Map main()'s boolean result onto a conventional process exit code:
    # 0 iff every document ingested successfully.
    sys.exit(0 if main() else 1)
