#!/usr/bin/env python3
"""
Genesis Multi-Model Swarm
Unified interface for Kimi K2.5, MiniMax M2.5, Gemini 2.0 Flash, and other swarm models.

COMMAND CENTRE DOCTRINE: This module enables Claude (orchestrator) to dispatch
work to cost-effective agent swarms (Kimi, MiniMax, Gemini) while staying
lightweight and responsive.

Cost Hierarchy (per MTok):
- Gemini 2.0 Flash:      ~$0.10 (fastest, best for computer use + agentic vision)
- DeepSeek-R1 Distill:  ~$0.12 (fast deep reasoning, 32B distill)
- DeepSeek-R1:          ~$0.55 (o1-level deep reasoning, 671B MoE — USE FOR STRATEGY)
- Kimi K2.5:            $1.07  (9x cheaper than Claude, #1 agentic benchmarks)
- MiniMax M2.5:         ~$1.00 (80.2% SWE-Bench, free via NVIDIA NIM/Kilo.ai)
- Haiku:                $1.00  (good for simple extractions)
- Sonnet:               $9.00  (management/coordination)
- Opus:                 $15.00 (architecture/strategic only)

Deep Reasoning Routing Guide:
- Use deepseek-r1 for: multi-step strategy, architecture decisions, complex RLM design,
  GTM planning, patent analysis, anything requiring extended chain-of-thought
- Use deepseek-r1-distill for: code review, math problems, faster reasoning tasks
- Use gemini-3-flash-preview for: computer use, agentic vision, screen understanding
- Use kimi-k2-moonshot for: agent swarms, long context, tool-use heavy tasks
- NEVER use Claude for tasks DeepSeek-R1 can handle — 27x cost difference

Author: Genesis System
Date: 2026-02-24
"""

import os
import sys
import json
import subprocess
import time
from typing import Dict, List, Literal, Optional, Any
from enum import Enum
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Try to import OpenAI SDK (used for OpenAI-compatible endpoints)
try:
    from openai import OpenAI
except ImportError:
    logger.error("OpenAI SDK not installed. Run: pip install openai")
    sys.exit(1)

# Try to import Google GenAI SDK (new, non-deprecated: google-genai package)
try:
    import google.genai as genai
    from google.genai import types as genai_types
    GENAI_AVAILABLE = True
except ImportError:
    GENAI_AVAILABLE = False
    logger.warning(
        "google-genai SDK not installed. Gemini native API unavailable. "
        "Run: pip install google-genai"
    )

# Try to load environment variables
try:
    from dotenv import load_dotenv
    load_dotenv('/mnt/e/genesis-system/.env')
    load_dotenv('/mnt/e/genesis-system/config/secrets.env')
except ImportError:
    logger.warning("python-dotenv not installed. Using system environment variables.")


class ModelTier(Enum):
    """Cost tiers used when routing work to a swarm model."""

    # Gemini Flash, Haiku — pennies per MTok; bulk and fast work.
    ULTRA_CHEAP = "ultra_cheap"
    # Kimi, MiniMax, DeepSeek — cheap general-purpose swarm models.
    CHEAP = "cheap"
    # DeepSeek-R1 — o1-level reasoning at ~$0.55/MTok.
    DEEP_REASONING = "deep_reasoning"
    # Sonnet — management / coordination.
    STANDARD = "standard"
    # Opus — architecture / strategic work only.
    PREMIUM = "premium"


class SwarmModel:
    """Registry entry for a model available for swarm execution.

    Pure metadata holder: endpoint, pricing, tier and capability tags used
    by the router. The API key itself is looked up lazily from the
    environment so registry construction never requires credentials.
    """

    def __init__(
        self,
        name: str,
        provider: str,
        api_key_env: str,
        base_url: str,
        model_id: str,
        cost_per_mtok: float,
        tier: ModelTier,
        capabilities: List[str],
        max_tokens: int = 4096,
        rpm_limit: Optional[int] = None
    ):
        """
        Args:
            name: Human-readable model name (for logs).
            provider: Hosting provider label (e.g. "OpenRouter").
            api_key_env: Environment variable holding the API key.
            base_url: OpenAI-compatible endpoint URL (None for native SDKs).
            model_id: Provider-side model identifier.
            cost_per_mtok: Approximate cost per million tokens (USD).
            tier: Cost tier used for routing decisions.
            capabilities: Capability tags (e.g. "code", "vision").
            max_tokens: Default max output tokens.
            rpm_limit: Requests-per-minute cap, if the provider enforces one.
        """
        self.name = name
        self.provider = provider
        self.api_key_env = api_key_env
        self.base_url = base_url
        self.model_id = model_id
        self.cost_per_mtok = cost_per_mtok
        self.tier = tier
        self.capabilities = capabilities
        self.max_tokens = max_tokens
        self.rpm_limit = rpm_limit

    @property
    def api_key(self) -> Optional[str]:
        """API key read from the environment (None when unset)."""
        return os.getenv(self.api_key_env)

    @property
    def available(self) -> bool:
        """Check if model is usable (has a non-empty API key).

        Uses bool() rather than an `is not None` test so that an empty
        env var (e.g. `export KEY=`) does not count as available — it
        previously passed this check and then failed at request time.
        """
        return bool(self.api_key)


# ─────────────────────────────────────────────────────────────────────────────
# Model Registry
# ─────────────────────────────────────────────────────────────────────────────
SWARM_MODELS: Dict[str, SwarmModel] = {

    # ── Kimi K2.5 — NVIDIA NIM (fallback) ────────────────────────────────────
    "kimi-k25": SwarmModel(
        name="Kimi K2.5",
        provider="NVIDIA NIM",
        api_key_env="NVIDIA_NIM_API_KEY",
        base_url="https://integrate.api.nvidia.com/v1",
        model_id="moonshotai/kimi-k2.5",
        cost_per_mtok=1.07,
        tier=ModelTier.CHEAP,
        capabilities=["code", "research", "long_context", "vision", "agent_swarm"],
        max_tokens=4096,
        rpm_limit=None
    ),

    # ── Kimi K2.5 — Moonshot Direct API (PREFERRED, MOONSHOT_API_KEY) ────────
    # OpenAI-compatible endpoint: https://api.moonshot.cn/v1
    # Best for: agent swarms, tool-use, long-context reasoning, code
    "kimi-k2-moonshot": SwarmModel(
        name="Kimi K2.5 (Moonshot Direct)",
        provider="Moonshot AI",
        api_key_env="MOONSHOT_API_KEY",
        base_url="https://api.moonshot.cn/v1",
        model_id="kimi-k2",
        cost_per_mtok=1.07,
        tier=ModelTier.CHEAP,
        capabilities=["code", "research", "long_context", "vision", "agent_swarm", "reasoning"],
        max_tokens=8192,
        rpm_limit=30
    ),

    # ── Kimi K2.5 — Moonshot Official (legacy key alias) ─────────────────────
    "kimi-k25-official": SwarmModel(
        name="Kimi K2.5 Official",
        provider="Moonshot AI",
        api_key_env="KIMI_API_KEY",
        base_url="https://api.moonshot.cn/v1",
        model_id="kimi-k2",
        cost_per_mtok=1.07,
        tier=ModelTier.CHEAP,
        capabilities=["code", "research", "long_context", "vision", "agent_swarm"],
        max_tokens=8192,
        rpm_limit=30
    ),

    # ── MiniMax ───────────────────────────────────────────────────────────────
    # FIX: NVIDIA NIM's OpenAI-compatible endpoint is integrate.api.nvidia.com
    # (as used by "kimi-k25" above); these two entries previously pointed at
    # https://api.nvidia.com/v1, which is not the NIM inference endpoint.
    "minimax-m21": SwarmModel(
        name="MiniMax M2.1",
        provider="NVIDIA NIM",
        api_key_env="NVIDIA_NIM_API_KEY",
        base_url="https://integrate.api.nvidia.com/v1",
        model_id="minimaxai/minimax-m2.1",
        cost_per_mtok=0.0,  # FREE via NVIDIA NIM
        tier=ModelTier.ULTRA_CHEAP,
        capabilities=["code", "reasoning"],
        max_tokens=2048,
        rpm_limit=40
    ),
    "minimax-m2": SwarmModel(
        name="MiniMax M2",
        provider="NVIDIA NIM",
        api_key_env="NVIDIA_NIM_API_KEY",
        base_url="https://integrate.api.nvidia.com/v1",
        model_id="minimaxai/minimax-m2",
        cost_per_mtok=0.0,  # FREE via NVIDIA NIM
        tier=ModelTier.ULTRA_CHEAP,
        capabilities=["code", "reasoning"],
        max_tokens=2048,
        rpm_limit=40
    ),
    # NOTE(review): this base_url is an Anthropic-style path but clients here
    # are built with the OpenAI SDK — confirm MiniMax accepts OpenAI-format
    # chat completions at this endpoint.
    "minimax-m25": SwarmModel(
        name="MiniMax M2.5",
        provider="MiniMax AI",
        api_key_env="MINIMAX_API_KEY",
        base_url="https://api.minimax.io/anthropic/v1",
        model_id="minimax-m2.5",
        cost_per_mtok=1.0,
        tier=ModelTier.CHEAP,
        capabilities=["code", "reasoning"],
        max_tokens=2048,
        rpm_limit=None
    ),

    # ── Gemini Flash — Native Google API (PREFERRED for Gemini tasks) ────────
    # Uses new google-genai SDK (not deprecated google-generativeai)
    # Key: GEMINI_API_KEY_NEW — preferred over GOOGLE_API_KEY
    # Model is gemini-3-flash-preview (not 2.0 as older comments said).
    # Best for: computer_use, agentic_vision, bulk tasks, vision, fast inference
    "gemini-flash": SwarmModel(
        name="Gemini 3 Flash (Direct API — $200 credits)",
        provider="Google",
        api_key_env="GEMINI_API_KEY_NEW",
        base_url=None,  # Native google-genai SDK, no base_url needed
        model_id="gemini-3-flash-preview",
        cost_per_mtok=0.50,
        tier=ModelTier.ULTRA_CHEAP,
        capabilities=["code", "research", "vision", "fast", "computer_use", "agentic_vision", "think_act_observe"],
        max_tokens=8192,
        rpm_limit=None
    ),

    # ── Gemini 3 Flash — OpenRouter (fallback if direct API credits exhaust) ─
    "openrouter-gemini-flash": SwarmModel(
        name="Gemini 3 Flash (OpenRouter fallback)",
        provider="OpenRouter",
        api_key_env="OPENROUTER_API_KEY",
        base_url="https://openrouter.ai/api/v1",
        model_id="google/gemini-3-flash-preview",
        cost_per_mtok=0.50,
        tier=ModelTier.ULTRA_CHEAP,
        capabilities=["code", "research", "vision", "fast", "computer_use", "agentic_vision", "think_act_observe"],
        max_tokens=8192,
        rpm_limit=None
    ),

    # ── Gemini 2.0 Flash Experimental — OpenRouter (free tier fallback) ──────
    "openrouter-gemini-flash-exp": SwarmModel(
        name="Gemini 2.0 Flash Exp (OpenRouter free)",
        provider="OpenRouter",
        api_key_env="OPENROUTER_API_KEY",
        base_url="https://openrouter.ai/api/v1",
        model_id="google/gemini-2.0-flash-exp:free",
        cost_per_mtok=0.0,  # Free tier on OpenRouter
        tier=ModelTier.ULTRA_CHEAP,
        capabilities=["code", "research", "vision", "fast", "computer_use", "agentic_vision"],
        max_tokens=8192,
        rpm_limit=None
    ),

    # ── OpenRouter — Kimi / MiniMax ───────────────────────────────────────────
    "openrouter-kimi": SwarmModel(
        name="Kimi K2.5 (OpenRouter)",
        provider="OpenRouter",
        api_key_env="OPENROUTER_API_KEY",
        base_url="https://openrouter.ai/api/v1",
        model_id="moonshotai/kimi-k2.5",
        cost_per_mtok=1.07,
        tier=ModelTier.CHEAP,
        capabilities=["code", "research", "long_context"],
        max_tokens=4096,
        rpm_limit=None
    ),
    "openrouter-minimax": SwarmModel(
        name="MiniMax M2.5 (OpenRouter)",
        provider="OpenRouter",
        api_key_env="OPENROUTER_API_KEY",
        base_url="https://openrouter.ai/api/v1",
        model_id="minimax/minimax-m2.5",
        cost_per_mtok=1.0,
        tier=ModelTier.CHEAP,
        capabilities=["code", "reasoning"],
        max_tokens=2048,
        rpm_limit=None
    ),
    "openrouter-minimax-m21": SwarmModel(
        name="MiniMax M2.1 (OpenRouter)",
        provider="OpenRouter",
        api_key_env="OPENROUTER_API_KEY",
        base_url="https://openrouter.ai/api/v1",
        model_id="minimax/minimax-m2.1",
        cost_per_mtok=0.80,
        tier=ModelTier.CHEAP,
        capabilities=["code", "reasoning", "agent"],
        max_tokens=2048,
        rpm_limit=None
    ),

    # ── Deep Reasoning Tier (o1-level, ultra cost-efficient) ─────────────────
    # DeepSeek-R1: 671B MoE, reinforcement learning enhanced
    # Best for: multi-step reasoning, strategy, architecture, complex analysis
    "deepseek-r1": SwarmModel(
        name="DeepSeek-R1",
        provider="OpenRouter",
        api_key_env="OPENROUTER_API_KEY",
        base_url="https://openrouter.ai/api/v1",
        model_id="deepseek/deepseek-r1",
        cost_per_mtok=0.55,
        tier=ModelTier.DEEP_REASONING,
        capabilities=["deep_reasoning", "strategy", "code", "math", "architecture", "long_context"],
        max_tokens=8192,
        rpm_limit=None
    ),
    # DeepSeek-R1 Distill (Qwen 32B) — faster/cheaper reasoning variant
    "deepseek-r1-distill": SwarmModel(
        name="DeepSeek-R1 Distill 32B",
        provider="OpenRouter",
        api_key_env="OPENROUTER_API_KEY",
        base_url="https://openrouter.ai/api/v1",
        model_id="deepseek/deepseek-r1-distill-qwen-32b",
        cost_per_mtok=0.12,
        tier=ModelTier.DEEP_REASONING,
        capabilities=["deep_reasoning", "code", "math", "fast"],
        max_tokens=8192,
        rpm_limit=None
    ),
}

# ─────────────────────────────────────────────────────────────────────────────
# Task-type → model routing table
# ─────────────────────────────────────────────────────────────────────────────
TASK_ROUTING: Dict[str, str] = {
    # Vision / screen understanding → native Gemini Flash client
    "computer_use":   "gemini-flash",
    "agentic_vision": "gemini-flash",
    "screen":         "gemini-flash",
    # Agent swarm / long-context tool-use → Kimi K2.5 via Moonshot direct API
    "agent_swarm":    "kimi-k2-moonshot",
    "tool_use":       "kimi-k2-moonshot",
    # Deep reasoning / strategy → DeepSeek-R1 (OpenRouter)
    "deep_reasoning": "deepseek-r1",
    "strategy":       "deepseek-r1",
    # Bulk/fast → native Gemini Flash; "code" → OpenRouter Gemini endpoint.
    # NOTE(review): routing "code" through OpenRouter while "bulk"/"fast" use
    # the native client looks like a deliberate credit split — confirm intent.
    "bulk":           "gemini-flash",
    "fast":           "gemini-flash",
    "code":           "openrouter-gemini-flash",
}


class MultiModelSwarm:
    """
    Unified swarm interface for cost-effective parallel execution.

    Usage:
        swarm = MultiModelSwarm()

        # Direct model execution
        result = swarm.execute("Write a CSV parser", model="kimi-k2-moonshot")

        # Gemini 2.0 Flash (native, no OpenAI SDK overhead)
        result = swarm.execute("Explain this code", model="gemini-flash")

        # Computer use / agentic vision
        result = swarm.execute_computer_use("What app is open?", image_path="screen.png")

        # Auto-routing by task type
        result = swarm.route_by_task_type("Analyse this strategy", task_type="strategy")
    """

    def __init__(self):
        """Discover usable models and build their API clients."""
        # Native google-genai client; stays None until _initialize_gemini succeeds.
        self._gemini_client = None
        # OpenAI-compatible clients, keyed by SWARM_MODELS registry key.
        self.clients: Dict[str, OpenAI] = {}
        self._initialize_clients()
        self._initialize_gemini()

    # ─────────────────────────────────────────────────────────────────────────
    # Initialisation
    # ─────────────────────────────────────────────────────────────────────────

    def _initialize_gemini(self):
        """
        Initialise the native Gemini client via the google-genai SDK.

        Key priority: GEMINI_API_KEY_NEW > GEMINI_API_KEY > GOOGLE_API_KEY.
        Leaves self._gemini_client as None when the SDK or a key is missing,
        so callers fall back to the OpenRouter path.
        """
        if not GENAI_AVAILABLE:
            logger.warning(
                "google-genai SDK not available; Gemini native API disabled. "
                "Run: pip install google-genai"
            )
            return

        # Resolve the key and remember which env var supplied it, so the
        # startup log is accurate (the old message always claimed
        # GEMINI_API_KEY_NEW even when a fallback variable was used).
        key_source = next(
            (
                env for env in
                ("GEMINI_API_KEY_NEW", "GEMINI_API_KEY", "GOOGLE_API_KEY")
                if os.getenv(env)
            ),
            None
        )
        if key_source is None:
            logger.warning(
                "No Gemini API key found. "
                "Set GEMINI_API_KEY_NEW, GEMINI_API_KEY, or GOOGLE_API_KEY."
            )
            return

        try:
            self._gemini_client = genai.Client(api_key=os.getenv(key_source))
            logger.info(f"Gemini native client ready (key source: {key_source})")
        except Exception as e:
            logger.warning(f"Gemini native client init failed: {e}")

    def _initialize_clients(self):
        """Create OpenAI-compatible clients for every registered, keyed model.

        Models without an OpenAI-compatible ``base_url`` (currently only the
        native gemini-flash entry) are served by the google-genai SDK instead
        and are skipped here. Skipping by base_url rather than by a
        hard-coded key keeps this loop correct if more native-SDK models
        are registered later.
        """
        for model_key, model in SWARM_MODELS.items():
            # Native-SDK entries have no OpenAI-compatible endpoint.
            if model.base_url is None:
                continue

            if not model.available:
                logger.debug(
                    f"SKIP {model.name} not available (no API key: {model.api_key_env})"
                )
                continue

            try:
                self.clients[model_key] = OpenAI(
                    api_key=model.api_key,
                    base_url=model.base_url
                )
                logger.info(f"OK {model.name} available ({model.provider})")
            except Exception as e:
                logger.warning(f"FAIL {model.name} failed to initialize: {e}")

    # ─────────────────────────────────────────────────────────────────────────
    # Properties
    # ─────────────────────────────────────────────────────────────────────────

    @property
    def available_models(self) -> List[str]:
        """All usable model keys: OpenAI-compatible clients plus native Gemini."""
        usable = list(self.clients)
        if self._gemini_client is not None:
            # Native client is tracked separately from the OpenAI-compat map.
            usable.append("gemini-flash")
        return usable

    # ─────────────────────────────────────────────────────────────────────────
    # Core execution
    # ─────────────────────────────────────────────────────────────────────────

    def execute(
        self,
        task: str,
        model: str = "minimax-m21",
        system_prompt: str = "You are a helpful coding assistant.",
        temperature: float = 0.7,
        max_tokens: Optional[int] = None,
        stream: bool = False
    ) -> str:
        """
        Execute a task using the specified model.

        Args:
            task: Task description / prompt
            model: Model key from SWARM_MODELS
            system_prompt: System instruction
            temperature: Sampling temperature (0-1)
            max_tokens: Max output tokens (None = model default)
            stream: Stream response chunks (not supported for native Gemini)

        Returns:
            Generated text response (the raw stream object when stream=True)

        Raises:
            ValueError: If the model key is unknown or has no initialised client
        """
        # Route gemini-flash to the native Google API handler
        if model == "gemini-flash":
            return self._execute_gemini_native(
                task=task,
                system_prompt=system_prompt,
                max_tokens=max_tokens
            )

        if model not in self.clients:
            available = ", ".join(self.available_models)
            # A completely unknown key used to raise a bare KeyError from
            # SWARM_MODELS[model] inside the message below; only emit the
            # key-env setup hint for models that are actually registered.
            if model in SWARM_MODELS:
                raise ValueError(
                    f"Model '{model}' not available. Available: {available}\n"
                    f"Setup: export {SWARM_MODELS[model].api_key_env}='your_key'"
                )
            raise ValueError(
                f"Model '{model}' not available. Available: {available}"
            )

        model_info = SWARM_MODELS[model]
        client = self.clients[model]

        logger.info(f"Executing task on {model_info.name} (${model_info.cost_per_mtok}/MTok)")

        # OpenRouter requires attribution headers
        extra_headers: Dict[str, str] = {}
        if model_info.provider == "OpenRouter":
            extra_headers = {
                "HTTP-Referer": "https://agileadapt.com",
                "X-Title": "Genesis System"
            }

        try:
            response = client.chat.completions.create(
                model=model_info.model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": task}
                ],
                temperature=temperature,
                max_tokens=max_tokens or model_info.max_tokens,
                stream=stream,
                extra_headers=extra_headers or None
            )

            if stream:
                return response  # type: ignore[return-value]

            result = response.choices[0].message.content

            # Some OpenAI-compatible gateways omit usage or set it to None;
            # guard so cost logging never breaks a successful call.
            usage = getattr(response, 'usage', None)
            if usage is not None and usage.total_tokens is not None:
                cost = (usage.total_tokens / 1_000_000) * model_info.cost_per_mtok
                logger.info(
                    f"Tokens: {usage.total_tokens} | Cost: ${cost:.4f} | "
                    f"Model: {model_info.name}"
                )

            return result

        except Exception as e:
            logger.error(f"Error executing task on {model_info.name}: {e}")
            raise

    # ─────────────────────────────────────────────────────────────────────────
    # Native Gemini 2.0 Flash (google-genai SDK)
    # ─────────────────────────────────────────────────────────────────────────

    def _execute_gemini_native(
        self,
        task: str,
        system_prompt: str = "You are a helpful assistant.",
        max_tokens: Optional[int] = None,
        image_data: Optional[bytes] = None,
        image_mime: str = "image/png"
    ) -> str:
        """
        Execute via the native google-genai SDK using the gemini-flash entry.

        Falls back to OpenRouter gemini-flash if the native client is
        unavailable. NOTE: the fallback path is text-only — any image payload
        cannot be forwarded and a warning is logged when that happens.

        Args:
            task: Prompt / question
            system_prompt: Prepended to the user prompt
            max_tokens: Maximum output tokens
            image_data: Raw image bytes for vision / computer-use tasks
            image_mime: MIME type of image (default: image/png)

        Returns:
            Generated text
        """
        if self._gemini_client is None:
            if image_data is not None:
                # The OpenAI-compat fallback below sends text only; a vision
                # payload would be silently lost — make that visible.
                logger.warning(
                    "Image payload cannot be forwarded on the text-only fallback path; dropping it."
                )
            logger.warning(
                "Native Gemini client unavailable; falling back to openrouter-gemini-flash."
            )
            return self.execute(
                task=task,
                model="openrouter-gemini-flash",
                system_prompt=system_prompt,
                max_tokens=max_tokens
            )

        model_info = SWARM_MODELS["gemini-flash"]
        logger.info(
            f"Executing via Gemini 2.0 Flash native API (${model_info.cost_per_mtok}/MTok)"
        )

        try:
            config = None
            if max_tokens:
                config = genai_types.GenerateContentConfig(
                    max_output_tokens=max_tokens
                )

            # generate_content has no dedicated system slot in this call shape;
            # prepend the system prompt to the user task instead.
            full_prompt = f"{system_prompt}\n\n{task}" if system_prompt else task

            if image_data:
                # Vision / computer-use mode: pass image as inline Part
                contents = [
                    genai_types.Part.from_text(text=full_prompt),
                    genai_types.Part.from_bytes(data=image_data, mime_type=image_mime),
                ]
                response = self._gemini_client.models.generate_content(
                    model=model_info.model_id,
                    contents=contents,
                    config=config
                )
            else:
                response = self._gemini_client.models.generate_content(
                    model=model_info.model_id,
                    contents=full_prompt,
                    config=config
                )

            return response.text

        except Exception as e:
            logger.error(f"Gemini native API error: {e}")
            raise

    def execute_computer_use(
        self,
        task: str,
        image_path: Optional[str] = None,
        image_data: Optional[bytes] = None,
        system_prompt: str = (
            "You are a computer vision assistant with expertise in understanding "
            "screenshots, UI elements, and agentic web/desktop tasks. "
            "Describe precisely what you see and suggest the next action."
        )
    ) -> str:
        """
        Execute a computer-use / agentic-vision task via the native Gemini client.

        Supply the screenshot either as a file path or as raw bytes;
        ``image_path`` takes precedence when both are given, and with neither
        the task is sent as plain text.

        Args:
            task: Instruction/question about the screen content
            image_path: Path to a screenshot file (PNG/JPEG)
            image_data: Raw image bytes (alternative to image_path)
            system_prompt: Vision-optimised system prompt

        Returns:
            Gemini's interpretation / next-action recommendation
        """
        import mimetypes

        payload: Optional[bytes] = None
        mime_type = "image/png"

        if image_path:
            guessed, _ = mimetypes.guess_type(image_path)
            if guessed:
                mime_type = guessed
            with open(image_path, "rb") as fh:
                payload = fh.read()
        elif image_data:
            payload = image_data

        logger.info("Routing computer_use task -> Gemini 2.0 Flash (agentic_vision)")
        return self._execute_gemini_native(
            task=task,
            system_prompt=system_prompt,
            image_data=payload,
            image_mime=mime_type
        )

    # ─────────────────────────────────────────────────────────────────────────
    # Task-type routing
    # ─────────────────────────────────────────────────────────────────────────

    def route_by_task_type(
        self,
        task: str,
        task_type: str,
        system_prompt: str = "You are a helpful assistant.",
        **kwargs
    ) -> str:
        """
        Execute a task, picking the model automatically from TASK_ROUTING.

        Vision-style task types (computer_use / agentic_vision / screen) are
        dispatched to execute_computer_use(); everything else goes through
        execute() on the routed model. Unknown task types default to the
        gemini-flash model.

        Args:
            task: The task prompt
            task_type: Key in TASK_ROUTING (unknown strings default to gemini-flash)
            system_prompt: System instruction
            **kwargs: Additional args forwarded to execute() (non-vision types only)

        Returns:
            Generated response from the routed model
        """
        vision_types = ("computer_use", "agentic_vision", "screen")
        model = TASK_ROUTING.get(task_type, "gemini-flash")
        logger.info(f"route_by_task_type: '{task_type}' -> {model}")

        if task_type in vision_types:
            # Specialised handler wires up the native Gemini vision path.
            return self.execute_computer_use(task=task, system_prompt=system_prompt)

        return self.execute(task=task, model=model, system_prompt=system_prompt, **kwargs)

    # ─────────────────────────────────────────────────────────────────────────
    # Deep reasoning shortcut
    # ─────────────────────────────────────────────────────────────────────────

    def deep_reason(
        self,
        task: str,
        system_prompt: str = (
            "You are a deep reasoning AI. Think step by step. Be thorough and precise."
        ),
        fast: bool = False,
        max_tokens: int = 8192
    ) -> str:
        """
        Run a deep-reasoning task on DeepSeek-R1 (via OpenRouter).

        Intended for strategy, architecture and multi-step analysis work
        where extended chain-of-thought matters. Temperature is pinned to 0
        for deterministic reasoning output.

        Args:
            task: The reasoning task or question
            system_prompt: Chain-of-thought system context
            fast: If True, use the distilled 32B variant instead of the 671B model
            max_tokens: Max output tokens (default 8192)

        Returns:
            Reasoning output from DeepSeek-R1

        Raises:
            ValueError: If no DeepSeek client is initialised (missing key)
        """
        if fast:
            model_key, variant = "deepseek-r1-distill", "DeepSeek-R1 Distill 32B"
        else:
            model_key, variant = "deepseek-r1", "DeepSeek-R1 671B"

        if model_key not in self.clients:
            raise ValueError(
                "DeepSeek-R1 not available. Ensure OPENROUTER_API_KEY is set."
            )

        logger.info(f"Deep reasoning via {variant}")
        return self.execute(
            task=task,
            model=model_key,
            system_prompt=system_prompt,
            temperature=0.0,
            max_tokens=max_tokens
        )

    # ─────────────────────────────────────────────────────────────────────────
    # Gemini CLI (Google AI Pro credits, $0 marginal cost)
    # ─────────────────────────────────────────────────────────────────────────

    def execute_gemini_cli(
        self,
        task: str,
        system_prompt: str = "You are a helpful coding assistant.",
        timeout: int = 180
    ) -> str:
        """Run a prompt through the external ``gemini`` CLI binary.

        Uses Google AI Pro credits, so the marginal API cost is zero.

        Args:
            task: The task/prompt
            system_prompt: System context prepended to the task
            timeout: Maximum seconds to wait

        Returns:
            Generated text response from Gemini CLI

        Raises:
            RuntimeError: CLI missing, timed out, or exited non-zero
        """
        prompt = f"{system_prompt}\n\n{task}"
        logger.info("Executing task via Gemini CLI (Google AI Pro credits)")

        try:
            proc = subprocess.run(
                ["gemini", "-p", prompt],
                capture_output=True,
                text=True,
                timeout=timeout,
                cwd="/mnt/e/genesis-system"
            )
        except subprocess.TimeoutExpired:
            raise RuntimeError(f"Gemini CLI timed out after {timeout}s")
        except FileNotFoundError:
            raise RuntimeError(
                "Gemini CLI not found. Install: npm install -g @google/gemini-cli"
            )

        if proc.returncode != 0:
            error_msg = (
                proc.stderr.strip()
                or f"Gemini CLI exited with code {proc.returncode}"
            )
            logger.error(f"Gemini CLI error: {error_msg}")
            raise RuntimeError(f"Gemini CLI failed: {error_msg}")

        output = proc.stdout.strip()
        logger.info(f"Gemini CLI returned {len(output)} chars")
        return output

    # ─────────────────────────────────────────────────────────────────────────
    # Parallel execution
    # ─────────────────────────────────────────────────────────────────────────

    def _execute_single(
        self,
        task_spec: Dict[str, Any],
        default_model: str
    ) -> Dict[str, Any]:
        """Run one task spec and return its outcome dict.

        Thread-safe worker used by parallel_execute(); never raises — all
        failures are captured in the returned dict's 'status'/'error' fields.
        """
        task = task_spec.get('task', '')
        model = task_spec.get('model', default_model)
        system_prompt = task_spec.get('system_prompt', 'You are a helpful assistant.')
        agent_id = task_spec.get('agent_id', 'unknown')
        task_label = task_spec.get('label', task[:80])

        # Pessimistic default; flipped to success below if execution completes.
        outcome = {
            'task': task_label,
            'result': None,
            'model': model,
            'status': 'failed',
            'error': None,
            'agent_id': agent_id
        }

        try:
            if model == "gemini-cli":
                output = self.execute_gemini_cli(task, system_prompt=system_prompt)
            else:
                output = self.execute(task, model=model, system_prompt=system_prompt)
        except Exception as e:
            logger.error(f"[{agent_id}] FAILED on {model}: {e}")
            outcome['error'] = str(e)
            return outcome

        logger.info(f"[{agent_id}] SUCCESS on {model} | {task_label}")
        outcome['result'] = output
        outcome['status'] = 'success'
        return outcome

    def parallel_execute(
        self,
        tasks: List[Dict[str, Any]],
        default_model: str = "minimax-m21",
        max_workers: int = 20
    ) -> List[Dict[str, Any]]:
        """
        Fan out multiple tasks concurrently on a thread pool.

        Results are collected in completion order, not submission order.

        Args:
            tasks: List of task dicts — keys: 'task', 'model' (opt),
                   'system_prompt' (opt), 'agent_id' (opt), 'label' (opt)
            default_model: Default model when a task dict omits 'model'
            max_workers: Max concurrent threads

        Returns:
            List of result dicts: 'task', 'result', 'model', 'status', 'error', 'agent_id'
        """
        total = len(tasks)
        logger.info(f"Launching {total} agents in parallel (max_workers={max_workers})")
        started = time.time()

        results: List[Dict[str, Any]] = []
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            pending = {
                pool.submit(self._execute_single, spec, default_model): spec
                for spec in tasks
            }

            for done_count, future in enumerate(as_completed(pending), start=1):
                outcome = future.result()
                results.append(outcome)
                status_icon = "OK" if outcome['status'] == 'success' else "FAIL"
                logger.info(
                    f"[{done_count}/{total}] {status_icon} | "
                    f"{outcome.get('agent_id', '?')} | {outcome['model']}"
                )

        elapsed = time.time() - started
        successes = sum(1 for r in results if r['status'] == 'success')
        logger.info(
            f"Parallel execution complete: {successes}/{total} succeeded in {elapsed:.1f}s"
        )
        return results

    def save_results(
        self,
        results: List[Dict[str, Any]],
        output_dir: str
    ) -> str:
        """Persist one markdown file per agent result plus a JSON summary.

        Args:
            results: Result dicts as produced by parallel_execute().
            output_dir: Directory to create (if needed) and write into.

        Returns:
            Path to the generated _summary.json file.
        """
        os.makedirs(output_dir, exist_ok=True)

        agents_meta: List[Dict[str, Any]] = []
        ok_count = 0
        fail_count = 0

        for result in results:
            agent_id = result.get('agent_id', 'unknown')
            status = result.get('status', 'unknown')
            model_raw = result.get('model', 'unknown')

            # Sanitize ids/models so they are safe filename components.
            safe_model = model_raw.replace('/', '_')
            safe_id = agent_id.replace('/', '_').replace(' ', '_')
            filename = f"{safe_id}_{safe_model}.md"
            filepath = os.path.join(output_dir, filename)

            body = result.get('result', '')
            if body:
                # Non-empty output counts as success: metadata header + content.
                text = (
                    f"# Agent: {agent_id}\n"
                    f"# Model: {model_raw}\n"
                    f"# Status: {status}\n"
                    f"# Task: {result.get('task', 'unknown')}\n\n"
                ) + body
                ok_count += 1
            else:
                # Empty output is recorded as a failure file with the error.
                text = (
                    f"# Agent: {agent_id}\n"
                    f"# Model: {model_raw}\n"
                    f"# Status: FAILED\n"
                    f"# Error: {result.get('error', 'unknown')}\n"
                )
                fail_count += 1

            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(text)

            agents_meta.append({
                "agent_id": agent_id,
                "model": model_raw,
                "status": status,
                "file": filename,
                "error": result.get('error')
            })

        summary: Dict[str, Any] = {
            "total": len(results),
            "successes": ok_count,
            "failures": fail_count,
            "agents": agents_meta
        }

        summary_path = os.path.join(output_dir, "_summary.json")
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2)

        logger.info(
            f"Results saved to {output_dir}: "
            f"{summary['successes']} succeeded, {summary['failures']} failed"
        )
        return summary_path

    # ─────────────────────────────────────────────────────────────────────────
    # Utilities
    # ─────────────────────────────────────────────────────────────────────────

    def recommend_model(
        self,
        task_type: Literal[
            "code", "research", "vision", "reasoning", "fast",
            "computer_use", "agentic_vision", "agent_swarm", "screen"
        ],
        priority: Literal["cost", "quality", "speed"] = "cost"
    ) -> Optional[str]:
        """
        Recommend the best available model for a task type.

        Hard overrides:
          computer_use / agentic_vision / screen -> gemini-flash when the native
            Gemini client is up, else the OpenRouter-hosted fallback
          agent_swarm -> kimi-k2-moonshot (or kimi-k25-official) when configured,
            otherwise falls through to the generic capability search

        Args:
            task_type: Descriptor of the task
            priority: Ranking criterion — cost, quality, or speed

        Returns:
            Model key string, or None if no available model matches
        """
        # "screen" was handled here but missing from the Literal above — the
        # annotation now matches the implementation.
        if task_type in ("computer_use", "agentic_vision", "screen"):
            return "gemini-flash" if self._gemini_client else "openrouter-gemini-flash"
        if task_type == "agent_swarm":
            if "kimi-k2-moonshot" in self.clients:
                return "kimi-k2-moonshot"
            if "kimi-k25-official" in self.clients:
                return "kimi-k25-official"

        # A model is usable if a client is configured for it, or it is
        # gemini-flash and the native Gemini client is initialized.
        candidates = [
            (key, model) for key, model in SWARM_MODELS.items()
            if task_type in model.capabilities and (
                key in self.clients
                or (key == "gemini-flash" and self._gemini_client is not None)
            )
        ]

        if not candidates:
            return None

        if priority == "cost":
            candidates.sort(key=lambda x: x[1].cost_per_mtok)
        elif priority == "quality":
            # Price is used as a quality proxy: most expensive first.
            candidates.sort(key=lambda x: -x[1].cost_per_mtok)
        elif priority == "speed":
            # NOTE(review): ascending max_tokens as a speed proxy — confirm this
            # ordering is intended (smaller context window != faster per se).
            candidates.sort(key=lambda x: x[1].max_tokens)

        # The empty case was handled above, so the best candidate always exists.
        return candidates[0][0]

    def get_stats(self) -> Dict[str, Any]:
        """Return swarm statistics including routing table and Gemini native status."""
        available_keys = self.available_models
        total_models = len(SWARM_MODELS)

        # Only keys with a SWARM_MODELS entry carry cost/name metadata; unknown
        # keys still count toward availability but are excluded from costs.
        known = [k for k in available_keys if k in SWARM_MODELS]

        cost_breakdown: Dict[str, str] = {}
        for k in known:
            spec = SWARM_MODELS[k]
            cost_breakdown[spec.name] = f"${spec.cost_per_mtok}/MTok"

        cheapest = (
            min(known, key=lambda k: SWARM_MODELS[k].cost_per_mtok)
            if known else None
        )

        if total_models:
            rate = f"{(len(available_keys) / total_models * 100):.1f}%"
        else:
            rate = "0%"

        return {
            "available_models": len(available_keys),
            "total_models": total_models,
            "availability_rate": rate,
            "models": cost_breakdown,
            "cheapest": cheapest,
            "gemini_native": self._gemini_client is not None,
            "routing_table": TASK_ROUTING,
        }


# ─────────────────────────────────────────────────────────────────────────────
# CLI test entry-point
# ─────────────────────────────────────────────────────────────────────────────

def main():
    """Test script."""
    print("=" * 80)
    print("GENESIS MULTI-MODEL SWARM")
    print("=" * 80)
    print()

    swarm = MultiModelSwarm()

    print("Available Models:")
    for model_key in swarm.available_models:
        if model_key in SWARM_MODELS:
            m = SWARM_MODELS[model_key]
            print(f"  OK {m.name} - ${m.cost_per_mtok}/MTok ({m.provider})")
        else:
            print(f"  OK {model_key}")
    print()

    if not swarm.available_models:
        print("WARNING: No models available. Set API keys:")
        for key, model in SWARM_MODELS.items():
            print(f"  export {model.api_key_env}='your_key'  # {model.name}")
        return

    stats = swarm.get_stats()
    print(f"Stats: {stats['available_models']}/{stats['total_models']} models available")
    print(f"Cheapest: {stats['cheapest']}")
    print(f"Gemini native: {stats['gemini_native']}")
    print()
    print("Routing table:")
    for task_type, model_key in TASK_ROUTING.items():
        print(f"  {task_type:20s} -> {model_key}")
    print()

    test_model = swarm.available_models[0]
    model_name = (
        SWARM_MODELS[test_model].name if test_model in SWARM_MODELS else test_model
    )
    print(f"Testing {model_name}...")

    try:
        result = swarm.execute(
            "Write a one-sentence explanation of async/await in Python.",
            model=test_model,
            max_tokens=100
        )
        print(f"Result: {result}")
    except Exception as e:
        print(f"Error: {e}")

    print()
    print("=" * 80)


if __name__ == "__main__":
    main()
