#!/usr/bin/env python3
"""
GENESIS GEMINI RATE MAXIMIZER
==============================
Intelligent rate limit tracking and request scheduling to maximize
utilization of Gemini API credits while staying just under limits.

Features:
- Multi-model load balancing based on current utilization
- Sliding window tracking for RPM/TPM/RPD
- Predictive scheduling to hit 90-95% of limits
- Automatic failover when models hit limits
- Burst detection and proactive throttling

Usage:
    maximizer = GeminiRateMaximizer()

    # Get best available model
    model = maximizer.get_best_model(token_estimate=1000)

    # Record usage after request
    maximizer.record_usage(model, input_tokens=500, output_tokens=1000)

    # Get utilization report
    report = maximizer.get_utilization_report()
"""

import json
import time
import threading
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
from enum import Enum


class TaskType(Enum):
    """Task types for intelligent routing.

    The string values double as keys into the config's
    ``routing_rules.task_routing`` table, which maps a task type to a
    preferred model name.
    """
    RESEARCH = "research"
    CODE_GENERATION = "code_generation"
    CODE_REVIEW = "code_review"
    ARCHITECTURE = "architecture"
    SIMPLE_EXTRACTION = "simple_extraction"
    CLASSIFICATION = "classification"
    SUMMARIZATION = "summarization"
    GENERAL = "general"  # Default when no task-specific routing applies


@dataclass
class ModelLimits:
    """Static rate limits and pricing metadata for a single model."""
    rpm: int  # Requests per minute
    tpm: int  # Tokens per minute
    rpd: int  # Requests per day (-1 = unlimited)
    priority: int  # Routing preference; lower number = preferred model
    cost_per_million_input: float  # Cost per 1M input tokens (presumably USD — confirm against config)
    cost_per_million_output: float  # Cost per 1M output tokens (presumably USD — confirm against config)
    use_cases: List[str] = field(default_factory=list)  # Task labels this model is suited for


@dataclass
class UsageWindow:
    """Sliding-window usage tracker for a single model.

    Keeps a one-minute history of requests and token counts plus rolling
    daily totals. Not thread-safe on its own; callers are expected to
    serialize access (the maximizer holds a lock around every call).
    """
    # Timestamps (seconds since epoch) of recent requests. maxlen bounds
    # memory; bursts beyond 10k requests/minute would silently undercount.
    requests: deque = field(default_factory=lambda: deque(maxlen=10000))
    # (timestamp, token_count) pairs for recent requests.
    tokens: deque = field(default_factory=lambda: deque(maxlen=10000))
    daily_requests: int = 0  # Requests since the last daily reset
    daily_tokens: int = 0    # Tokens since the last daily reset
    last_daily_reset: float = field(default_factory=time.time)

    def cleanup_minute_window(self):
        """Drop entries older than 60 seconds from both windows."""
        cutoff = time.time() - 60
        while self.requests and self.requests[0] < cutoff:
            self.requests.popleft()
        while self.tokens and self.tokens[0][0] < cutoff:
            self.tokens.popleft()

    def check_daily_reset(self):
        """Reset daily counters roughly every 24 hours.

        NOTE(review): this is a rolling 24h-since-last-reset window, NOT
        midnight Pacific as Google's actual daily quota clock uses — the
        two can drift by up to a day.
        """
        now = time.time()
        if now - self.last_daily_reset > 86400:
            self.daily_requests = 0
            self.daily_tokens = 0
            self.last_daily_reset = now

    def get_rpm(self) -> int:
        """Return the number of requests in the last 60 seconds."""
        self.cleanup_minute_window()
        return len(self.requests)

    def get_tpm(self) -> int:
        """Return the number of tokens used in the last 60 seconds."""
        self.cleanup_minute_window()
        return sum(t[1] for t in self.tokens)

    def record(self, tokens: int):
        """Record one request carrying ``tokens`` total tokens.

        The daily reset check runs *before* the new request is counted,
        so a request arriving after the 24h boundary is attributed to the
        fresh day instead of being wiped by the reset it triggers (the
        previous implementation incremented first and then zeroed the
        counters, losing the request).
        """
        self.check_daily_reset()
        now = time.time()
        self.requests.append(now)
        self.tokens.append((now, tokens))
        self.daily_requests += 1
        self.daily_tokens += tokens


@dataclass
class ScheduledRequest:
    """A request scheduled for execution."""
    model: str  # Model the request should be sent to
    delay_seconds: float  # Seconds the caller should wait before sending (0.0 = immediately)
    reason: str  # Scheduling decision: "immediate", "rpm_throttle", "tpm_throttle", or "burst_prevention"
    utilization: float  # max_util of the chosen model at scheduling time (fraction, 0.0-1.0)


@dataclass
class UtilizationReport:
    """Point-in-time utilization snapshot across all tracked models."""
    timestamp: str  # ISO-8601 timestamp of report generation (local time)
    models: Dict[str, Dict[str, Any]]  # Per-model utilization / capacity / limits / priority
    best_model: str  # Model get_best_model() would currently select
    total_capacity_used: float  # Aggregate current RPM / total RPM capacity (fraction)
    recommendations: List[str]  # Human-readable load-balancing suggestions


class GeminiRateMaximizer:
    """
    Intelligent Gemini API rate limit maximizer.

    Tracks usage across all models and routes requests to maximize
    throughput while staying within limits.

    All public methods acquire a single re-entrant lock, so one instance
    can safely be shared across threads.
    """

    CONFIG_PATH = Path("E:/genesis-system/config/gemini_rate_limits.json")
    USAGE_LOG_PATH = Path("E:/genesis-system/data/rate_maximizer_usage.jsonl")

    def __init__(self, config_path: Optional[Path] = None):
        """Load configuration (or built-in defaults) and set up tracking.

        Args:
            config_path: Optional override for the JSON config location;
                falls back to CONFIG_PATH when omitted.
        """
        self.config_path = config_path or self.CONFIG_PATH
        self.config = self._load_config()
        self.models: Dict[str, ModelLimits] = {}
        self.usage_windows: Dict[str, UsageWindow] = {}
        # RLock rather than Lock: public methods call each other while
        # already holding the lock (e.g. schedule_request -> get_best_model).
        self._lock = threading.RLock()

        self._initialize_models()

    def _load_config(self) -> Dict:
        """Load configuration from the JSON file, or defaults if absent."""
        if self.config_path.exists():
            with open(self.config_path, encoding="utf-8") as f:
                return json.load(f)
        return self._default_config()

    def _default_config(self) -> Dict:
        """Minimal built-in configuration used when no config file exists."""
        return {
            "target_utilization": 0.90,
            "safety_margin": 0.05,
            "models": {
                "gemini-2.0-flash": {"rpm": 2000, "tpm": 4000000, "rpd": -1, "priority": 1},
                "gemini-2.5-flash": {"rpm": 1000, "tpm": 1000000, "rpd": 10000, "priority": 2},
            }
        }

    def _initialize_models(self):
        """Build ModelLimits and a fresh UsageWindow for each configured model."""
        for model_name, model_config in self.config.get("models", {}).items():
            self.models[model_name] = ModelLimits(
                rpm=model_config.get("rpm", 100),
                tpm=model_config.get("tpm", 100000),
                rpd=model_config.get("rpd", -1),
                priority=model_config.get("priority", 10),
                cost_per_million_input=model_config.get("cost_per_million_input", 0.10),
                cost_per_million_output=model_config.get("cost_per_million_output", 0.40),
                use_cases=model_config.get("use_cases", ["general"])
            )
            self.usage_windows[model_name] = UsageWindow()

    def get_model_utilization(self, model: str) -> Dict[str, float]:
        """
        Get current utilization percentages for a model.

        Args:
            model: Model name.

        Returns:
            Dict with rpm_util / tpm_util / rpd_util fractions (0.0-1.0),
            their max, plus the raw current_rpm / current_tpm /
            daily_requests counters. Unknown models report all zeros.
        """
        with self._lock:
            if model not in self.models:
                # Same key set as the known-model result so callers can
                # index any field without special-casing unknown models.
                return {
                    "rpm_util": 0, "tpm_util": 0, "rpd_util": 0, "max_util": 0,
                    "current_rpm": 0, "current_tpm": 0, "daily_requests": 0
                }

            limits = self.models[model]
            window = self.usage_windows[model]

            rpm_util = window.get_rpm() / limits.rpm if limits.rpm > 0 else 0
            tpm_util = window.get_tpm() / limits.tpm if limits.tpm > 0 else 0

            if limits.rpd > 0:
                rpd_util = window.daily_requests / limits.rpd
            else:
                rpd_util = 0  # Unlimited daily quota never contributes

            return {
                "rpm_util": rpm_util,
                "tpm_util": tpm_util,
                "rpd_util": rpd_util,
                "max_util": max(rpm_util, tpm_util, rpd_util),
                "current_rpm": window.get_rpm(),
                "current_tpm": window.get_tpm(),
                "daily_requests": window.daily_requests
            }

    def get_available_capacity(self, model: str) -> Dict[str, int]:
        """
        Get available capacity for a model.

        Capacity is computed against ``target_utilization`` (default 90%)
        of the hard limit, not the limit itself, to leave headroom.

        Returns:
            Dict with rpm_available / tpm_available / rpd_available.
            rpd_available is -1 when the model has no daily limit;
            unknown models report all zeros.
        """
        with self._lock:
            if model not in self.models:
                return {"rpm_available": 0, "tpm_available": 0, "rpd_available": 0}

            limits = self.models[model]
            window = self.usage_windows[model]
            target = self.config.get("target_utilization", 0.90)

            rpm_available = int(limits.rpm * target) - window.get_rpm()
            tpm_available = int(limits.tpm * target) - window.get_tpm()

            if limits.rpd > 0:
                rpd_available = int(limits.rpd * target) - window.daily_requests
            else:
                rpd_available = float('inf')

            return {
                "rpm_available": max(0, rpm_available),
                "tpm_available": max(0, tpm_available),
                # -1 is the "unlimited" sentinel expected by callers.
                "rpd_available": max(0, int(rpd_available)) if rpd_available != float('inf') else -1
            }

    def get_best_model(
        self,
        token_estimate: int = 1000,
        task_type: TaskType = TaskType.GENERAL
    ) -> str:
        """
        Select the best available model based on current utilization.

        Args:
            token_estimate: Estimated tokens for the request
            task_type: Type of task for intelligent routing

        Returns:
            Model name with most available capacity (task-specific routing
            is honored first when the preferred model has headroom).
        """
        with self._lock:
            # First, try task-specific routing from the config.
            task_routing = self.config.get("routing_rules", {}).get("task_routing", {})
            if task_type.value in task_routing:
                preferred = task_routing[task_type.value]
                capacity = self.get_available_capacity(preferred)
                # BUGFIX: also reject a preferred model whose daily quota
                # is exhausted (rpd_available == 0; -1 means unlimited).
                if (capacity["rpm_available"] > 0
                        and capacity["tpm_available"] >= token_estimate
                        and capacity["rpd_available"] != 0):
                    return preferred

            # Fall back to the model with the best capacity/priority score.
            best_model = None
            best_score = -1

            # Sort by priority (lower number = higher preference).
            sorted_models = sorted(
                self.models.items(),
                key=lambda x: x[1].priority
            )

            for model_name, limits in sorted_models:
                capacity = self.get_available_capacity(model_name)

                # Skip models without enough headroom for this request.
                if capacity["rpm_available"] <= 0:
                    continue
                if capacity["tpm_available"] < token_estimate:
                    continue
                if capacity["rpd_available"] == 0:
                    continue

                # Score = priority bonus + weighted available capacity.
                # Weights are heuristic; unlimited RPD counts as 10000.
                priority_bonus = (10 - limits.priority) * 100
                capacity_score = (
                    capacity["rpm_available"] * 0.5 +
                    capacity["tpm_available"] / 1000 * 0.3 +
                    (capacity["rpd_available"] if capacity["rpd_available"] > 0 else 10000) * 0.2
                )

                score = priority_bonus + capacity_score

                if score > best_score:
                    best_score = score
                    best_model = model_name

            # Fallback to the configured default when everything is saturated.
            if best_model is None:
                best_model = self.config.get("routing_rules", {}).get(
                    "default_model", "gemini-2.0-flash"
                )

            return best_model

    def record_usage(
        self,
        model: str,
        input_tokens: int,
        output_tokens: int,
        success: bool = True
    ):
        """
        Record API usage for a model.

        Args:
            model: Model name
            input_tokens: Number of input tokens
            output_tokens: Number of output tokens
            success: Whether the request succeeded (logged, not tracked
                separately in the rate windows)
        """
        with self._lock:
            # Lazily create a window for models not present in the config
            # so usage is still tracked (limits stay unknown, though).
            if model not in self.usage_windows:
                self.usage_windows[model] = UsageWindow()

            total_tokens = input_tokens + output_tokens
            self.usage_windows[model].record(total_tokens)

            # Append an audit record to the JSONL log.
            self._log_usage(model, input_tokens, output_tokens, success)

    def _log_usage(
        self,
        model: str,
        input_tokens: int,
        output_tokens: int,
        success: bool
    ):
        """Append one usage entry to the JSONL log file."""
        self.USAGE_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

        entry = {
            "timestamp": datetime.now().isoformat(),
            "model": model,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": input_tokens + output_tokens,
            "success": success,
            "utilization": self.get_model_utilization(model)
        }

        with open(self.USAGE_LOG_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry) + "\n")

    def schedule_request(
        self,
        token_estimate: int = 1000,
        task_type: TaskType = TaskType.GENERAL,
        priority: int = 5
    ) -> ScheduledRequest:
        """
        Schedule a request with optimal timing and model selection.

        Args:
            token_estimate: Estimated tokens for the request
            task_type: Type of task
            priority: Request priority (1-10, lower = higher priority;
                currently unused by the scheduling logic)

        Returns:
            ScheduledRequest with model, delay, and reason
        """
        with self._lock:
            best_model = self.get_best_model(token_estimate, task_type)
            capacity = self.get_available_capacity(best_model)
            utilization = self.get_model_utilization(best_model)

            delay = 0.0
            reason = "immediate"

            # BUGFIX: get_best_model can return the configured fallback
            # default even when it is not in self.models; the previous
            # code then raised KeyError on self.models[best_model].
            limits = self.models.get(best_model)

            if limits is None:
                # No known limits to throttle against — dispatch now.
                reason = "immediate"
            elif capacity["rpm_available"] <= 0:
                # Wait roughly one average request interval for the
                # minute window to slide.
                delay = 60.0 / limits.rpm
                reason = "rpm_throttle"
            elif capacity["tpm_available"] < token_estimate:
                # Wait proportionally to the token deficit.
                delay = 60.0 * (token_estimate - capacity["tpm_available"]) / limits.tpm
                reason = "tpm_throttle"
            elif utilization["max_util"] > self.config.get("burst_detection_threshold", 0.85):
                # Approaching limits — insert a small pause proactively.
                delay = 0.5
                reason = "burst_prevention"

            return ScheduledRequest(
                model=best_model,
                delay_seconds=delay,
                reason=reason,
                utilization=utilization["max_util"]
            )

    def get_utilization_report(self) -> UtilizationReport:
        """
        Generate comprehensive utilization report.

        Returns:
            UtilizationReport with all model stats and recommendations
        """
        with self._lock:
            models_report = {}
            total_capacity = 0
            total_used = 0

            for model_name in self.models:
                util = self.get_model_utilization(model_name)
                capacity = self.get_available_capacity(model_name)
                limits = self.models[model_name]

                models_report[model_name] = {
                    "utilization": util,
                    "capacity": capacity,
                    "limits": {
                        "rpm": limits.rpm,
                        "tpm": limits.tpm,
                        "rpd": limits.rpd
                    },
                    "priority": limits.priority
                }

                # Aggregate capacity is RPM-based only.
                total_capacity += limits.rpm
                total_used += util["current_rpm"]

            recommendations = []

            # Flag underutilized models (< 50% of their tightest limit).
            for model_name, report in models_report.items():
                if report["utilization"]["max_util"] < 0.5:
                    recommendations.append(
                        f"Model {model_name} is underutilized ({report['utilization']['max_util']:.1%}). "
                        f"Consider routing more requests here."
                    )

            # Flag models near capacity (> 95%).
            for model_name, report in models_report.items():
                if report["utilization"]["max_util"] > 0.95:
                    recommendations.append(
                        f"Model {model_name} is near capacity ({report['utilization']['max_util']:.1%}). "
                        f"Consider load balancing to other models."
                    )

            # Overall RPM utilization across all models.
            overall_util = total_used / total_capacity if total_capacity > 0 else 0
            if overall_util < 0.5:
                recommendations.append(
                    f"Overall utilization is low ({overall_util:.1%}). "
                    f"Enable research tasks to fill idle capacity."
                )

            return UtilizationReport(
                timestamp=datetime.now().isoformat(),
                models=models_report,
                best_model=self.get_best_model(),
                total_capacity_used=overall_util,
                recommendations=recommendations
            )

    def can_execute_research(self) -> Tuple[bool, str]:
        """
        Check if there's capacity for research tasks.

        Returns:
            (can_execute, reason) — reason embeds the available headroom
            fraction for diagnostics.
        """
        research_config = self.config.get("research_config", {})
        if not research_config.get("enabled", True):
            return False, "research_disabled"

        min_capacity = research_config.get("min_capacity_for_research", 0.20)

        # Research is gated on the headroom of the current best model.
        best_model = self.get_best_model()
        util = self.get_model_utilization(best_model)

        available = 1.0 - util["max_util"]

        if available >= min_capacity:
            return True, f"capacity_available_{available:.1%}"
        else:
            return False, f"insufficient_capacity_{available:.1%}"

    def get_research_budget(self) -> Dict[str, int]:
        """
        Calculate how many research requests can be made.

        Returns:
            Dict with model, requests_available and tokens_available
            (a ``max_research_percentage`` slice of spare capacity)
        """
        research_config = self.config.get("research_config", {})
        max_percentage = research_config.get("max_research_percentage", 0.30)

        best_model = self.get_best_model(task_type=TaskType.RESEARCH)
        capacity = self.get_available_capacity(best_model)

        return {
            "model": best_model,
            "requests_available": int(capacity["rpm_available"] * max_percentage),
            "tokens_available": int(capacity["tpm_available"] * max_percentage)
        }


def _show_status(maximizer):
    """Print the full utilization report for every tracked model."""
    report = maximizer.get_utilization_report()
    heavy_rule = "=" * 60
    light_rule = "-" * 40

    print(f"\n{heavy_rule}")
    print(f"GEMINI RATE MAXIMIZER STATUS - {report.timestamp}")
    print(f"{heavy_rule}\n")

    print(f"Best Model: {report.best_model}")
    print(f"Total Capacity Used: {report.total_capacity_used:.1%}\n")

    print("Model Utilization:")
    print(light_rule)
    for name, data in report.models.items():
        util = data["utilization"]
        limits = data["limits"]
        print(f"  {name}:")
        print(f"    RPM: {util['current_rpm']}/{limits['rpm']} ({util['rpm_util']:.1%})")
        print(f"    TPM: {util['current_tpm']}/{limits['tpm']} ({util['tpm_util']:.1%})")
        print(f"    RPD: {util['daily_requests']}/{limits['rpd']}")
        print()

    if report.recommendations:
        print("Recommendations:")
        print(light_rule)
        for suggestion in report.recommendations:
            print(f"  - {suggestion}")


def _show_best(maximizer, args):
    """Print the best model for the requested token count and task."""
    if args.task == "general":
        chosen_task = TaskType.GENERAL
    else:
        chosen_task = TaskType(args.task)

    model = maximizer.get_best_model(args.tokens, chosen_task)
    capacity = maximizer.get_available_capacity(model)

    print(f"\nBest Model for {args.tokens} tokens ({args.task}):")
    print(f"  Model: {model}")
    print(f"  Available RPM: {capacity['rpm_available']}")
    print(f"  Available TPM: {capacity['tpm_available']}")


def _show_research(maximizer):
    """Print whether research tasks can run and the available budget."""
    can_execute, reason = maximizer.can_execute_research()
    budget = maximizer.get_research_budget()

    print(f"\nResearch Capacity:")
    print(f"  Can Execute: {can_execute} ({reason})")
    print(f"  Model: {budget['model']}")
    print(f"  Requests Available: {budget['requests_available']}")
    print(f"  Tokens Available: {budget['tokens_available']}")


def _run_demo(maximizer):
    """Simulate a burst of requests and show the resulting utilization."""
    print("\n=== RATE MAXIMIZER DEMO ===\n")

    print("Simulating 10 requests...")
    for request_no in range(1, 11):
        scheduled = maximizer.schedule_request(token_estimate=500)
        print(f"  Request {request_no}: {scheduled.model} (delay: {scheduled.delay_seconds:.2f}s, reason: {scheduled.reason})")

        # Pretend the request was executed and record its token usage.
        maximizer.record_usage(scheduled.model, input_tokens=200, output_tokens=300)

    print("\n" + "=" * 40)
    report = maximizer.get_utilization_report()
    print(f"Final utilization: {report.total_capacity_used:.1%}")


def main():
    """CLI entry point: dispatch to the handler for the chosen command."""
    import argparse

    parser = argparse.ArgumentParser(description="Gemini Rate Maximizer")
    parser.add_argument("command", choices=["status", "best", "research", "demo"])
    parser.add_argument("--tokens", type=int, default=1000, help="Token estimate")
    parser.add_argument("--task", type=str, default="general", help="Task type")
    args = parser.parse_args()

    maximizer = GeminiRateMaximizer()

    if args.command == "status":
        _show_status(maximizer)
    elif args.command == "best":
        _show_best(maximizer, args)
    elif args.command == "research":
        _show_research(maximizer)
    elif args.command == "demo":
        _run_demo(maximizer)


if __name__ == "__main__":
    main()
