#!/usr/bin/env python3
"""
GENESIS QWEN RATE MAXIMIZER
============================
Rate limit management for Qwen/AIVA Ollama endpoint.

Unlike Gemini (multiple models), Qwen has a single model on dedicated hardware.
Focus is on:
- Queue management with priority
- AIVA-first access (30% reservation)
- Cold start prevention
- Concurrent request limiting

Usage:
    from core.qwen.rate_maximizer import QwenRateMaximizer

    maximizer = QwenRateMaximizer()
    ok, reason = maximizer.can_execute(priority=1)  # AIVA priority
    if ok:  # can_execute returns a (bool, str) tuple, which is always truthy
        # Execute request
        maximizer.record_usage(tokens=100, duration=1.5)
"""

import threading
import time
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, Any, Optional, Tuple, Deque
from queue import PriorityQueue

from .config import QwenConfig


@dataclass
class UsageWindow:
    """Sliding-window usage tracker with rolling daily totals."""

    # Timestamps of recent requests, oldest first.
    requests: Deque[float] = field(default_factory=deque)
    # (timestamp, token_count) pairs, oldest first.
    tokens: Deque[Tuple[float, int]] = field(default_factory=deque)
    daily_requests: int = 0
    daily_tokens: int = 0
    daily_reset_time: float = field(default_factory=time.time)

    def cleanup(self, window_seconds: int = 60):
        """Drop entries older than *window_seconds*; reset daily counters after 24h."""
        now = time.time()
        cutoff = now - window_seconds

        reqs = self.requests
        while reqs and reqs[0] < cutoff:
            reqs.popleft()

        toks = self.tokens
        while toks and toks[0][0] < cutoff:
            toks.popleft()

        # More than a full day since the last reset: start fresh totals.
        if now - self.daily_reset_time > 86400:
            self.daily_requests = 0
            self.daily_tokens = 0
            self.daily_reset_time = now


@dataclass
class QueuedRequest:
    """Request waiting in priority queue."""

    priority: int  # 1=AIVA, 5=normal, 10=background
    timestamp: float
    request_id: str

    def __lt__(self, other):
        # Lower priority number wins; ties broken by earlier timestamp.
        # Lexicographic tuple comparison expresses both rules at once.
        return (self.priority, self.timestamp) < (other.priority, other.timestamp)


class QwenRateMaximizer:
    """
    Rate limit management for the Qwen/AIVA Ollama endpoint (singleton).

    Manages:
    - RPM (requests per minute) limits
    - TPM (tokens per minute) limits
    - AIVA priority reservation: priority-1 (AIVA) traffic may use 100% of
      capacity, while all other traffic is capped at the non-reserved share
    - Request queuing (a priority queue is created for callers to use)
    """

    _instance: Optional["QwenRateMaximizer"] = None
    # One re-entrant lock guards both singleton creation and usage state.
    _lock = threading.RLock()

    def __new__(cls, config: Optional["QwenConfig"] = None) -> "QwenRateMaximizer":
        """Singleton: every construction returns the same instance.

        Accepts (and ignores) the same arguments as __init__: type.__call__
        forwards constructor arguments to __new__, so the previous zero-arg
        signature made ``QwenRateMaximizer(config)`` raise TypeError.
        """
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._initialized = False
            return cls._instance

    def __init__(self, config: Optional["QwenConfig"] = None):
        """
        One-time initialization; subsequent constructions are no-ops.

        Args:
            config: Endpoint limit configuration; defaults to ``QwenConfig()``.
        """
        # Check-and-init entirely under the lock: an unlocked pre-check could
        # let one thread observe a partially initialized singleton while
        # another thread is still inside __init__.
        with self._lock:
            if self._initialized:
                return

            self.config = config or QwenConfig()
            self._usage_window = UsageWindow()
            self._pending_queue: PriorityQueue = PriorityQueue()

            # AIVA's reserved share of capacity as a fraction in [0, 1].
            self.aiva_reservation = self.config.aiva_reservation_percent / 100.0

            # Set last so a concurrent constructor never sees a half-built
            # instance marked as ready.
            self._initialized = True

    def _get_current_usage(self) -> Dict[str, Any]:
        """Return RPM/TPM usage within the sliding 60s window.

        Side effect: expires stale window entries via ``cleanup()``.
        """
        with self._lock:
            self._usage_window.cleanup()

            rpm_used = len(self._usage_window.requests)
            tpm_used = sum(t[1] for t in self._usage_window.tokens)

            return {
                "rpm_used": rpm_used,
                "rpm_limit": self.config.rpm,
                "rpm_available": max(0, self.config.rpm - rpm_used),
                "tpm_used": tpm_used,
                "tpm_limit": self.config.tpm,
                "tpm_available": max(0, self.config.tpm - tpm_used),
                "utilization": rpm_used / self.config.rpm if self.config.rpm > 0 else 0,
            }

    def _limits_for_priority(self, priority: int) -> Tuple[int, int]:
        """Return (rpm_limit, tpm_limit) applicable to this priority level.

        Priority 1 (AIVA) may use the full capacity; everything else is
        capped at the non-reserved share.
        """
        if priority == 1:
            return self.config.rpm, self.config.tpm
        scale = 1 - self.aiva_reservation
        return int(self.config.rpm * scale), int(self.config.tpm * scale)

    def can_execute(self, priority: int = 5, estimated_tokens: int = 1000) -> Tuple[bool, str]:
        """
        Check if a request can execute now.

        Args:
            priority: 1=AIVA priority, 5=normal, 10=background
            estimated_tokens: Estimated tokens for the request

        Returns:
            Tuple of (can_execute, reason). NOTE: the tuple itself is always
            truthy — callers must unpack and test the boolean.
        """
        with self._lock:
            usage = self._get_current_usage()
            rpm_limit, tpm_limit = self._limits_for_priority(priority)

            # Check RPM
            if usage["rpm_used"] >= rpm_limit:
                return False, f"RPM limit reached ({usage['rpm_used']}/{rpm_limit})"

            # Check TPM (would this request push us over?)
            if usage["tpm_used"] + estimated_tokens > tpm_limit:
                return False, f"TPM limit would be exceeded ({usage['tpm_used']}/{tpm_limit})"

            return True, "OK"

    def get_wait_time(self, priority: int = 5) -> float:
        """
        Get wait time before next execution slot.

        Returns:
            Seconds to wait (0 if can execute now)
        """
        with self._lock:
            usage = self._get_current_usage()
            rpm_limit, _ = self._limits_for_priority(priority)

            rpm_used = usage["rpm_used"]
            if rpm_used < rpm_limit:
                return 0.0

            requests = self._usage_window.requests
            if not requests:
                return 0.0

            # Enough of the oldest requests must fall out of the 60s window
            # to bring usage below this priority's limit. Waiting only for
            # the single oldest entry underestimates the wait whenever the
            # (lower) non-priority cap is exceeded by more than one request.
            idx = min(rpm_used - rpm_limit, len(requests) - 1)
            blocking = requests[idx]
            return max(0.0, 60.0 - (time.time() - blocking))

    def record_usage(self, tokens: int, duration: float):
        """
        Record completed request usage.

        Args:
            tokens: Tokens used in the request
            duration: Request duration in seconds (currently unused; kept
                for API compatibility and future latency tracking)
        """
        with self._lock:
            now = time.time()
            self._usage_window.requests.append(now)
            self._usage_window.tokens.append((now, tokens))
            self._usage_window.daily_requests += 1
            self._usage_window.daily_tokens += tokens

    def get_utilization_report(self) -> Dict[str, Any]:
        """Get comprehensive utilization report (usage, daily stats, tips)."""
        with self._lock:
            usage = self._get_current_usage()

            return {
                "timestamp": datetime.now().isoformat(),
                "current_usage": usage,
                "aiva_reservation_percent": self.aiva_reservation * 100,
                "daily_stats": {
                    "requests": self._usage_window.daily_requests,
                    "tokens": self._usage_window.daily_tokens,
                },
                "recommendations": self._get_recommendations(usage),
            }

    def _get_recommendations(self, usage: Dict) -> list:
        """Generate human-readable recommendations from a usage snapshot."""
        recs = []

        if usage["utilization"] < 0.3:
            recs.append("Capacity underutilized - consider batching requests")
        elif usage["utilization"] > 0.9:
            recs.append("Near capacity - consider request throttling")

        if usage["rpm_available"] < 2:
            recs.append("Low RPM headroom - queue non-priority requests")

        return recs

    @classmethod
    def reset_singleton(cls):
        """Reset singleton (for testing). Existing references keep old state."""
        with cls._lock:
            cls._instance = None


# VERIFICATION_STAMP
# Story: STORY-005
# Verified By: CLAUDE
# Verified At: 2026-01-22
# Tests: Pending
# Coverage: Pending
