#!/usr/bin/env python3
"""
GENESIS UNIFIED QWEN CLIENT
============================
THE single Qwen client for Genesis - replaces ALL fragmented implementations.

Deprecated implementations (DO NOT USE):
- /qwen_client.py (wrong port localhost:11434)
- /AIVA/qwen-unified/qwen_client.py (Elestio API)
- /AIVA/qwen-unified/qwen_client_v2.py (localhost again)
- /model_switcher.py (Dashscope SDK)

Features:
- Correct endpoint: 152.53.201.152:23405
- Singleton pattern (thread-safe)
- Circuit breaker via RetryManager
- JSONL usage logging
- Both sync and async interfaces (async in STORY-004)

Usage:
    from core.qwen import UnifiedQwenClient

    client = UnifiedQwenClient()
    response = client.generate_sync("What is 2+2?")
    print(response.text)

Author: Genesis System
Version: 1.0.0
"""

import asyncio
import json
import threading
import time
import urllib.request
import urllib.error
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Optional, List

# Try to import httpx for async, fallback to sync if not available
try:
    import httpx
    HTTPX_AVAILABLE = True
except ImportError:
    HTTPX_AVAILABLE = False

from .config import QwenConfig
from .exceptions import (
    QwenConnectionError,
    QwenTimeoutError,
    QwenModelNotFoundError,
    QwenResponseError,
    QwenCircuitOpenError,
)

# Import retry manager for circuit breaker
import sys
sys.path.insert(0, "/mnt/e/genesis-system")
from core.retry_manager import CircuitBreaker, CircuitState, RetryConfig, RetryStrategy


@dataclass
class QwenResponse:
    """Structured result of a single Qwen generation or chat call."""

    text: str  # the generated completion text
    model: str  # name of the model that produced the output
    tokens_used: int = 0  # prompt_tokens + completion_tokens
    prompt_tokens: int = 0  # tokens consumed by the prompt
    completion_tokens: int = 0  # tokens produced in the completion
    execution_time: float = 0.0  # wall-clock seconds for the request
    success: bool = True  # whether the call completed without error
    thinking: Optional[str] = None  # reasoning trace, for models that emit one
    raw_response: Dict[str, Any] = field(default_factory=dict)  # full API payload

    @property
    def cost_estimate(self) -> float:
        """Estimated cost in dollars; local Ollama inference is always free."""
        return 0.0


class UnifiedQwenClient:
    """
    THE unified Qwen client for Genesis.

    This is a singleton - only one instance exists per process.
    Thread-safe for concurrent access: creation and one-time initialization
    are both guarded by a re-entrant class-level lock.

    Every request is routed through a circuit breaker so a dead endpoint
    fails fast instead of piling up timeouts, and every successful call is
    appended to a JSONL usage log.
    """

    _instance: Optional["UnifiedQwenClient"] = None
    _lock = threading.RLock()

    def __new__(cls) -> "UnifiedQwenClient":
        """Singleton pattern - return the existing instance if one exists."""
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                # Flag is flipped to True only at the END of __init__,
                # after all attributes exist.
                cls._instance._initialized = False
            return cls._instance

    def __init__(self):
        """Initialize client (only runs once due to singleton)."""
        # Lock-free fast path; _initialized is only ever set to True under
        # the lock, and only after initialization has fully completed.
        if self._initialized:
            return

        with self._lock:
            if self._initialized:
                return

            self.config = QwenConfig()

            # Circuit breaker protects the Ollama endpoint: after repeated
            # failures we stop sending requests until the recovery timeout.
            cb_config = self.config.get_circuit_breaker_config()
            self._circuit_breaker = CircuitBreaker(
                name="qwen_ollama",
                failure_threshold=cb_config["failure_threshold"],
                recovery_timeout=cb_config["recovery_timeout"],
                half_open_max_calls=cb_config["half_open_max_calls"],
            )

            # Retry configuration for exponential backoff
            self._retry_config = RetryConfig(
                max_attempts=3,
                base_delay=1.0,
                max_delay=30.0,
                strategy=RetryStrategy.EXPONENTIAL,
                exponential_base=2.0,
                jitter=True,
            )

            # Ensure log directory exists
            self.config.usage_log_path.parent.mkdir(parents=True, exist_ok=True)

            # BUGFIX: set the flag LAST. Previously it was set before the
            # circuit breaker was constructed, so a second thread taking the
            # lock-free fast path above could observe a half-initialized
            # instance and hit AttributeError on _circuit_breaker.
            self._initialized = True

    def _make_request(
        self,
        url: str,
        data: Dict[str, Any],
        timeout: Optional[float] = None,
        bypass_circuit_breaker: bool = False,
    ) -> Dict[str, Any]:
        """
        Make HTTP request to Ollama API with circuit breaker protection.

        Args:
            url: API endpoint URL
            data: Request body as dict
            timeout: Request timeout in seconds (defaults to config read_timeout)
            bypass_circuit_breaker: If True, skip circuit breaker check (for health checks)

        Returns:
            Response JSON as dict

        Raises:
            QwenCircuitOpenError: If circuit breaker is open
            QwenConnectionError: If cannot connect
            QwenTimeoutError: If request times out
            QwenResponseError: If response is invalid
        """
        timeout = timeout or self.config.read_timeout

        # Check circuit breaker before touching the network.
        if not bypass_circuit_breaker and not self._circuit_breaker.allow_request():
            raise QwenCircuitOpenError(
                f"Circuit breaker is open (state: {self._circuit_breaker.state.value})",
                recovery_time=self._circuit_breaker.recovery_timeout,
                details={
                    "failure_count": self._circuit_breaker._failure_count,
                    "threshold": self._circuit_breaker.failure_threshold,
                }
            )

        try:
            request = urllib.request.Request(
                url,
                data=json.dumps(data).encode("utf-8"),
                headers={"Content-Type": "application/json"},
                method="POST"
            )

            with urllib.request.urlopen(request, timeout=timeout) as response:
                response_text = response.read().decode("utf-8")

                try:
                    result = json.loads(response_text)
                    # Success - record with circuit breaker
                    if not bypass_circuit_breaker:
                        self._circuit_breaker.record_success()
                    return result
                except json.JSONDecodeError as e:
                    if not bypass_circuit_breaker:
                        self._circuit_breaker.record_failure()
                    raise QwenResponseError(
                        f"Invalid JSON response: {e}",
                        {"raw_response": response_text[:500]}
                    )

        except urllib.error.URLError as e:
            if not bypass_circuit_breaker:
                self._circuit_breaker.record_failure()
            # urllib wraps socket timeouts in URLError; fall back to
            # message sniffing to classify them as timeouts.
            if "timed out" in str(e).lower():
                raise QwenTimeoutError(
                    f"Request timed out after {timeout}s",
                    {"url": url, "timeout": timeout}
                )
            raise QwenConnectionError(
                f"Cannot connect to Qwen endpoint: {e}",
                {"url": url, "error": str(e)}
            )
        except TimeoutError:
            # socket.timeout is TimeoutError on Python 3.10+.
            if not bypass_circuit_breaker:
                self._circuit_breaker.record_failure()
            raise QwenTimeoutError(
                f"Request timed out after {timeout}s",
                {"url": url, "timeout": timeout}
            )
        except (QwenResponseError, QwenCircuitOpenError):
            # Re-raise our exceptions without recording the failure twice.
            raise
        except Exception as e:
            if not bypass_circuit_breaker:
                self._circuit_breaker.record_failure()
            raise QwenConnectionError(
                f"Unexpected error: {e}",
                {"url": url, "error": str(e)}
            )

    def _log_usage(self, response: QwenResponse, prompt_preview: str):
        """Append one usage record to the JSONL log file (best-effort)."""
        entry = {
            "timestamp": datetime.now().isoformat(),
            "model": response.model,
            "tokens_used": response.tokens_used,
            "prompt_tokens": response.prompt_tokens,
            "completion_tokens": response.completion_tokens,
            "execution_time": round(response.execution_time, 3),
            "success": response.success,
            "prompt_preview": prompt_preview[:100] if prompt_preview else "",
        }

        try:
            # Explicit UTF-8 so prompts with non-ASCII text log correctly
            # regardless of the platform's default encoding.
            with open(self.config.usage_log_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(entry) + "\n")
        except Exception:
            pass  # Deliberate: logging must never break generation.

    def generate_sync(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        options: Optional[Dict[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> QwenResponse:
        """
        Generate text synchronously.

        Args:
            prompt: The user prompt
            system_prompt: Optional system prompt
            temperature: Temperature for generation (0.0-2.0)
            max_tokens: Maximum tokens to generate
            options: Additional Ollama options (merged last, so they win)
            timeout: Request timeout in seconds

        Returns:
            QwenResponse with generated text

        Raises:
            QwenCircuitOpenError: If circuit breaker is open
            QwenConnectionError: If cannot connect
            QwenTimeoutError: If request times out
            QwenResponseError: If response is invalid
        """
        start_time = time.time()

        # Build request
        request_data = {
            "model": self.config.model,
            "prompt": prompt,
            "stream": False,
        }

        # Add system prompt if provided
        if system_prompt:
            request_data["system"] = system_prompt

        # Build options: config defaults, then explicit args, then caller
        # overrides.
        gen_options = self.config.get_generation_options()
        if temperature is not None:
            gen_options["temperature"] = temperature
        if max_tokens is not None:
            gen_options["num_predict"] = max_tokens
        if options:
            gen_options.update(options)

        request_data["options"] = gen_options

        # Make request
        response_data = self._make_request(
            self.config.generate_url,
            request_data,
            timeout=timeout
        )

        execution_time = time.time() - start_time

        # Parse response (Ollama reports token counts as
        # prompt_eval_count / eval_count)
        response = QwenResponse(
            text=response_data.get("response", ""),
            model=response_data.get("model", self.config.model),
            thinking=response_data.get("thinking"),
            tokens_used=(
                response_data.get("prompt_eval_count", 0) +
                response_data.get("eval_count", 0)
            ),
            prompt_tokens=response_data.get("prompt_eval_count", 0),
            completion_tokens=response_data.get("eval_count", 0),
            execution_time=execution_time,
            success=True,
            raw_response=response_data,
        )

        # Log usage
        self._log_usage(response, prompt)

        return response

    def chat_sync(
        self,
        messages: List[Dict[str, str]],
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        options: Optional[Dict[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> QwenResponse:
        """
        Chat completion synchronously.

        Args:
            messages: List of message dicts with 'role' and 'content'
            temperature: Temperature for generation
            max_tokens: Maximum tokens to generate
            options: Additional Ollama options (merged last, so they win)
            timeout: Request timeout in seconds

        Returns:
            QwenResponse with assistant's reply
        """
        start_time = time.time()

        # Build request
        request_data = {
            "model": self.config.model,
            "messages": messages,
            "stream": False,
        }

        # Build options
        gen_options = self.config.get_generation_options()
        if temperature is not None:
            gen_options["temperature"] = temperature
        if max_tokens is not None:
            gen_options["num_predict"] = max_tokens
        if options:
            gen_options.update(options)

        request_data["options"] = gen_options

        # Make request
        response_data = self._make_request(
            self.config.chat_url,
            request_data,
            timeout=timeout
        )

        execution_time = time.time() - start_time

        # Parse response - chat replies arrive nested under "message"
        message = response_data.get("message", {})
        response = QwenResponse(
            text=message.get("content", ""),
            model=response_data.get("model", self.config.model),
            tokens_used=(
                response_data.get("prompt_eval_count", 0) +
                response_data.get("eval_count", 0)
            ),
            prompt_tokens=response_data.get("prompt_eval_count", 0),
            completion_tokens=response_data.get("eval_count", 0),
            execution_time=execution_time,
            success=True,
            raw_response=response_data,
        )

        # Log usage with the last message as the preview
        prompt_preview = messages[-1].get("content", "") if messages else ""
        self._log_usage(response, prompt_preview)

        return response

    async def _make_request_async(
        self,
        url: str,
        data: Dict[str, Any],
        timeout: Optional[float] = None,
        bypass_circuit_breaker: bool = False,
    ) -> Dict[str, Any]:
        """
        Make async HTTP request to Ollama API.

        Uses httpx when available; otherwise runs the sync urllib path in
        the default executor so the event loop is not blocked.
        """
        if not HTTPX_AVAILABLE:
            # BUGFIX: asyncio.get_event_loop() is deprecated inside
            # coroutines since Python 3.10; get_running_loop() is the
            # correct call here (we are always inside a running loop).
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                None,
                lambda: self._make_request(url, data, timeout, bypass_circuit_breaker)
            )

        timeout = timeout or self.config.read_timeout

        # Check circuit breaker
        if not bypass_circuit_breaker and not self._circuit_breaker.allow_request():
            raise QwenCircuitOpenError(
                "Circuit breaker is open",
                recovery_time=self._circuit_breaker.recovery_timeout,
            )

        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                response = await client.post(
                    url,
                    json=data,
                    headers={"Content-Type": "application/json"},
                )
                response.raise_for_status()
                result = response.json()

                if not bypass_circuit_breaker:
                    self._circuit_breaker.record_success()
                return result

        except httpx.TimeoutException:
            if not bypass_circuit_breaker:
                self._circuit_breaker.record_failure()
            raise QwenTimeoutError(f"Request timed out after {timeout}s")
        except httpx.HTTPError as e:
            if not bypass_circuit_breaker:
                self._circuit_breaker.record_failure()
            raise QwenConnectionError(f"HTTP error: {e}")
        except Exception as e:
            if not bypass_circuit_breaker:
                self._circuit_breaker.record_failure()
            raise QwenConnectionError(f"Unexpected error: {e}")

    async def generate(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        options: Optional[Dict[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> QwenResponse:
        """
        Generate text asynchronously.

        Same arguments, return value, and exceptions as generate_sync.
        """
        start_time = time.time()

        request_data = {
            "model": self.config.model,
            "prompt": prompt,
            "stream": False,
        }

        if system_prompt:
            request_data["system"] = system_prompt

        gen_options = self.config.get_generation_options()
        if temperature is not None:
            gen_options["temperature"] = temperature
        if max_tokens is not None:
            gen_options["num_predict"] = max_tokens
        if options:
            gen_options.update(options)

        request_data["options"] = gen_options

        response_data = await self._make_request_async(
            self.config.generate_url,
            request_data,
            timeout=timeout
        )

        execution_time = time.time() - start_time

        response = QwenResponse(
            text=response_data.get("response", ""),
            model=response_data.get("model", self.config.model),
            thinking=response_data.get("thinking"),
            tokens_used=(
                response_data.get("prompt_eval_count", 0) +
                response_data.get("eval_count", 0)
            ),
            prompt_tokens=response_data.get("prompt_eval_count", 0),
            completion_tokens=response_data.get("eval_count", 0),
            execution_time=execution_time,
            success=True,
            raw_response=response_data,
        )

        self._log_usage(response, prompt)
        return response

    async def chat(
        self,
        messages: List[Dict[str, str]],
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        options: Optional[Dict[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> QwenResponse:
        """
        Chat completion asynchronously.

        Same arguments, return value, and exceptions as chat_sync.
        """
        start_time = time.time()

        request_data = {
            "model": self.config.model,
            "messages": messages,
            "stream": False,
        }

        gen_options = self.config.get_generation_options()
        if temperature is not None:
            gen_options["temperature"] = temperature
        if max_tokens is not None:
            gen_options["num_predict"] = max_tokens
        if options:
            gen_options.update(options)

        request_data["options"] = gen_options

        response_data = await self._make_request_async(
            self.config.chat_url,
            request_data,
            timeout=timeout
        )

        execution_time = time.time() - start_time

        message = response_data.get("message", {})
        response = QwenResponse(
            text=message.get("content", ""),
            model=response_data.get("model", self.config.model),
            tokens_used=(
                response_data.get("prompt_eval_count", 0) +
                response_data.get("eval_count", 0)
            ),
            prompt_tokens=response_data.get("prompt_eval_count", 0),
            completion_tokens=response_data.get("eval_count", 0),
            execution_time=execution_time,
            success=True,
            raw_response=response_data,
        )

        prompt_preview = messages[-1].get("content", "") if messages else ""
        self._log_usage(response, prompt_preview)
        return response

    def check_model_loaded(self) -> bool:
        """
        Check if the configured model is available in Ollama's tag list.

        Returns:
            True if the model name appears in /api/tags, False otherwise
            (including on any connection error).
        """
        try:
            request = urllib.request.Request(self.config.tags_url)
            with urllib.request.urlopen(
                request, timeout=self.config.connect_timeout
            ) as response:
                data = json.loads(response.read().decode())
                models = [m.get("name") for m in data.get("models", [])]
                return self.config.model in models
        except Exception:
            return False

    def get_running_models(self) -> List[Dict[str, Any]]:
        """
        Get list of currently running (loaded) models via /api/ps.

        Returns:
            List of model info dicts, or [] on any error.
        """
        try:
            request = urllib.request.Request(self.config.ps_url)
            with urllib.request.urlopen(
                request, timeout=self.config.connect_timeout
            ) as response:
                data = json.loads(response.read().decode())
                return data.get("models", [])
        except Exception:
            return []

    def ping(self, timeout: Optional[float] = None) -> Dict[str, Any]:
        """
        Send a lightweight generation to keep the model warm.

        Args:
            timeout: Seconds to wait (defaults to config warmth_ping_timeout)

        Returns:
            Dict with "status" ("warm" / "cold" / "error"), latency, and
            model info on success.
        """
        timeout = timeout or self.config.warmth_ping_timeout

        start = time.time()
        try:
            response = self.generate_sync(
                prompt="1+1=",
                max_tokens=5,
                timeout=timeout
            )
            latency = time.time() - start

            return {
                "status": "warm",
                "latency_seconds": round(latency, 3),
                "model": response.model,
                "response": response.text.strip(),
            }
        except QwenTimeoutError:
            # A timeout on a tiny prompt means the model needs loading.
            return {
                "status": "cold",
                "latency_seconds": time.time() - start,
                "error": "timeout",
            }
        except Exception as e:
            return {
                "status": "error",
                "latency_seconds": time.time() - start,
                "error": str(e),
            }

    def get_circuit_breaker_status(self) -> Dict[str, Any]:
        """
        Get circuit breaker status.

        Returns:
            Dict with state, failure count, and config
        """
        return self._circuit_breaker.get_status()

    def reset_circuit_breaker(self):
        """
        Reset circuit breaker to closed state.

        Use with caution - only when you know the endpoint is healthy.
        """
        self._circuit_breaker.reset()

    def get_status(self) -> Dict[str, Any]:
        """
        Get comprehensive client status.

        Returns:
            Dict with connection status, model info, circuit breaker, and config
        """
        return {
            "endpoint": self.config.base_url,
            "model": self.config.model,
            "model_loaded": self.check_model_loaded(),
            "running_models": self.get_running_models(),
            "circuit_breaker": self.get_circuit_breaker_status(),
            "config": self.config.to_dict(),
        }

    @classmethod
    def reset_singleton(cls):
        """Reset singleton instance (for testing only)."""
        with cls._lock:
            cls._instance = None


# Convenience function
def get_qwen_client() -> UnifiedQwenClient:
    """Return the process-wide UnifiedQwenClient singleton."""
    # Constructor is a singleton, so this always yields the same instance.
    client = UnifiedQwenClient()
    return client


# VERIFICATION_STAMP
# Story: STORY-003 (Circuit Breaker Integration)
# Verified By: CLAUDE
# Verified At: 2026-01-22
# Tests: Pending
# Coverage: Pending
