"""
Genesis Self-Optimizer - Autonomous Prompt Evolution System

Implements the OpenAI cookbook's self-evolving agent pattern:
1. Evaluation Framework (4 graders)
2. Metaprompt Generator
3. Retraining Loop
4. Version Tracking with Rollback

Based on: https://cookbook.openai.com/examples/partners/self_evolving_agents/autonomous_agent_retraining

Usage:
    from self_optimizer import SelfOptimizer

    optimizer = SelfOptimizer()

    # Record task execution
    optimizer.record_execution(
        agent_name="discovery_agent",
        prompt_version="v1.0",
        task_input="Scan arXiv for agent papers",
        task_output="Found 31 papers",
        expected_output="Find relevant agent papers",
        execution_time_ms=5000
    )

    # Evaluate and optimize
    results = optimizer.evaluate_agent("discovery_agent")
    if results["needs_optimization"]:
        new_prompt = optimizer.generate_optimized_prompt("discovery_agent")
"""

import json
import hashlib
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from enum import Enum

try:
    import psycopg2
    from psycopg2.extras import RealDictCursor
except ImportError:
    psycopg2 = None

try:
    from anthropic import Anthropic
except ImportError:
    Anthropic = None


class GraderType(Enum):
    """Four evaluation graders from OpenAI cookbook.

    The enum *values* double as the JSON keys stored in
    execution_records.evaluation_scores / grader_feedback, and
    record_execution iterates this enum in declaration order.
    """
    PYTHON_VALIDATION = "python"      # Structured checks (format, schema)
    DEVIATION_SCORE = "deviation"     # Quantitative deviation from expected
    SEMANTIC_SIMILARITY = "semantic"  # Word-overlap similarity (simplified; no embeddings)
    LLM_JUDGE = "llm_judge"          # Qualitative LLM assessment


@dataclass
class EvaluationResult:
    """Result from a single grader evaluation."""
    grader_type: str  # GraderType value ("python", "deviation", "semantic", "llm_judge")
    score: float  # 0.0 to 1.0
    passed: bool  # whether score met the grader's own pass threshold
    feedback: str  # human-readable explanation of the score
    details: Dict[str, Any]  # grader-specific diagnostics (persisted as JSONB)


@dataclass
class PromptVersion:
    """Versioned prompt with performance tracking."""
    version: str  # e.g. "v1.0" or auto-generated "v<UTC timestamp>"
    agent_name: str  # agent this prompt belongs to
    prompt_text: str  # full system prompt text
    created_at: str  # stringified DB timestamp
    performance_score: float  # rolling average of the four grader scores
    execution_count: int  # number of recorded executions under this version
    is_active: bool  # at most one active version per agent
    parent_version: Optional[str] = None  # version this one was derived from, if any


@dataclass
class ExecutionRecord:
    """Record of a single agent execution for learning."""
    agent_name: str  # agent that ran the task
    prompt_version: str  # prompt version in effect during the run
    task_input: str  # task given to the agent
    task_output: str  # agent's produced output
    expected_output: Optional[str]  # reference output, when one exists
    execution_time_ms: int  # wall-clock duration of the run
    evaluation_scores: Dict[str, float]  # grader value -> score (0.0-1.0)
    timestamp: str  # when the execution was recorded
    success: bool  # caller-reported success flag


class SelfOptimizer:
    """
    Genesis Self-Optimizer - Implements autonomous prompt evolution.

    Key Components:
    1. Execution Recording - Store all agent executions for learning
    2. Multi-Grader Evaluation - 4 evaluation methods for comprehensive assessment
    3. Metaprompt Generation - LLM generates improved prompts
    4. Version Control - Track all prompt versions with rollback capability

    All persistence goes through PostgreSQL (psycopg2). LLM-backed features
    (LLM judge, metaprompt generation) need ANTHROPIC_API_KEY and degrade to
    heuristics / no-ops when it is absent.
    """

    def __init__(self, db_config: Optional[Dict] = None):
        """Initialize with database connection.

        Args:
            db_config: psycopg2 connection kwargs. When None, loads the
                Genesis Elestio PostgreSQL defaults from elestio_config.
        """
        if db_config is None:
            # Default to Genesis Elestio PostgreSQL
            from elestio_config import PostgresConfig
            db_config = PostgresConfig.get_connection_params()

        self.db_config = db_config
        self._ensure_schema()

        # Initialize Anthropic client if available; stays None when the SDK
        # or API key is missing, and callers fall back accordingly.
        self.claude_client = None
        if Anthropic:
            import os
            api_key = os.environ.get("ANTHROPIC_API_KEY")
            if api_key:
                self.claude_client = Anthropic(api_key=api_key)

    def _get_connection(self):
        """Get a new PostgreSQL connection; the caller must close it.

        Raises:
            ImportError: if psycopg2 is not installed.
        """
        if psycopg2 is None:
            raise ImportError("psycopg2 not installed")
        return psycopg2.connect(**self.db_config)

    def _ensure_schema(self):
        """Ensure self-optimizer tables exist (idempotent DDL)."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()

            cursor.execute("""
                CREATE TABLE IF NOT EXISTS prompt_versions (
                    id SERIAL PRIMARY KEY,
                    version VARCHAR(50) NOT NULL,
                    agent_name VARCHAR(200) NOT NULL,
                    prompt_text TEXT NOT NULL,
                    prompt_hash VARCHAR(64) NOT NULL,
                    created_at TIMESTAMPTZ DEFAULT NOW(),
                    performance_score DECIMAL(5,4) DEFAULT 0.0,
                    execution_count INTEGER DEFAULT 0,
                    is_active BOOLEAN DEFAULT FALSE,
                    parent_version VARCHAR(50),
                    metadata JSONB DEFAULT '{}',
                    UNIQUE(agent_name, version)
                );

                CREATE TABLE IF NOT EXISTS execution_records (
                    id SERIAL PRIMARY KEY,
                    agent_name VARCHAR(200) NOT NULL,
                    prompt_version VARCHAR(50) NOT NULL,
                    task_input TEXT,
                    task_output TEXT,
                    expected_output TEXT,
                    execution_time_ms INTEGER,
                    evaluation_scores JSONB DEFAULT '{}',
                    grader_feedback JSONB DEFAULT '{}',
                    timestamp TIMESTAMPTZ DEFAULT NOW(),
                    success BOOLEAN DEFAULT TRUE
                );

                CREATE TABLE IF NOT EXISTS optimization_runs (
                    id SERIAL PRIMARY KEY,
                    agent_name VARCHAR(200) NOT NULL,
                    run_at TIMESTAMPTZ DEFAULT NOW(),
                    old_version VARCHAR(50),
                    new_version VARCHAR(50),
                    improvement_delta DECIMAL(5,4),
                    trigger_reason TEXT,
                    metaprompt_used TEXT,
                    status VARCHAR(50) DEFAULT 'completed'
                );

                CREATE INDEX IF NOT EXISTS idx_exec_agent ON execution_records(agent_name);
                CREATE INDEX IF NOT EXISTS idx_exec_timestamp ON execution_records(timestamp);
                CREATE INDEX IF NOT EXISTS idx_prompt_active ON prompt_versions(agent_name, is_active);
            """)

            conn.commit()
        finally:
            # Close even on failure so we don't leak the connection.
            conn.close()

    def record_execution(
        self,
        agent_name: str,
        prompt_version: str,
        task_input: str,
        task_output: str,
        expected_output: Optional[str] = None,
        execution_time_ms: int = 0,
        success: bool = True
    ) -> int:
        """
        Record an agent execution for learning.

        Runs all four graders on the output, stores the execution record,
        and refreshes the rolling performance score of the prompt version.

        Returns:
            Execution record ID
        """
        # Run all graders
        evaluation_scores: Dict[str, float] = {}
        grader_feedback: Dict[str, Any] = {}

        for grader_type in GraderType:
            result = self._run_grader(
                grader_type,
                task_input,
                task_output,
                expected_output
            )
            evaluation_scores[grader_type.value] = result.score
            grader_feedback[grader_type.value] = {
                "passed": result.passed,
                "feedback": result.feedback,
                "details": result.details
            }

        # Store execution record
        conn = self._get_connection()
        try:
            cursor = conn.cursor()

            cursor.execute("""
                INSERT INTO execution_records
                (agent_name, prompt_version, task_input, task_output, expected_output,
                 execution_time_ms, evaluation_scores, grader_feedback, success)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                RETURNING id
            """, (
                agent_name,
                prompt_version,
                task_input,
                task_output,
                expected_output,
                execution_time_ms,
                json.dumps(evaluation_scores),
                json.dumps(grader_feedback),
                success
            ))

            record_id = cursor.fetchone()[0]

            # Update prompt version stats. Rows missing any of the four grader
            # keys produce a NULL sum and are skipped by AVG.
            cursor.execute("""
                UPDATE prompt_versions
                SET execution_count = execution_count + 1,
                    performance_score = (
                        SELECT AVG((evaluation_scores->>'python')::float +
                                   (evaluation_scores->>'deviation')::float +
                                   (evaluation_scores->>'semantic')::float +
                                   (evaluation_scores->>'llm_judge')::float) / 4.0
                        FROM execution_records
                        WHERE agent_name = %s AND prompt_version = %s
                    )
                WHERE agent_name = %s AND version = %s
            """, (agent_name, prompt_version, agent_name, prompt_version))

            conn.commit()
        finally:
            conn.close()

        return record_id

    def _run_grader(
        self,
        grader_type: GraderType,
        task_input: str,
        task_output: str,
        expected_output: Optional[str]
    ) -> EvaluationResult:
        """Dispatch to the implementation for *grader_type*."""

        if grader_type == GraderType.PYTHON_VALIDATION:
            return self._grade_python_validation(task_output, expected_output)
        elif grader_type == GraderType.DEVIATION_SCORE:
            return self._grade_deviation(task_output, expected_output)
        elif grader_type == GraderType.SEMANTIC_SIMILARITY:
            return self._grade_semantic(task_output, expected_output)
        elif grader_type == GraderType.LLM_JUDGE:
            return self._grade_llm_judge(task_input, task_output, expected_output)
        else:
            # Defensive fallback for a future enum member without a grader.
            return EvaluationResult(
                grader_type=grader_type.value,
                score=0.5,
                passed=True,
                feedback="Unknown grader type",
                details={}
            )

    def _grade_python_validation(
        self,
        output: str,
        expected: Optional[str]
    ) -> EvaluationResult:
        """
        Grader 1: Python Validation
        Checks format, structure, completeness.

        Deductions: -0.5 empty output, -0.2 error keyword, -0.3 invalid JSON.
        Pass threshold: 0.7.
        """
        score = 1.0
        issues: List[str] = []

        # Check output is not empty
        if not output or len(output.strip()) == 0:
            score -= 0.5
            issues.append("Output is empty")

        # Check for error indicators (only the first match is penalized)
        error_keywords = ["error", "exception", "failed", "traceback"]
        output_lower = output.lower()
        for keyword in error_keywords:
            if keyword in output_lower:
                score -= 0.2
                issues.append(f"Contains error indicator: {keyword}")
                break

        # Check for JSON validity if output looks like JSON
        if output.strip().startswith("{") or output.strip().startswith("["):
            try:
                json.loads(output)
            except json.JSONDecodeError:
                score -= 0.3
                issues.append("Invalid JSON structure")

        score = max(0.0, min(1.0, score))

        return EvaluationResult(
            grader_type="python",
            score=score,
            passed=score >= 0.7,
            feedback="; ".join(issues) if issues else "Passed all validation checks",
            details={"issues": issues}
        )

    def _grade_deviation(
        self,
        output: str,
        expected: Optional[str]
    ) -> EvaluationResult:
        """
        Grader 2: Deviation Scoring
        Quantitative comparison to expected output.

        Score = 0.7 * Jaccard similarity of character sets
              + 0.3 * length ratio. Pass threshold: 0.5.
        """
        if not expected:
            # No expected output, assume passing
            return EvaluationResult(
                grader_type="deviation",
                score=0.8,
                passed=True,
                feedback="No expected output to compare",
                details={"has_expected": False}
            )

        # Calculate character-level similarity
        output_chars = set(output.lower())
        expected_chars = set(expected.lower())

        if len(output_chars) == 0 or len(expected_chars) == 0:
            score = 0.0
        else:
            intersection = output_chars & expected_chars
            union = output_chars | expected_chars
            score = len(intersection) / len(union)  # Jaccard similarity

        # Calculate length deviation (the trailing 1 guards divide-by-zero)
        len_ratio = min(len(output), len(expected)) / max(len(output), len(expected), 1)

        # Combined score
        final_score = (score * 0.7) + (len_ratio * 0.3)

        return EvaluationResult(
            grader_type="deviation",
            score=final_score,
            passed=final_score >= 0.5,
            feedback=f"Character overlap: {score:.2%}, Length ratio: {len_ratio:.2%}",
            details={"char_similarity": score, "length_ratio": len_ratio}
        )

    def _grade_semantic(
        self,
        output: str,
        expected: Optional[str]
    ) -> EvaluationResult:
        """
        Grader 3: Semantic Similarity
        Uses word overlap for semantic matching (simplified).
        Full implementation would use embeddings.

        Score = Jaccard similarity of word sets, boosted by up to 0.2 for
        shared domain keywords. Pass threshold: 0.4.
        """
        if not expected:
            return EvaluationResult(
                grader_type="semantic",
                score=0.8,
                passed=True,
                feedback="No expected output for semantic comparison",
                details={"has_expected": False}
            )

        # Simple word-based semantic similarity
        import re

        def tokenize(text: str) -> set:
            words = re.findall(r'\b\w+\b', text.lower())
            return set(words)

        output_words = tokenize(output)
        expected_words = tokenize(expected)

        if len(output_words) == 0 or len(expected_words) == 0:
            score = 0.0
        else:
            intersection = output_words & expected_words
            union = output_words | expected_words
            score = len(intersection) / len(union)

        # Boost score for key concept matches (0.05 each, capped at 0.2)
        key_concepts = ["agent", "autonomous", "llm", "ai", "learning", "evolution"]
        concept_matches = sum(1 for c in key_concepts if c in output_words and c in expected_words)
        concept_boost = min(0.2, concept_matches * 0.05)

        final_score = min(1.0, score + concept_boost)

        return EvaluationResult(
            grader_type="semantic",
            score=final_score,
            passed=final_score >= 0.4,
            feedback=f"Word overlap: {score:.2%}, Concept matches: {concept_matches}",
            details={"word_similarity": score, "concept_matches": concept_matches}
        )

    def _grade_llm_judge(
        self,
        task_input: str,
        task_output: str,
        expected_output: Optional[str]
    ) -> EvaluationResult:
        """
        Grader 4: LLM-as-Judge
        Qualitative assessment using Claude.

        Falls back to a length/keyword heuristic when no client is
        configured, and never raises: API failures return a neutral 0.7.
        """
        if not self.claude_client:
            # Fallback: heuristic-based judgment
            score = 0.7
            if len(task_output) > 100:
                score += 0.1
            if expected_output and any(w in task_output.lower() for w in expected_output.lower().split()[:5]):
                score += 0.1

            return EvaluationResult(
                grader_type="llm_judge",
                score=min(1.0, score),
                passed=True,
                feedback="Heuristic evaluation (no API key available)",
                details={"method": "heuristic"}
            )

        try:
            prompt = f"""You are an AI output quality evaluator. Score the following agent output.

TASK INPUT:
{task_input[:500]}

AGENT OUTPUT:
{task_output[:1000]}

{f'EXPECTED OUTPUT: {expected_output[:500]}' if expected_output else ''}

Rate the output quality on these dimensions:
1. Relevance (0-1): Does the output address the task?
2. Completeness (0-1): Is the output thorough?
3. Accuracy (0-1): Is the information correct?
4. Clarity (0-1): Is the output clear and well-structured?

Respond in JSON format:
{{"relevance": X.X, "completeness": X.X, "accuracy": X.X, "clarity": X.X, "overall": X.X, "feedback": "..."}}"""

            response = self.claude_client.messages.create(
                model="claude-3-haiku-20240307",  # Use Haiku for cost efficiency
                max_tokens=200,
                messages=[{"role": "user", "content": prompt}]
            )

            result_text = response.content[0].text

            # Models sometimes wrap the JSON in prose or code fences, so
            # extract the outermost {...} span before parsing.
            start = result_text.find("{")
            end = result_text.rfind("}")
            candidate = result_text[start:end + 1] if 0 <= start < end else result_text
            try:
                result = json.loads(candidate)
                score = float(result.get("overall", 0.7))
                feedback = result.get("feedback", "LLM evaluation complete")
            except (json.JSONDecodeError, TypeError, ValueError):
                score = 0.7
                feedback = "Could not parse LLM response"

            return EvaluationResult(
                grader_type="llm_judge",
                score=score,
                passed=score >= 0.6,
                feedback=feedback,
                details={"method": "claude", "raw_response": result_text[:200]}
            )

        except Exception as e:
            # Best-effort grader: API errors yield a neutral passing score.
            return EvaluationResult(
                grader_type="llm_judge",
                score=0.7,
                passed=True,
                feedback=f"LLM evaluation error: {str(e)[:100]}",
                details={"method": "error", "error": str(e)}
            )

    def evaluate_agent(
        self,
        agent_name: str,
        min_executions: int = 5,
        lookback_hours: int = 24
    ) -> Dict[str, Any]:
        """
        Evaluate an agent's recent performance and determine if optimization is needed.

        Args:
            agent_name: Agent whose execution_records are analyzed.
            min_executions: Minimum sample size before a verdict is attempted.
            lookback_hours: Only executions newer than this are considered.

        Returns:
            Dictionary with evaluation results and optimization recommendation
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor(cursor_factory=RealDictCursor)

            # Get recent executions. The interval is built with arithmetic
            # (%s * INTERVAL '1 hour') because a placeholder inside a quoted
            # literal is not a reliable psycopg2 parameter.
            cursor.execute("""
                SELECT evaluation_scores, success, execution_time_ms
                FROM execution_records
                WHERE agent_name = %s
                  AND timestamp > NOW() - %s * INTERVAL '1 hour'
                ORDER BY timestamp DESC
            """, (agent_name, lookback_hours))

            executions = cursor.fetchall()
        finally:
            conn.close()

        if len(executions) < min_executions:
            return {
                "agent_name": agent_name,
                "evaluation_status": "insufficient_data",
                "execution_count": len(executions),
                "needs_optimization": False,
                "reason": f"Need {min_executions} executions, have {len(executions)}"
            }

        # Calculate aggregate scores
        grader_scores: Dict[str, List[float]] = {g.value: [] for g in GraderType}
        success_rate = 0.0
        avg_time = 0.0

        for record in executions:  # renamed from `exec`, which shadowed the builtin
            scores = record["evaluation_scores"] or {}
            for grader, score in scores.items():
                if grader in grader_scores:
                    grader_scores[grader].append(score)
            if record["success"]:
                success_rate += 1
            # execution_time_ms is nullable; treat NULL as 0
            avg_time += record.get("execution_time_ms") or 0

        success_rate /= len(executions)
        avg_time /= len(executions)

        avg_scores = {g: sum(s)/len(s) if s else 0 for g, s in grader_scores.items()}
        overall_score = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0

        # Determine if optimization is needed
        needs_optimization = (
            overall_score < 0.7 or
            success_rate < 0.9 or
            any(s < 0.5 for s in avg_scores.values())
        )

        optimization_reason = None
        if needs_optimization:
            if overall_score < 0.7:
                optimization_reason = f"Overall score {overall_score:.2%} below threshold"
            elif success_rate < 0.9:
                optimization_reason = f"Success rate {success_rate:.2%} below threshold"
            else:
                low_graders = [g for g, s in avg_scores.items() if s < 0.5]
                optimization_reason = f"Low scores on: {', '.join(low_graders)}"

        return {
            "agent_name": agent_name,
            "evaluation_status": "completed",
            "execution_count": len(executions),
            "grader_scores": avg_scores,
            "overall_score": overall_score,
            "success_rate": success_rate,
            "avg_execution_time_ms": avg_time,
            "needs_optimization": needs_optimization,
            "optimization_reason": optimization_reason
        }

    def register_prompt(
        self,
        agent_name: str,
        prompt_text: str,
        version: Optional[str] = None,
        parent_version: Optional[str] = None,
        set_active: bool = True
    ) -> str:
        """
        Register a new prompt version.

        Args:
            agent_name: Owner of the prompt.
            prompt_text: Full system prompt text.
            version: Explicit version label; auto-generated from the UTC
                timestamp when omitted.
            parent_version: Version this prompt was derived from, if any.
            set_active: When True, deactivates the agent's other versions.

        Returns:
            The version identifier
        """
        if version is None:
            # Auto-generate version
            timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
            version = f"v{timestamp}"

        # Short content hash for change detection / dedup in the table.
        prompt_hash = hashlib.sha256(prompt_text.encode()).hexdigest()[:16]

        conn = self._get_connection()
        try:
            cursor = conn.cursor()

            # If setting active, deactivate other versions first
            if set_active:
                cursor.execute("""
                    UPDATE prompt_versions
                    SET is_active = FALSE
                    WHERE agent_name = %s
                """, (agent_name,))

            cursor.execute("""
                INSERT INTO prompt_versions
                (version, agent_name, prompt_text, prompt_hash, is_active, parent_version)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (agent_name, version) DO UPDATE
                SET prompt_text = EXCLUDED.prompt_text,
                    prompt_hash = EXCLUDED.prompt_hash,
                    is_active = EXCLUDED.is_active
            """, (version, agent_name, prompt_text, prompt_hash, set_active, parent_version))

            conn.commit()
        finally:
            conn.close()

        return version

    def get_active_prompt(self, agent_name: str) -> Optional[PromptVersion]:
        """Get the currently active prompt for an agent, or None."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor(cursor_factory=RealDictCursor)

            cursor.execute("""
                SELECT version, agent_name, prompt_text, created_at,
                       performance_score, execution_count, is_active, parent_version
                FROM prompt_versions
                WHERE agent_name = %s AND is_active = TRUE
            """, (agent_name,))

            row = cursor.fetchone()
        finally:
            conn.close()

        if row:
            return PromptVersion(
                version=row["version"],
                agent_name=row["agent_name"],
                prompt_text=row["prompt_text"],
                created_at=str(row["created_at"]),
                performance_score=float(row["performance_score"]),
                execution_count=row["execution_count"],
                is_active=row["is_active"],
                parent_version=row["parent_version"]
            )
        return None

    def generate_optimized_prompt(
        self,
        agent_name: str,
        register: bool = True
    ) -> Optional[str]:
        """
        Generate an optimized prompt using metaprompt technique.

        Args:
            agent_name: Agent whose active prompt should be improved.
            register: When True, the new prompt is registered as the active
                version and an optimization_runs row is written.

        Returns:
            New optimized prompt text, or None if optimization not possible
            (no LLM client, no active prompt, or the API call failed).
        """
        if not self.claude_client:
            return None

        # Get current prompt
        current = self.get_active_prompt(agent_name)
        if not current:
            return None

        # Get recent failure patterns
        conn = self._get_connection()
        try:
            cursor = conn.cursor(cursor_factory=RealDictCursor)

            cursor.execute("""
                SELECT task_input, task_output, grader_feedback
                FROM execution_records
                WHERE agent_name = %s
                  AND (success = FALSE OR
                       (evaluation_scores->>'python')::float < 0.7 OR
                       (evaluation_scores->>'llm_judge')::float < 0.7)
                ORDER BY timestamp DESC
                LIMIT 5
            """, (agent_name,))

            failures = cursor.fetchall()
        finally:
            conn.close()

        failure_examples = ""
        for f in failures:
            # task_input/task_output are nullable columns; guard before slicing.
            failure_examples += f"\nInput: {(f['task_input'] or '')[:200]}\n"
            failure_examples += f"Output: {(f['task_output'] or '')[:200]}\n"
            failure_examples += f"Feedback: {json.dumps(f['grader_feedback'])[:300]}\n"

        # Generate optimized prompt using metaprompt
        metaprompt = f"""You are a prompt optimization expert. Your task is to improve an agent's system prompt based on failure analysis.

CURRENT PROMPT:
{current.prompt_text[:2000]}

RECENT FAILURES AND FEEDBACK:
{failure_examples if failure_examples else "No recent failures recorded"}

PERFORMANCE METRICS:
- Current performance score: {current.performance_score:.2%}
- Total executions: {current.execution_count}

TASK: Generate an improved version of this prompt that addresses the failure patterns.
Focus on:
1. Clearer instructions for edge cases
2. Better output format specifications
3. More specific success criteria
4. Error handling guidance

Respond with ONLY the improved prompt text, no explanations."""

        try:
            response = self.claude_client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=2000,
                messages=[{"role": "user", "content": metaprompt}]
            )

            new_prompt = response.content[0].text

            if register:
                new_version = self.register_prompt(
                    agent_name=agent_name,
                    prompt_text=new_prompt,
                    parent_version=current.version,
                    set_active=True
                )

                # Record optimization run for the audit trail
                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute("""
                        INSERT INTO optimization_runs
                        (agent_name, old_version, new_version, trigger_reason, metaprompt_used)
                        VALUES (%s, %s, %s, %s, %s)
                    """, (
                        agent_name,
                        current.version,
                        new_version,
                        "Performance below threshold",
                        metaprompt[:500]
                    ))
                    conn.commit()
                finally:
                    conn.close()

            return new_prompt

        except Exception as e:
            print(f"Prompt optimization error: {e}")
            return None

    def rollback_prompt(self, agent_name: str, to_version: str) -> bool:
        """
        Rollback to a previous prompt version.

        The whole switch happens in one transaction: if *to_version* does not
        exist, the transaction is rolled back so the previously active prompt
        stays active (the old code committed a state with NO active prompt).

        Returns:
            True if successful
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()

            # Deactivate every other version
            cursor.execute("""
                UPDATE prompt_versions
                SET is_active = FALSE
                WHERE agent_name = %s AND version <> %s
            """, (agent_name, to_version))

            # Activate target version
            cursor.execute("""
                UPDATE prompt_versions
                SET is_active = TRUE
                WHERE agent_name = %s AND version = %s
            """, (agent_name, to_version))

            if cursor.rowcount == 0:
                # Unknown target version: abort so nothing changes.
                conn.rollback()
                return False

            conn.commit()
            return True
        finally:
            conn.close()

    def get_optimization_history(
        self,
        agent_name: str,
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """Get history of optimization runs for an agent, newest first."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor(cursor_factory=RealDictCursor)

            cursor.execute("""
                SELECT run_at, old_version, new_version, improvement_delta,
                       trigger_reason, status
                FROM optimization_runs
                WHERE agent_name = %s
                ORDER BY run_at DESC
                LIMIT %s
            """, (agent_name, limit))

            rows = cursor.fetchall()
        finally:
            conn.close()

        return [dict(r) for r in rows]

    def get_evolution_insights(self) -> Dict[str, Any]:
        """
        Get insights about overall system evolution.

        Aggregates per-agent grader averages over the last 7 days, the
        daily optimization count over the last 30 days, and the number of
        prompt versions per agent.

        Returns:
            Dictionary with evolution metrics and insights
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor(cursor_factory=RealDictCursor)

            # Get agent performance summary
            cursor.execute("""
                SELECT agent_name,
                       COUNT(*) as execution_count,
                       AVG((evaluation_scores->>'python')::float) as avg_python,
                       AVG((evaluation_scores->>'llm_judge')::float) as avg_llm,
                       SUM(CASE WHEN success THEN 1 ELSE 0 END)::float / COUNT(*) as success_rate
                FROM execution_records
                WHERE timestamp > NOW() - INTERVAL '7 days'
                GROUP BY agent_name
            """)

            agent_stats = cursor.fetchall()

            # Get optimization trends
            cursor.execute("""
                SELECT DATE(run_at) as date, COUNT(*) as optimizations
                FROM optimization_runs
                WHERE run_at > NOW() - INTERVAL '30 days'
                GROUP BY DATE(run_at)
                ORDER BY date
            """)

            optimization_trend = cursor.fetchall()

            # Get prompt evolution count
            cursor.execute("""
                SELECT agent_name, COUNT(*) as versions
                FROM prompt_versions
                GROUP BY agent_name
            """)

            version_counts = cursor.fetchall()
        finally:
            conn.close()

        return {
            "agent_performance": [dict(a) for a in agent_stats],
            "optimization_trend": [dict(o) for o in optimization_trend],
            "version_counts": {v["agent_name"]: v["versions"] for v in version_counts},
            "total_agents": len(agent_stats),
            "total_optimizations": sum(o["optimizations"] for o in optimization_trend),
            "generated_at": datetime.now(timezone.utc).isoformat()
        }


# Convenience functions for Modal agents
def create_optimizer() -> SelfOptimizer:
    """Create optimizer with Elestio config."""
    return SelfOptimizer()


def record_and_optimize(
    agent_name: str,
    prompt_version: str,
    task_input: str,
    task_output: str,
    expected_output: Optional[str] = None,
    auto_optimize: bool = True
) -> Dict[str, Any]:
    """
    Record execution and trigger optimization if needed.

    Records the execution (which also runs all graders), then, when
    auto_optimize is set, evaluates recent performance and regenerates the
    prompt if the evaluation says it is warranted.

    Returns:
        Dictionary with recording result and any optimization actions taken
    """
    optimizer = create_optimizer()

    outcome: Dict[str, Any] = {
        "record_id": optimizer.record_execution(
            agent_name=agent_name,
            prompt_version=prompt_version,
            task_input=task_input,
            task_output=task_output,
            expected_output=expected_output
        ),
        "agent_name": agent_name,
        "optimization_triggered": False,
    }

    if not auto_optimize:
        return outcome

    # Check whether recent performance warrants regenerating the prompt.
    evaluation = optimizer.evaluate_agent(agent_name)
    outcome["evaluation"] = evaluation

    if evaluation.get("needs_optimization"):
        regenerated = optimizer.generate_optimized_prompt(agent_name)
        if regenerated:
            outcome["optimization_triggered"] = True
            outcome["optimization_reason"] = evaluation.get("optimization_reason")

    return outcome


if __name__ == "__main__":
    # Test the self-optimizer
    print("Genesis Self-Optimizer")
    print("=" * 50)

    optimizer = SelfOptimizer()

    # Register a test prompt
    version = optimizer.register_prompt(
        agent_name="test_agent",
        prompt_text="You are a helpful AI assistant that answers questions accurately.",
        version="v1.0"
    )
    print(f"Registered prompt version: {version}")

    # Record some test executions
    for i in range(3):
        record_id = optimizer.record_execution(
            agent_name="test_agent",
            prompt_version="v1.0",
            task_input="What is 2 + 2?",
            task_output="The answer is 4.",
            expected_output="4",
            execution_time_ms=100 + i * 10
        )
        print(f"Recorded execution: {record_id}")

    # Get evolution insights
    insights = optimizer.get_evolution_insights()
    print(f"\nEvolution Insights:")
    print(f"  Total agents: {insights['total_agents']}")
    print(f"  Total optimizations: {insights['total_optimizations']}")
