#!/usr/bin/env python3
"""
Genesis Gemini 3 Vision Bridge
===============================
Optimizes visual reasoning for browser control. 
Maps screenshots to semantic actions and coordinates.
"""

import os
import sys
import json
import base64
from typing import Dict, Any, List

# Add genesis-system to path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from google import genai
from google.genai import types

# Load config
# The config file is expected to sit next to this module. The directory is
# resolved with abspath (same as the sys.path entry above) so the lookup does
# not depend on how the script was invoked.
CONFIG_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "genesis_config.json"
)
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, which can differ between machines.
with open(CONFIG_PATH, encoding="utf-8") as f:
    CONFIG = json.load(f)

class Gemini3VisionBridge:
    """
    Expert bridge for Gemini 3 Vision capabilities.
    Processes screenshots to identify interactive elements.

    The bridge sends a screenshot plus a textual objective to the model and
    returns the model's JSON answer (action, target description, coordinates,
    reasoning) as a dictionary.
    """

    # Using Flash for speed, Gemini 3 logic in prompt.
    # Kept as a class attribute so it can be overridden without editing
    # analyze_ui.
    MODEL_NAME = 'gemini-2.0-flash'

    # MIME type sent when the screenshot's type cannot be inferred from its
    # file name.
    DEFAULT_MIME_TYPE = "image/png"

    def __init__(self):
        """Create the API client using the key stored in CONFIG["gemini"]["api_key"]."""
        api_key = CONFIG["gemini"]["api_key"]
        self.client = genai.Client(api_key=api_key)
        print("[OK] Gemini 3 Vision Bridge Active.")

    def analyze_ui(self, screenshot_path: str, objective: str) -> Dict[str, Any]:
        """
        Send screenshot to Gemini 3 to find coordinates for the objective.

        Args:
            screenshot_path: Path of the screenshot image to analyse.
            objective: What the caller wants to achieve on the page; it is
                interpolated into the prompt.

        Returns:
            The JSON object produced by the model. If the reply is missing,
            is not valid JSON, or is not a JSON object, a fallback
            ``{"action": "wait", ...}`` dictionary is returned instead.

        Raises:
            OSError: If the screenshot cannot be opened or read.
        """
        # Local import: keeps this block self-contained without touching the
        # file-level import list.
        import mimetypes

        # The SDK takes raw bytes, so the file content is passed through
        # directly (no base64 round trip needed).
        with open(screenshot_path, "rb") as image_file:
            image_bytes = image_file.read()

        # Label the image with its real type when the file name reveals it;
        # anything unknown or non-image falls back to PNG.
        guessed_type, _ = mimetypes.guess_type(screenshot_path)
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = self.DEFAULT_MIME_TYPE

        prompt = f"""You are the Genesis Browser Master using Gemini 3 Vision.
OBJECTIVE: {objective}

TASK:
1. Analyze the provided screenshot of the web page.
2. Identify the element that needs to be interacted with to achieve the objective.
3. Provide the [x, y] coordinates (normalized 0-1000) for the target.
4. Output a JSON object with:
   - "action": "click", "type", "scroll", etc.
   - "target_description": "What you found"
   - "coordinates": [x, y]
   - "reasoning": "Why this element"
"""

        response = self.client.models.generate_content(
            model=self.MODEL_NAME,
            contents=[
                prompt,
                types.Part.from_bytes(data=image_bytes, mime_type=mime_type),
            ],
            config=types.GenerateContentConfig(
                response_mime_type='application/json'
            ),
        )

        # NOTE(review): reading ``.text`` is guarded because the original code
        # tolerated any failure here; confirm which exceptions the installed
        # SDK version can actually raise.
        try:
            raw_text = response.text
        except (AttributeError, ValueError):
            raw_text = None

        return self._parse_response(raw_text)

    @staticmethod
    def _parse_response(raw_text: Any) -> Dict[str, Any]:
        """
        Turn the model's raw reply into an action dictionary.

        Args:
            raw_text: The reply text (``str``), or ``None`` when the model
                produced no text.

        Returns:
            The decoded JSON object, or the fallback "wait" action when the
            reply is absent, malformed, or not a JSON object.
        """
        try:
            parsed = json.loads(raw_text)
        except (TypeError, ValueError, RecursionError):
            # TypeError: raw_text is None / not str-like.
            # ValueError: covers json.JSONDecodeError (malformed JSON).
            # RecursionError: pathologically nested reply.
            parsed = None

        # Callers are promised a dictionary; a JSON list or scalar is treated
        # the same as an unparseable reply.
        if isinstance(parsed, dict):
            return parsed

        print("[ERR] Vision Bridge failed to parse AI feedback.")
        return {"action": "wait", "reasoning": "Failed to parse UI analysis."}

if __name__ == "__main__":
    # Smoke test: constructing the bridge builds the API client from CONFIG
    # and prints its own activation banner before the line below.
    vision_bridge = Gemini3VisionBridge()
    print("[TEST] Bridge Ready.")
