"""
Genesis Superior Browser (GSB) — Vision Bridge (Layer 2)
=======================================================
Integrates Gemini Flash Vision to reason about page state and identify coordinates.
"""

import base64
import json
import logging
import re
from typing import Dict, List, Optional

from playwright.async_api import Page

logger = logging.getLogger("genesis_v2.core.browser.vision")

class VisionBridge:
    """Layer-2 vision bridge for the Genesis Superior Browser.

    Sends page screenshots to a Gemini vision model and turns the model's
    answer into a concrete, coordinate-grounded browser action.
    """

    def __init__(self, gemini_client=None):
        """Store the (optional) Gemini client used for visual reasoning.

        Args:
            gemini_client: Object exposing ``generate_content([prompt, image])``
                and returning a response with a ``.text`` attribute (e.g. a
                google-generativeai ``GenerativeModel``). May be ``None``, in
                which case analysis degrades to a no-op "wait" action.
        """
        self.client = gemini_client

    async def analyze_page(self, page: Page, goal: str) -> Dict:
        """
        Takes a screenshot, sends it to Gemini Flash, and returns 
        the coordinates of the element to interact with.

        Args:
            page: Live Playwright page to screenshot.
            goal: Natural-language objective for the vision model.

        Returns:
            Dict with at least "action" (and usually "reasoning",
            "coordinates", ...) on success; {"error": str} if the model
            call itself fails.
        """
        # Guard first: don't pay for a screenshot round-trip when there is
        # no vision model to send it to.
        if not self.client:
            logger.warning("No Gemini client provided to VisionBridge. Visual reasoning limited.")
            return {"action": "wait", "reasoning": "No vision brain connected."}

        screenshot_bytes = await page.screenshot(type="png", full_page=False)

        prompt = f"""
        Analyze this screenshot of a webpage. 
        Your goal is: {goal}
        
        Return a JSON object with:
        1. "reasoning": Why you are choosing this action.
        2. "action": One of ["click", "type", "scroll", "wait"].
        3. "coordinates": {{"x": Number, "y": Number}} of the target element.
        4. "selector": (Optional) The CSS selector if you can identify it reliably.
        5. "input_text": (Optional) If action is 'type'.

        The screen resolution is 1920x1080.
        ONLY return JSON.
        """

        try:
            image_part = {
                "mime_type": "image/png",
                "data": screenshot_bytes,
            }
            # NOTE(review): generate_content appears to be a blocking call
            # inside an async method; if the client exposes an async variant
            # (e.g. generate_content_async), prefer it — confirm against the
            # client API.
            response = self.client.generate_content([prompt, image_part])

            # The model often wraps its answer in a ```json fenced block
            # despite the "ONLY return JSON" instruction; unwrap it when
            # present, otherwise fall back to the raw text.
            match = re.search(r"```json\n(.*?)\n```", response.text, re.DOTALL)
            if match:
                json_text = match.group(1)
            else:
                json_text = response.text

            try:
                return json.loads(json_text)
            except json.JSONDecodeError:
                logger.error("Failed to parse JSON from vision model. Raw response: %s", response.text)
                return {"action": "wait", "reasoning": "Failed to parse response from vision model."}
        except Exception as e:
            # Broad catch is deliberate: vision is best-effort and callers
            # receive an error dict instead of a crash.
            logger.error("Vision analysis failed: %s", e)
            return {"error": str(e)}

    async def execute_vision_move(self, page: Page, vision_result: Dict):
        """Executes the action determined by the vision brain.

        Supported actions: "click" (needs coordinates), "type" (click to
        focus, then type "input_text"), and "scroll" (fixed 500px wheel
        scroll). Anything else — including click/type without coordinates —
        is a silent no-op.

        Returns:
            The ``vision_result`` dict unchanged, for chaining/logging.
        """
        action = vision_result.get("action")
        coords = vision_result.get("coordinates")

        if action == "click" and coords:
            await page.mouse.click(coords["x"], coords["y"])
            logger.info(f"Vision-grounded click executed at {coords}")
        elif action == "type" and coords:
            await page.mouse.click(coords["x"], coords["y"])
            await page.keyboard.type(vision_result.get("input_text", ""))
            logger.info(f"Vision-grounded typing executed at {coords}")
        elif action == "scroll":
            await page.mouse.wheel(0, 500)
            logger.info("Vision-grounded scroll executed.")
        
        return vision_result
