"""
Genesis Clone Pipeline — Content Extraction Module
===================================================
Extracts content from target websites using a layered fallback approach:
  Layer 1: Jina Reader API (free, fast, LLM-ready markdown)
  Layer 2: Crawl4AI (open-source, deep crawl, images/CSS/structure)
  Layer 3: Playwright screenshot (visual reference)
  Layer 4: Raw requests fallback (basic HTML)

No SQLite. Output is file-based or passed in-memory.
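
Typical usage (import path shown is illustrative):

    from clone_pipeline.extractor import extract_content
    result = extract_content("https://example.com", output_dir=Path("output"))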
"""

import os
import re
import sys
import json
import asyncio
import logging
from pathlib import Path
from typing import Optional

import requests

logger = logging.getLogger("clone_pipeline.extractor")

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

JINA_BASE = "https://r.jina.ai/"
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")

DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    )
}

SCREENSHOT_DIR = Path("/mnt/e/genesis-system/scripts/clone_pipeline/output")


# ---------------------------------------------------------------------------
# Layer 1: Jina Reader
# ---------------------------------------------------------------------------

def extract_with_jina(url: str, timeout: int = 30) -> Optional[str]:
    """
    Extract page content as clean Markdown via Jina Reader API.

    Prepends https://r.jina.ai/ to the target URL — no SDK required.
    Returns markdown string on success, None on failure.
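
    Example (illustrative; requires network access):

        md = extract_with_jina("https://example.com")
        if md:
            Path("example_com.md").write_text(md, encoding="utf-8")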
    """
    jina_url = f"{JINA_BASE}{url}"
    headers = {
        "Accept": "application/json",
        "X-Return-Format": "markdown",
    }
    if JINA_API_KEY:
        headers["Authorization"] = f"Bearer {JINA_API_KEY}"

    try:
        logger.info(f"[Jina] Fetching: {url}")
        resp = requests.get(jina_url, headers=headers, timeout=timeout)
        resp.raise_for_status()

        # Jina returns JSON like {"code": 200, "data": {"content": "..."}}
        # when Accept: application/json is sent; a plain-text body is markdown.
        try:
            data = resp.json()
        except ValueError:
            data = None

        if isinstance(data, dict):
            content = (
                data.get("data", {}).get("content")
                or data.get("data")
                or data.get("content")
            )
            if isinstance(content, str) and len(content) > 100:
                logger.info(f"[Jina] Extracted {len(content)} chars")
                return content
            logger.warning(f"[Jina] Unexpected response structure: {str(data)[:200]}")
            return None

        # Non-JSON response: treat the body as markdown
        if resp.text and len(resp.text) > 100:
            return resp.text

        logger.warning(f"[Jina] Response too short ({len(resp.text)} chars) for {url}")
        return None

    except requests.exceptions.Timeout:
        logger.warning(f"[Jina] Timeout after {timeout}s for {url}")
        return None
    except requests.exceptions.HTTPError as e:
        logger.warning(f"[Jina] HTTP {e.response.status_code} for {url}")
        return None
    except Exception as e:
        logger.warning(f"[Jina] Failed: {e}")
        return None


# ---------------------------------------------------------------------------
# Layer 2: Crawl4AI (deep crawl — images, links, structure)
# ---------------------------------------------------------------------------

async def _crawl4ai_async(url: str, max_pages: int = 5) -> Optional[dict]:
    """Internal async crawl using Crawl4AI.

    Note: only the entry URL is crawled. Internal links are collected so the
    crawl can later be extended to multiple pages (max_pages is reserved for
    that).
    """
    try:
        from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

        browser_cfg = BrowserConfig(headless=True, verbose=False)
        run_cfg = CrawlerRunConfig(
            word_count_threshold=50,
            exclude_external_links=False,
            remove_overlay_elements=True,
        )

        result_data = {
            "markdown": "",
            "images": [],
            "links": [],
            "metadata": {},
            "pages_crawled": 0,
        }

        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            result = await crawler.arun(url=url, config=run_cfg)

            if result.success:
                result_data["markdown"] = result.markdown or ""
                result_data["metadata"] = result.metadata or {}
                result_data["pages_crawled"] = 1

                # Extract images
                if result.media:
                    for img in result.media.get("images", []):
                        src = img.get("src", "")
                        if src:
                            result_data["images"].append(src)

                # Extract links for multi-page crawl
                if result.links:
                    internal = result.links.get("internal", [])
                    result_data["links"] = [
                        lnk.get("href", "") for lnk in internal[:20]
                        if lnk.get("href", "").startswith("http")
                    ]

                logger.info(
                    f"[Crawl4AI] OK: {len(result_data['markdown'])} chars, "
                    f"{len(result_data['images'])} images"
                )
                return result_data

        logger.warning(f"[Crawl4AI] Crawl did not succeed for {url}")
        return None

    except ImportError:
        logger.warning("[Crawl4AI] Not installed. Run: pip install crawl4ai")
        return None
    except Exception as e:
        logger.warning(f"[Crawl4AI] Failed: {e}")
        return None


def crawl_with_crawl4ai(url: str, max_pages: int = 5) -> Optional[dict]:
    """
    Deep crawl a website using Crawl4AI.

    Returns dict with keys:
      - markdown: str (extracted text as markdown)
      - images: list[str] (image URLs)
      - links: list[str] (internal page links)
      - metadata: dict (title, description, etc.)
      - pages_crawled: int

    Returns None if Crawl4AI is not installed or fails.
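
    Example (illustrative; assumes the crawl4ai package is installed):

        data = crawl_with_crawl4ai("https://example.com")
        if data:
            print(data["pages_crawled"], "page(s),", len(data["images"]), "images")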
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread: safe to use asyncio.run().
        try:
            return asyncio.run(_crawl4ai_async(url, max_pages))
        except Exception as e:
            logger.warning(f"[Crawl4AI] Failed: {e}")
            return None

    # Already inside an async context: run the crawl in a worker thread with
    # its own event loop instead of re-entering the running loop.
    import concurrent.futures
    try:
        with concurrent.futures.ThreadPoolExecutor() as pool:
            future = pool.submit(asyncio.run, _crawl4ai_async(url, max_pages))
            return future.result(timeout=60)
    except Exception as e:
        logger.warning(f"[Crawl4AI] Thread fallback failed: {e}")
        return None


# ---------------------------------------------------------------------------
# Layer 3: Playwright screenshot
# ---------------------------------------------------------------------------

async def _playwright_screenshot_async(url: str, output_path: Path) -> Optional[Path]:
    """Take a full-page screenshot using Playwright."""
    try:
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            # Try to use existing chromium
            try:
                browser = await p.chromium.launch(
                    headless=True,
                    args=["--no-sandbox", "--disable-dev-shm-usage"]
                )
            except Exception:
                # Try with system chromium
                browser = await p.chromium.launch(
                    headless=True,
                    executable_path="/usr/bin/chromium-browser",
                    args=["--no-sandbox", "--disable-dev-shm-usage"]
                )

            page = await browser.new_page(
                viewport={"width": 1440, "height": 900}
            )
            await page.goto(url, timeout=60000, wait_until="networkidle")
            await asyncio.sleep(1)  # Let animations settle

            screenshot_path = output_path / "screenshot_original.png"
            await page.screenshot(path=str(screenshot_path), full_page=True)
            await browser.close()

            logger.info(f"[Playwright] Screenshot saved: {screenshot_path}")
            return screenshot_path

    except ImportError:
        logger.warning("[Playwright] Not installed")
        return None
    except Exception as e:
        logger.warning(f"[Playwright] Screenshot failed: {e}")
        return None


def screenshot_with_playwright(url: str, output_dir: Path) -> Optional[Path]:
    """
    Take a full-page screenshot of a URL using Playwright.

    Saves to output_dir/screenshot_original.png
    Returns the path if successful, None otherwise.
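
    Example (illustrative; assumes Playwright and a Chromium binary are available):

        shot = screenshot_with_playwright("https://example.com", Path("output"))
        if shot:
            print(f"Screenshot saved at {shot}")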
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    try:
        return asyncio.run(_playwright_screenshot_async(url, output_dir))
    except RuntimeError:
        # Event loop already running
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as pool:
            future = pool.submit(
                asyncio.run, _playwright_screenshot_async(url, output_dir)
            )
            try:
                return future.result(timeout=60)
            except Exception as e:
                logger.warning(f"[Playwright] Thread fallback failed: {e}")
                return None


# ---------------------------------------------------------------------------
# Layer 4: Raw requests fallback
# ---------------------------------------------------------------------------

def extract_raw_html(url: str) -> Optional[str]:
    """
    Basic HTML extraction via requests as last resort.
    Returns raw HTML string or None.
    """
    try:
        logger.info(f"[Raw] Fetching HTML: {url}")
        resp = requests.get(url, headers=DEFAULT_HEADERS, timeout=20)
        resp.raise_for_status()
        return resp.text
    except Exception as e:
        logger.warning(f"[Raw] Failed: {e}")
        return None


# ---------------------------------------------------------------------------
# Orchestrated extraction with fallbacks
# ---------------------------------------------------------------------------

def extract_content(url: str, output_dir: Optional[Path] = None) -> dict:
    """
    Extract content from URL using layered fallback strategy.

    Priority:
      1. Jina Reader (fast, free, LLM-ready markdown)
      2. Crawl4AI (deep crawl if Jina fails or returns too little)
      3. Raw HTML (last resort for basic content)

    A screenshot is attempted independently of text extraction whenever
    output_dir is provided.

    Returns dict:
      - markdown: str
      - html: str (raw HTML if available)
      - images: list[str]
      - links: list[str]
      - metadata: dict
      - screenshot_path: str or None
      - extraction_method: str
      - url: str (echo of the requested URL)
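
    Example (illustrative):

        result = extract_content("https://example.com", Path("output"))
        print(result["extraction_method"], len(result["markdown"]))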
    """
    result = {
        "markdown": "",
        "html": "",
        "images": [],
        "links": [],
        "metadata": {},
        "screenshot_path": None,
        "extraction_method": "none",
        "url": url,
    }

    # --- Layer 1: Jina ---
    logger.info(f"[Extractor] Starting extraction for: {url}")
    jina_content = extract_with_jina(url)

    if jina_content and len(jina_content) > 500:
        result["markdown"] = jina_content
        result["extraction_method"] = "jina"
        logger.info(f"[Extractor] Jina succeeded ({len(jina_content)} chars)")
    else:
        logger.info("[Extractor] Jina insufficient, trying Crawl4AI...")

        # --- Layer 2: Crawl4AI ---
        crawl_data = crawl_with_crawl4ai(url)
        if crawl_data and crawl_data.get("markdown") and len(crawl_data["markdown"]) > 300:
            result["markdown"] = crawl_data["markdown"]
            result["images"] = crawl_data.get("images", [])
            result["links"] = crawl_data.get("links", [])
            result["metadata"] = crawl_data.get("metadata", {})
            result["extraction_method"] = "crawl4ai"
            logger.info(f"[Extractor] Crawl4AI succeeded ({len(result['markdown'])} chars)")
        else:
            # --- Layer 3: Raw HTML ---
            logger.info("[Extractor] Crawl4AI insufficient, trying raw HTML...")
            html = extract_raw_html(url)
            if html:
                result["html"] = html
                # Strip tags for a rough plain-text fallback
                text = re.sub(r"<[^>]+>", " ", html)
                text = re.sub(r"\s+", " ", text).strip()
                result["markdown"] = text[:8000]
                result["extraction_method"] = "raw_html"
                logger.info(f"[Extractor] Raw HTML fallback ({len(html)} chars)")
            else:
                logger.error(f"[Extractor] ALL extraction methods failed for {url}")

    # --- Screenshot (always try, independent of text extraction) ---
    if output_dir:
        try:
            screenshot_path = screenshot_with_playwright(url, output_dir)
            if screenshot_path:
                result["screenshot_path"] = str(screenshot_path)
        except Exception as e:
            logger.warning(f"[Extractor] Screenshot skipped: {e}")

    return result


# ---------------------------------------------------------------------------
# CLI test
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    url = sys.argv[1] if len(sys.argv) > 1 else "https://example.com"
    output = SCREENSHOT_DIR / "test"
    output.mkdir(parents=True, exist_ok=True)

    result = extract_content(url, output)
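
    # Persist the full result for inspection (illustrative of the file-based
    # output described in the module docstring).
    (output / "extraction_result.json").write_text(
        json.dumps(result, indent=2, default=str), encoding="utf-8"
    )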
    print("\nExtraction result:")
    print(f"  Method: {result['extraction_method']}")
    print(f"  Markdown length: {len(result['markdown'])}")
    print(f"  Images found: {len(result['images'])}")
    print(f"  Screenshot: {result['screenshot_path']}")
    print(f"\nFirst 500 chars of content:\n{result['markdown'][:500]}")
