"""
scraper.py — Data collection from PageSpeed Insights, Brave Search, and direct web scraping.

All functions return dict or None on failure. Designed for async batch processing.
"""

import asyncio
import json
import logging
import os
import re
import ssl
from typing import Optional
from urllib.parse import urlparse, quote_plus

import aiohttp
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# SECURITY: the Brave subscription token was previously hard-coded here and is
# therefore compromised (committed to source control). It is now read from the
# BRAVE_API_KEY environment variable; the old literal remains only as a
# fallback so existing deployments keep working. Rotate the key and set the
# environment variable, then remove the fallback.
BRAVE_API_KEY = os.environ.get("BRAVE_API_KEY", "BSAmQFROygK29bcFmz88zpOXRwnHk3Q")

# Total wall-clock timeouts (connect + read) per request.
REQUEST_TIMEOUT = aiohttp.ClientTimeout(total=20)
PAGESPEED_TIMEOUT = aiohttp.ClientTimeout(total=30)  # PageSpeed can be slow

# Common user-agent to avoid bot blocks on direct scraping
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)

HEADERS = {"User-Agent": USER_AGENT}


def _normalize_url(url: str) -> str:
    """Ensure URL has scheme and is clean."""
    if not url:
        return ""
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    return url


# ---------------------------------------------------------------------------
# PageSpeed Insights (FREE — no API key required)
# ---------------------------------------------------------------------------
async def fetch_pagespeed(
    session: aiohttp.ClientSession, url: str
) -> Optional[dict]:
    """
    Fetch Google PageSpeed Insights data for a URL.

    Runs the Lighthouse audit for both "mobile" and "desktop" strategies and
    flattens the results into one dict keyed by strategy prefix, e.g.
    ``mobile_performance_score``, ``desktop_lcp_ms``.

    Args:
        session: Shared aiohttp session used for both requests.
        url: Target site; a scheme is added automatically if missing.

    Returns:
        Dict with per-strategy category scores (0-100), timing metrics (ms),
        CLS, and ``has_mobile_viewport`` — or None if both strategies failed.
    """
    url = _normalize_url(url)
    if not url:
        return None

    results: dict = {}

    for i, strategy in enumerate(("mobile", "desktop")):
        if i > 0:
            await asyncio.sleep(0.5)  # Small delay between strategies to avoid rate limits
        api_url = (
            f"https://pagespeedonline.googleapis.com/pagespeedonline/v5/runPagespeed"
            f"?url={quote_plus(url)}&strategy={strategy}"
            f"&category=PERFORMANCE&category=ACCESSIBILITY&category=BEST_PRACTICES&category=SEO"
        )
        try:
            # Retry with exponential backoff for rate limits (429).
            data = None
            for attempt in range(3):
                async with session.get(
                    api_url, timeout=PAGESPEED_TIMEOUT
                ) as resp:
                    if resp.status == 429:
                        wait = 2 ** attempt + 1
                        logger.info("PageSpeed rate limited, waiting %ds...", wait)
                        await asyncio.sleep(wait)
                        continue
                    if resp.status != 200:
                        logger.warning(
                            "PageSpeed %s returned %d for %s", strategy, resp.status, url
                        )
                        break
                    data = await resp.json()
                    break

            if data is None:
                continue  # This strategy failed; still try the other one.

            lighthouse = data.get("lighthouseResult", {})
            categories = lighthouse.get("categories", {})
            audits = lighthouse.get("audits", {})

            # BUG FIX: Lighthouse reports "score": null for categories/audits it
            # could not compute. .get("score", 0) returned that None (the key IS
            # present), so None * 100 raised TypeError, and the broad except
            # below silently discarded the entire strategy's data. "or 0"
            # coerces null to 0 instead.
            def _score(cat_name: str) -> int:
                """Category score as a 0-100 int (Lighthouse reports 0-1)."""
                return round((categories.get(cat_name, {}).get("score") or 0) * 100)

            def _metric(audit_name: str) -> float:
                """Numeric audit value (ms for timings), defaulting to 0."""
                return audits.get(audit_name, {}).get("numericValue") or 0

            results[f"{strategy}_performance_score"] = _score("performance")
            results[f"{strategy}_seo_score"] = _score("seo")
            results[f"{strategy}_accessibility_score"] = _score("accessibility")
            results[f"{strategy}_best_practices_score"] = _score("best-practices")
            results[f"{strategy}_fcp_ms"] = round(_metric("first-contentful-paint"))
            results[f"{strategy}_lcp_ms"] = round(_metric("largest-contentful-paint"))
            results[f"{strategy}_speed_index_ms"] = round(_metric("speed-index"))
            results[f"{strategy}_tti_ms"] = round(_metric("interactive"))
            results[f"{strategy}_cls"] = round(_metric("cumulative-layout-shift"), 3)

            if strategy == "mobile":
                # "viewport" audit score == 1 means a mobile viewport meta tag
                # was found by Lighthouse.
                viewport_audit = audits.get("viewport", {})
                results["has_mobile_viewport"] = viewport_audit.get("score", 0) == 1

        except asyncio.TimeoutError:
            logger.warning("PageSpeed %s timed out for %s", strategy, url)
        except Exception as e:
            logger.warning("PageSpeed %s error for %s: %s", strategy, url, e)

    return results if results else None


# ---------------------------------------------------------------------------
# Brave Search API
# ---------------------------------------------------------------------------
async def fetch_brave_search(
    session: aiohttp.ClientSession,
    business_name: str,
    location: str = "Australia",
) -> Optional[dict]:
    """
    Search Brave for the business to assess online presence.

    Scans the top web results for social-media profiles, Google Business
    signals, Australian directory listings, and review/rating mentions.

    Returns a summary dict, or None when the name is empty, the API call
    fails, or it times out.
    """
    if not business_name:
        return None

    query = f"{business_name} {location}"
    api_url = f"https://api.search.brave.com/res/v1/web/search?q={quote_plus(query)}&count=10"

    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip",
        "X-Subscription-Token": BRAVE_API_KEY,
    }

    try:
        async with session.get(
            api_url, headers=headers, timeout=REQUEST_TIMEOUT
        ) as resp:
            if resp.status != 200:
                logger.warning("Brave Search returned %d for %s", resp.status, query)
                return None

            payload = await resp.json()
            web_block = payload.get("web", {})
            hits = web_block.get("results", [])
            estimated_matches = web_block.get("totalEstimatedMatches", 0)

            # Map of hostname fragment -> display label for social platforms.
            platform_by_domain = {
                "facebook.com": "Facebook",
                "instagram.com": "Instagram",
                "twitter.com": "Twitter",
                "x.com": "Twitter/X",
                "linkedin.com": "LinkedIn",
                "youtube.com": "YouTube",
                "tiktok.com": "TikTok",
                "yelp.com": "Yelp",
            }

            # Common Australian business-directory hostname fragments.
            directory_keywords = (
                "yellowpages", "truelocal", "hotfrog", "localsearch",
                "oneflare", "hipages", "serviceseeking", "airtasker",
                "productreview", "wordofmouth",
            )

            found_socials = set()
            google_presence = False
            directory_presence = False

            for hit in hits:
                link = hit.get("url", "").lower()
                blurb = hit.get("description", "").lower()

                found_socials.update(
                    label
                    for domain, label in platform_by_domain.items()
                    if domain in link
                )

                if "google.com/maps" in link or "business.google" in link:
                    google_presence = True

                if any(kw in link for kw in directory_keywords):
                    directory_presence = True

                # Snippets mentioning Google reviews also imply a GBP listing.
                if "google review" in blurb or "google rating" in blurb:
                    google_presence = True

            # Count snippets that mention review counts or star ratings.
            mention_total = 0
            for hit in hits:
                snippet = hit.get("description", "")
                if re.search(r"\d+\s*review", snippet, re.IGNORECASE):
                    mention_total += 1
                if re.search(r"\d(\.\d)?\s*star", snippet, re.IGNORECASE):
                    mention_total += 1

            return {
                "search_result_count": min(estimated_matches, 10000),
                "top_results": len(hits),
                "social_profiles": list(found_socials),
                "social_profile_count": len(found_socials),
                "has_google_business": google_presence,
                "has_directory_listings": directory_presence,
                "review_mentions": mention_total,
            }

    except asyncio.TimeoutError:
        logger.warning("Brave Search timed out for %s", business_name)
        return None
    except Exception as e:
        logger.warning("Brave Search error for %s: %s", business_name, e)
        return None


# ---------------------------------------------------------------------------
# Direct Website Scraping
# ---------------------------------------------------------------------------
async def fetch_website_basics(
    session: aiohttp.ClientSession, url: str
) -> Optional[dict]:
    """
    Scrape basic website data: SSL, viewport, title, meta, headings, etc.

    Also tries to identify the site's hosting/marketing provider from page
    signatures (meta generator, footer credits, asset paths) for competitive
    intelligence.

    Args:
        session: Shared aiohttp session.
        url: Target site; a scheme is added automatically if missing.

    Returns:
        Dict of on-page signals. On fetch failure returns a minimal dict with
        is_reachable=False (not None), so callers can record the attempt.
        Returns None only for an empty URL.
    """
    url = _normalize_url(url)
    if not url:
        return None

    try:
        # An https:// scheme counts as "has SSL"; for http:// URLs we probe
        # the https:// variant below before giving up.
        parsed = urlparse(url)
        has_ssl = parsed.scheme == "https"

        fetch_url = url
        if not has_ssl:
            https_url = url.replace("http://", "https://", 1)
            try:
                async with session.head(
                    https_url, timeout=aiohttp.ClientTimeout(total=8),
                    allow_redirects=True
                ) as test_resp:
                    if test_resp.status < 400:
                        has_ssl = True
                        fetch_url = https_url
            except Exception:
                pass  # HTTPS not available

        # Fetch the page. ssl=False so sites with broken/self-signed certs
        # are still analyzable; we only record that HTTPS responds.
        async with session.get(
            fetch_url,
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT,
            allow_redirects=True,
            ssl=False,  # Don't fail on bad SSL certs
        ) as resp:
            status_code = resp.status
            if status_code >= 400:
                logger.warning("Website returned %d for %s", status_code, url)
                return {
                    "has_ssl": has_ssl,
                    "status_code": status_code,
                    "is_reachable": False,
                }

            html = await resp.text(errors="replace")
            final_url = str(resp.url)

            # A redirect to HTTPS also proves SSL is available.
            if final_url.startswith("https://"):
                has_ssl = True

        soup = BeautifulSoup(html, "html.parser")

        # Title tag
        title_tag = soup.find("title")
        title_text = title_tag.get_text(strip=True) if title_tag else ""

        # Meta description
        meta_desc = ""
        meta_tag = soup.find("meta", attrs={"name": re.compile(r"description", re.I)})
        if meta_tag:
            meta_desc = meta_tag.get("content", "")

        # Viewport meta (mobile-friendly indicator)
        viewport_tag = soup.find("meta", attrs={"name": re.compile(r"viewport", re.I)})
        has_viewport = viewport_tag is not None

        # Heading analysis
        h1_tags = soup.find_all("h1")
        h2_tags = soup.find_all("h2")
        h1_count = len(h1_tags)
        h2_count = len(h2_tags)
        h1_text = h1_tags[0].get_text(strip=True) if h1_tags else ""

        # Image analysis. NOTE(review): alt="" (valid for decorative images)
        # is counted as "missing" here — confirm that is intentional.
        images = soup.find_all("img")
        images_without_alt = sum(1 for img in images if not img.get("alt"))
        total_images = len(images)

        # Links analysis — hoist the base netloc out of the comprehension
        # (it was re-parsed once per link).
        all_links = soup.find_all("a", href=True)
        base_netloc = urlparse(fetch_url).netloc
        external_links = [
            a for a in all_links
            if a["href"].startswith(("http://", "https://"))
            and urlparse(a["href"]).netloc != base_netloc
        ]

        # Social links on the page
        social_domains = [
            "facebook.com", "instagram.com", "twitter.com", "x.com",
            "linkedin.com", "youtube.com", "tiktok.com",
        ]
        page_social_links = []
        for link in all_links:
            href = link.get("href", "").lower()
            for sd in social_domains:
                if sd in href:
                    page_social_links.append(sd.split(".")[0])

        # Schema.org structured-data markup (JSON-LD script blocks)
        schema_scripts = soup.find_all("script", attrs={"type": "application/ld+json"})
        has_schema = len(schema_scripts) > 0

        # Open Graph tags
        og_tags = soup.find_all("meta", attrs={"property": re.compile(r"^og:", re.I)})
        has_og_tags = len(og_tags) > 0

        # Chat widget / live chat indicators — substring match on raw HTML
        # (catches embedded script URLs for the common providers).
        chat_indicators = [
            "livechat", "tawk", "intercom", "drift", "crisp",
            "zendesk", "hubspot", "olark", "freshchat", "tidio",
        ]
        page_text_lower = html.lower()
        has_chat_widget = any(ci in page_text_lower for ci in chat_indicators)

        # Booking/scheduling indicators. NOTE(review): generic words like
        # "schedule"/"appointment" can match ordinary copy — heuristic only.
        booking_indicators = [
            "calendly", "acuity", "bookings", "schedule",
            "appointment", "book-now", "book_now",
        ]
        has_booking = any(bi in page_text_lower for bi in booking_indicators)

        # Phone number on page. NOTE(review): this loose pattern also matches
        # dates/IDs, so treat has_phone_visible as a weak signal.
        phone_pattern = re.compile(
            r"(\+?\d{1,4}[\s\-]?\(?\d{1,4}\)?[\s\-]?\d{2,4}[\s\-]?\d{2,4}[\s\-]?\d{0,4})"
        )
        phones_on_page = phone_pattern.findall(html)
        has_phone_visible = len(phones_on_page) > 0

        # Content length (rough word count)
        body = soup.find("body")
        body_text = body.get_text(separator=" ", strip=True) if body else ""
        word_count = len(body_text.split())

        # ----- Competitor provider detection (poach intelligence) -----
        # Signature substring -> provider name + rough monthly cost estimate
        # (AUD). First match wins, so more specific keys are listed first.
        provider_signatures = {
            "localsearch": {"name": "LocalSearch", "est_cost": 400},
            "localsearch.com.au": {"name": "LocalSearch", "est_cost": 400},
            "sensis": {"name": "Sensis/Yellow Pages", "est_cost": 350},
            "yellowpages.com.au": {"name": "Sensis/Yellow Pages", "est_cost": 350},
            "yell.com": {"name": "Yell", "est_cost": 300},
            "oneflare": {"name": "Oneflare", "est_cost": 200},
            "hipages": {"name": "hipages", "est_cost": 250},
            "servicecentral": {"name": "ServiceCentral", "est_cost": 200},
            "truelocal": {"name": "TrueLocal/Sensis", "est_cost": 300},
            "webjet": {"name": "Webjet", "est_cost": 300},
            "squarespace": {"name": "Squarespace (DIY)", "est_cost": 40},
            "wix.com": {"name": "Wix (DIY)", "est_cost": 35},
            "weebly": {"name": "Weebly (DIY)", "est_cost": 30},
            "godaddy.com/websites": {"name": "GoDaddy Builder", "est_cost": 25},
            "shopify": {"name": "Shopify", "est_cost": 50},
            "wordpress.com": {"name": "WordPress.com", "est_cost": 30},
            "wp-content": {"name": "WordPress (self-hosted)", "est_cost": 150},
            "developer.starter": {"name": "GoDaddy Website Builder", "est_cost": 25},
        }

        detected_provider = None
        est_monthly_cost = None
        # Check HTML source, meta generator, and footer text for signatures.
        meta_generator = soup.find("meta", attrs={"name": re.compile(r"generator", re.I)})
        generator_text = meta_generator.get("content", "").lower() if meta_generator else ""
        footer_el = soup.find("footer")
        footer_text = footer_el.get_text(separator=" ", strip=True).lower() if footer_el else ""
        check_text = f"{page_text_lower} {generator_text} {footer_text}"

        for sig, info in provider_signatures.items():
            if sig in check_text:
                detected_provider = info["name"]
                est_monthly_cost = info["est_cost"]
                break

        return {
            "is_reachable": True,
            # BUG FIX: was hard-coded to 200 even though any 2xx/3xx status
            # reaches this point; report the actual response status.
            "status_code": status_code,
            "has_ssl": has_ssl,
            "has_mobile_viewport": has_viewport,
            "title_tag": title_text[:200],
            "has_title": bool(title_text),
            "meta_description": meta_desc[:300],
            "has_meta_description": bool(meta_desc),
            "h1_count": h1_count,
            "h1_text": h1_text[:200],
            "h2_count": h2_count,
            "total_images": total_images,
            "images_without_alt": images_without_alt,
            "has_schema_markup": has_schema,
            "has_og_tags": has_og_tags,
            "has_chat_widget": has_chat_widget,
            "has_booking_system": has_booking,
            "has_phone_visible": has_phone_visible,
            "page_social_links": list(set(page_social_links)),
            "external_link_count": len(external_links),
            "word_count": word_count,
            "final_url": final_url,
            "detected_provider": detected_provider,
            "est_monthly_cost": est_monthly_cost,
        }

    except asyncio.TimeoutError:
        logger.warning("Website timed out for %s", url)
        return {"is_reachable": False, "has_ssl": False, "status_code": 0}
    except Exception as e:
        logger.warning("Website scrape error for %s: %s", url, e)
        return {"is_reachable": False, "has_ssl": False, "status_code": 0}


# ---------------------------------------------------------------------------
# Master scrape function
# ---------------------------------------------------------------------------
async def scrape_all(
    url: str,
    business_name: str,
    location: str = "Australia",
    session: Optional[aiohttp.ClientSession] = None,
) -> dict:
    """
    Run all scrapers concurrently for a single business.

    A temporary session is created (and closed on exit) when the caller does
    not supply one. Individual scraper failures are logged and replaced with
    empty dicts, so this always returns a usable result.

    Returns a combined data dict with keys: pagespeed, brave, website.
    """
    owns_session = session is None
    if owns_session:
        session = aiohttp.ClientSession(
            connector=aiohttp.TCPConnector(ssl=False, limit=10)
        )

    try:
        # All three scrapers run concurrently; exceptions are returned as
        # values rather than propagated, then normalized to None below.
        pagespeed_data, brave_data, website_data = await asyncio.gather(
            fetch_pagespeed(session, url),
            fetch_brave_search(session, business_name, location),
            fetch_website_basics(session, url),
            return_exceptions=True,
        )

        if isinstance(pagespeed_data, Exception):
            logger.warning("PageSpeed exception: %s", pagespeed_data)
            pagespeed_data = None
        if isinstance(brave_data, Exception):
            logger.warning("Brave exception: %s", brave_data)
            brave_data = None
        if isinstance(website_data, Exception):
            logger.warning("Website exception: %s", website_data)
            website_data = None

        return {
            "business_name": business_name,
            "url": url,
            "location": location,
            "pagespeed": pagespeed_data or {},
            "brave": brave_data or {},
            "website": website_data or {},
        }

    finally:
        if owns_session:
            await session.close()
