"""
Tradie Website Auditor
======================
Given a list of tradie businesses, audits their web presence and scores quality.

Output: CSV with columns:
    business_name, phone, suburb, trade_type,
    has_website, website_url, website_score,
    is_mobile_friendly, has_contact_form, has_reviews,
    page_speed_estimate, priority_tier, priority_score

Priority tiers:
    PRIME  — No website at all. Easiest win.
    HIGH   — Has website but score < 50. Ripe for replacement.
    MEDIUM — Score 50-65. Convincible.
    LOW    — Score 66+. Already decent, harder sell.

Usage:
    python tradie_website_auditor.py batch --input leads.csv --output audited_leads.csv
    python tradie_website_auditor.py single --business-name "Smith Plumbing" --phone "0412345678" --suburb "Parramatta" --trade plumber
"""

import argparse
import csv
import json
import re
import socket
import sys
import time
from pathlib import Path
from typing import Optional
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Desktop Chrome User-Agent: Google and many sites serve a different (or
# blocked) page to the default `python-requests` UA, so we present as a browser.
GOOGLE_SEARCH_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36"
    )
}
REQUEST_TIMEOUT = 10  # seconds — applies to both the Google search and site fetches
SEARCH_DELAY = 1.5    # polite crawl delay between Google requests (seconds)

# Mobile-friendliness indicator: a responsive site declares
# <meta name="viewport" content="width=device-width, ...">.
MOBILE_VIEWPORT_PATTERN = re.compile(r'width=device-width', re.IGNORECASE)

# Substrings searched for (case-insensitively, against lowercased HTML) as
# evidence the site surfaces reviews — a mix of review-platform domains and
# generic wording. score_website requires >= 2 hits.
REVIEW_SIGNALS = [
    "google.com/maps", "facebook.com/reviews", "productreview.com.au",
    "hipages.com.au", "oneflare.com.au", "trustpilot.com",
    "stars", "review", "rating", "testimonial"
]

# Substrings suggesting a contact/enquiry form. score_website treats either a
# literal <form> tag or >= 3 of these signals as "has a contact form".
CONTACT_FORM_SIGNALS = [
    "<form", "contact", "enquir", "quote", "booking",
    "name", "email", "phone", "message", "submit"
]

# Trade-specific search terms; the keys double as the valid --trade choices
# in the CLI (plus the generic fallback "tradie").
TRADE_KEYWORDS = {
    "plumber": ["plumber", "plumbing", "drain", "pipe"],
    "electrician": ["electrician", "electrical", "wiring", "switchboard"],
    "builder": ["builder", "building", "renovation", "construction"],
    "painter": ["painter", "painting", "interior", "exterior"],
    "carpenter": ["carpenter", "carpentry", "joinery", "cabinet"],
    "tiler": ["tiler", "tiling", "tiles", "floor tiles"],
    "roofer": ["roofer", "roofing", "roof repair", "gutters"],
    "landscaper": ["landscaper", "landscaping", "garden", "lawn"],
    "hvac": ["hvac", "air conditioning", "aircon", "heating", "cooling"],
    "cleaner": ["cleaner", "cleaning", "carpet cleaning", "house cleaning"],
}


# ---------------------------------------------------------------------------
# Website Discovery
# ---------------------------------------------------------------------------
def search_business_website(business_name: str, suburb: str, trade_type: str) -> Optional[str]:
    """
    Search Google for the business website.

    Args:
        business_name: Exact business name (wrapped in quotes in the query).
        suburb: Suburb used to localise the search.
        trade_type: Trade keyword appended to the query.

    Returns:
        The origin ("scheme://host") of the first organic result that is not
        a Google property or a known directory/social site, or None when no
        suitable result is found or the request fails.
    """
    query = f'"{business_name}" {suburb} {trade_type} site'
    url = f"https://www.google.com/search?q={requests.utils.quote(query)}&num=5"

    try:
        resp = requests.get(url, headers=GOOGLE_SEARCH_HEADERS, timeout=REQUEST_TIMEOUT)
        if resp.status_code != 200:
            return None

        soup = BeautifulSoup(resp.text, "html.parser")

        # Extract organic result links — filter Google's own domains.
        for a_tag in soup.select("a[href]"):
            href = a_tag.get("href", "")
            # Google wraps organic links as /url?q=<target>&sa=...&ved=...
            # BUG FIX: the old `href[7:]` slice kept Google's trailing
            # &sa=/&ved= parameters and any percent-encoding attached to the
            # target URL. Parse the wrapper's query string properly instead
            # (parse_qs also percent-decodes the value).
            if not href.startswith("/url?"):
                continue
            targets = parse_qs(urlparse(href).query).get("q")
            if not targets:
                continue
            parsed = urlparse(targets[0])
            domain = parsed.netloc.lower()
            # Skip ad/Google domains and known directories.
            skip_domains = [
                "google.", "facebook.", "instagram.", "linkedin.",
                "yellowpages.", "whitepages.", "truelocal.", "yelp.",
                "hipages.", "oneflare.", "seek.", "realestate.",
                "domain.com", "reddit.", "youtube."
            ]
            if any(s in domain for s in skip_domains):
                continue
            # BUG FIX: also require a scheme so we never emit "://host".
            if domain and "." in domain and parsed.scheme:
                return f"{parsed.scheme}://{parsed.netloc}"
        return None

    except Exception as exc:
        # Best-effort: a failed search just means "no website found".
        print(f"  [WARN] Google search failed for '{business_name}': {exc}", file=sys.stderr)
        return None


def check_dns_exists(domain: str) -> bool:
    """
    Return True when *domain* resolves via DNS.

    Accepts either a bare hostname ("example.com") or a full URL
    ("https://example.com/path"); the host part is extracted first.
    """
    host = urlparse(domain).netloc or domain
    try:
        socket.gethostbyname(host)
    except socket.gaierror:
        return False
    return True


# ---------------------------------------------------------------------------
# Website Scoring
# ---------------------------------------------------------------------------
def score_website(url: str) -> dict:
    """
    Fetch the website and score it on multiple dimensions.

    Scoring (max 100):
        HTTPS ............ 10
        page speed ....... 20 (fast, <1.5s) / 10 (moderate, <3s) / 0 (slow)
        mobile viewport .. 20
        contact form ..... 20
        reviews .......... 15
        content length ... 15 (>15k chars) / 7 (>5k chars)

    Returns a dict with score components and total. On fetch failure the
    relevant ``fetch_error`` is set and the score stays 0, except for
    ssl_error/timeout where 5 means "a site exists but is broken".
    """
    result = {
        "website_url": url,
        "is_mobile_friendly": False,
        "has_contact_form": False,
        "has_reviews": False,
        "page_speed_estimate": "unknown",
        "https": False,
        "score": 0,
        "fetch_error": None,
    }

    try:
        # Wall-clock the full fetch (incl. redirects) as a crude speed proxy.
        start = time.time()
        resp = requests.get(
            url,
            headers=GOOGLE_SEARCH_HEADERS,
            timeout=REQUEST_TIMEOUT,
            allow_redirects=True
        )
        elapsed = time.time() - start

        # HTTPS check (10 pts) — resp.url is the final URL after redirects,
        # so an http:// site that redirects to https:// still gets credit.
        if resp.url.startswith("https://"):
            result["https"] = True
            result["score"] += 10

        # Page speed estimate (20 pts)
        if elapsed < 1.5:
            result["page_speed_estimate"] = "fast"
            result["score"] += 20
        elif elapsed < 3.0:
            result["page_speed_estimate"] = "moderate"
            result["score"] += 10
        else:
            result["page_speed_estimate"] = "slow"
            result["score"] += 0

        # Lowercased raw HTML for substring signals; parsed soup for tags.
        html_lower = resp.text.lower()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Mobile viewport (20 pts) — responsive sites declare
        # width=device-width in their viewport meta tag.
        viewport_meta = soup.find("meta", attrs={"name": "viewport"})
        if viewport_meta and MOBILE_VIEWPORT_PATTERN.search(str(viewport_meta)):
            result["is_mobile_friendly"] = True
            result["score"] += 20

        # Contact form (20 pts) — a real <form> tag, or at least 3 of the
        # weaker textual signals (contact/enquiry/quote wording etc.).
        form_count = len(soup.find_all("form"))
        contact_signals_hit = sum(1 for s in CONTACT_FORM_SIGNALS if s in html_lower)
        if form_count > 0 or contact_signals_hit >= 3:
            result["has_contact_form"] = True
            result["score"] += 20

        # Reviews / testimonials (15 pts) — require 2+ signals since single
        # words like "rating" can appear incidentally.
        review_signals_hit = sum(1 for s in REVIEW_SIGNALS if s in html_lower)
        if review_signals_hit >= 2:
            result["has_reviews"] = True
            result["score"] += 15

        # Reasonable content length (15 pts) — thin sites score low
        content_length = len(resp.text)
        if content_length > 15000:
            result["score"] += 15
        elif content_length > 5000:
            result["score"] += 7

    # NOTE(review): clause order matters here. requests' ConnectTimeout
    # inherits from BOTH ConnectionError and Timeout, so connect timeouts
    # land in the connection_error branch (score 0), while read timeouts
    # reach the Timeout branch (score 5). Presumably intentional — confirm.
    except requests.exceptions.SSLError:
        result["fetch_error"] = "ssl_error"
        result["score"] = 5   # Has a site but broken HTTPS
    except requests.exceptions.ConnectionError:
        result["fetch_error"] = "connection_error"
    except requests.exceptions.Timeout:
        result["fetch_error"] = "timeout"
        result["score"] = 5
    except Exception as exc:
        # Catch-all so one bad site never aborts a batch run.
        result["fetch_error"] = str(exc)

    return result


# ---------------------------------------------------------------------------
# Priority Calculation
# ---------------------------------------------------------------------------
def calculate_priority(has_website: bool, website_score: int) -> tuple[str, int]:
    """
    Map audit results to an outreach priority.

    Returns:
        (priority_tier, priority_score) where priority_score is 0-100 and
        higher means a better target for outreach.
    """
    # No website at all is the easiest win.
    if not has_website:
        return "PRIME", 100

    # (upper_bound, tier, outreach score) — first bound exceeding the
    # website score wins; anything 66+ falls through to LOW.
    bands = (
        (30, "HIGH", 85),
        (50, "HIGH", 70),
        (66, "MEDIUM", 50),
    )
    for upper, tier, outreach in bands:
        if website_score < upper:
            return tier, outreach
    return "LOW", 25


# ---------------------------------------------------------------------------
# Single Business Audit
# ---------------------------------------------------------------------------
def audit_business(
    business_name: str,
    phone: str,
    suburb: str,
    trade_type: str
) -> dict:
    """
    Run the complete audit for one business.

    Finds the website via Google, confirms the domain resolves, scores the
    site, and assigns an outreach priority. Returns a flat dict ready for
    CSV output.
    """
    print(f"  Auditing: {business_name} ({suburb})...", end="", flush=True)
    # Polite delay so consecutive audits don't hammer Google.
    time.sleep(SEARCH_DELAY)

    # Step 1: Find website
    website_url = search_business_website(business_name, suburb, trade_type)
    has_website = False
    website_score = 0
    score_details: dict = {}

    # A search hit whose domain no longer resolves counts as "no website".
    if website_url and not check_dns_exists(website_url):
        website_url = None
    if website_url:
        has_website = True
        score_details = score_website(website_url)
        website_score = score_details.get("score", 0)

    priority_tier, priority_score = calculate_priority(has_website, website_score)

    row = {
        "business_name": business_name,
        "phone": phone,
        "suburb": suburb,
        "trade_type": trade_type,
        "has_website": has_website,
        "website_url": website_url or "",
        "website_score": website_score,
        "is_mobile_friendly": score_details.get("is_mobile_friendly", False),
        "has_contact_form": score_details.get("has_contact_form", False),
        "has_reviews": score_details.get("has_reviews", False),
        "page_speed_estimate": score_details.get("page_speed_estimate", "n/a"),
        "https": score_details.get("https", False),
        "priority_tier": priority_tier,
        "priority_score": priority_score,
        "fetch_error": score_details.get("fetch_error", ""),
    }

    if has_website:
        print(f" [{priority_tier}] score={website_score}")
    else:
        print(" [NO WEBSITE - PRIME TARGET]")
    return row


# ---------------------------------------------------------------------------
# Batch Audit from CSV
# ---------------------------------------------------------------------------
# Columns that must exist in the input CSV; audit_from_csv raises ValueError
# when any are missing.
REQUIRED_COLUMNS = {"business_name", "phone", "suburb", "trade_type"}
# Exact column order of the output CSV written by audit_from_csv.
OUTPUT_COLUMNS = [
    "business_name", "phone", "suburb", "trade_type",
    "has_website", "website_url", "website_score",
    "is_mobile_friendly", "has_contact_form", "has_reviews",
    "page_speed_estimate", "https",
    "priority_tier", "priority_score", "fetch_error"
]


def audit_from_csv(input_path: Path, output_path: Path) -> list[dict]:
    """
    Audit every lead in *input_path* and write the results to *output_path*.

    Raises:
        ValueError: when the input CSV lacks any of REQUIRED_COLUMNS.

    Returns:
        The audited rows, sorted by descending priority_score.
    """
    with open(input_path, newline="", encoding="utf-8") as fh:
        reader = csv.DictReader(fh)
        # Validate the header before doing any slow network work.
        missing = REQUIRED_COLUMNS - set(reader.fieldnames or [])
        if missing:
            raise ValueError(f"Input CSV missing columns: {missing}")
        leads = list(reader)

    print(f"[AUDITOR] Processing {len(leads)} leads from {input_path.name}")
    print("-" * 60)

    audited = [
        audit_business(
            business_name=lead["business_name"].strip(),
            phone=lead.get("phone", "").strip(),
            suburb=lead.get("suburb", "").strip(),
            trade_type=lead.get("trade_type", "tradie").strip().lower(),
        )
        for lead in leads
    ]

    # Best outreach targets first.
    audited.sort(key=lambda rec: rec["priority_score"], reverse=True)

    with open(output_path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=OUTPUT_COLUMNS)
        writer.writeheader()
        writer.writerows(audited)

    # Summary
    tiers = [rec["priority_tier"] for rec in audited]
    print("-" * 60)
    print(f"[AUDITOR] Complete. {len(audited)} businesses audited.")
    print(f"  PRIME targets (no website): {tiers.count('PRIME')}")
    print(f"  HIGH targets (bad website): {tiers.count('HIGH')}")
    print(f"  Results written to: {output_path}")
    return audited


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: dispatch to a batch (CSV) or single-business audit."""
    parser = argparse.ArgumentParser(
        description="Audit tradie businesses for web presence quality."
    )
    subparsers = parser.add_subparsers(dest="mode", required=True)

    # "batch" — audit every lead in a CSV file.
    batch_parser = subparsers.add_parser("batch", help="Audit from CSV file")
    batch_parser.add_argument("--input", required=True, help="Input CSV path")
    batch_parser.add_argument("--output", required=True, help="Output CSV path")

    # "single" — audit one business from the command line.
    single_parser = subparsers.add_parser("single", help="Audit a single business")
    single_parser.add_argument("--business-name", required=True)
    single_parser.add_argument("--phone", default="")
    single_parser.add_argument("--suburb", required=True)
    single_parser.add_argument("--trade", required=True,
                               choices=list(TRADE_KEYWORDS.keys()) + ["tradie"])

    args = parser.parse_args()

    if args.mode == "single":
        audit = audit_business(
            business_name=args.business_name,
            phone=args.phone,
            suburb=args.suburb,
            trade_type=args.trade,
        )
        print(json.dumps(audit, indent=2))
    else:
        audit_from_csv(Path(args.input), Path(args.output))


if __name__ == "__main__":
    main()
