#!/usr/bin/env python3
"""
LocalSearch P1 Scraper - 9 Remaining Trade Categories
Confirmed working extraction method (2026-02-19 test pass).

Scrapes: concreters, roofing-contractors, landscapers, painters,
         tilers, fencing-contractors, carpenters, air-conditioning, builders
Across: 12 QLD cities
"""
import csv, time, re, logging, os, sys
from datetime import date
from pathlib import Path

# Cumulative CSV of scraped leads; the scraper appends to it and dedupes
# new rows against the phone numbers already present.
OUTPUT_FILE  = "/mnt/e/genesis-system/data/LEADS/localsearch_tradies.csv"
# HTTP/parse errors are appended here via the logging module (see basicConfig below).
ERROR_LOG    = "/mnt/e/genesis-system/data/LEADS/localsearch_errors.log"
# Human-readable progress lines (also echoed to stdout by log_progress()).
PROGRESS_LOG = "/mnt/e/genesis-system/data/LEADS/p1_scrape_progress.log"
# ISO date stamped onto every scraped record and progress line.
TODAY        = date.today().isoformat()

# Trade-category slugs exactly as they appear in localsearch.com.au /find/ URLs.
TRADES = [
    "concreters", "roofing-contractors", "landscapers", "painters",
    "tilers", "fencing-contractors", "carpenters", "air-conditioning", "builders"
]

# QLD city slugs; each is combined with every trade to form one search URL.
CITIES = [
    "brisbane-qld", "gold-coast-qld", "sunshine-coast-qld", "cairns-qld",
    "townsville-qld", "toowoomba-qld", "mackay-qld", "rockhampton-qld",
    "bundaberg-qld", "hervey-bay-qld", "gladstone-qld", "mount-isa-qld"
]

# Errors-only file log; normal progress reporting goes through log_progress().
logging.basicConfig(
    filename=ERROR_LOG,
    level=logging.ERROR,
    format="%(asctime)s %(message)s"
)

def log_progress(msg):
    """Echo *msg* to stdout and append it, prefixed with today's date, to PROGRESS_LOG."""
    line = f"{TODAY} {msg}\n"
    print(msg)
    with open(PROGRESS_LOG, "a") as progress_file:
        progress_file.write(line)

def extract_listings_from_page(soup, url, trade, city_slug, scraped_date=None):
    """
    Extract business listings from one LocalSearch results page.

    Uses the confirmed phone-first approach: every ``tel:`` anchor is assumed
    to sit inside a listing card, and the card's container is located by
    walking up the DOM until an element holding a heading is found.

    Args:
        soup: Parsed BeautifulSoup document for the results page.
        url: Page URL, used as the fallback ``source_url`` when a listing has
            no ``/profile/`` link.
        trade: Trade-category slug, e.g. ``"concreters"``.
        city_slug: City slug such as ``"brisbane-qld"``; the ``-qld`` suffix
            is stripped to derive the suburb name.
        scraped_date: ISO date string stamped onto each record; defaults to
            today's date (backward compatible with the old TODAY behaviour).

    Returns:
        list[dict]: One record per unique phone number found on the page.
    """
    if scraped_date is None:
        scraped_date = date.today().isoformat()

    # Compile once per page instead of once per anchor.
    non_phone_re = re.compile(r"[^\d+]")
    profile_re = re.compile(r"/profile/")
    # External site link: absolute http(s) URL that is not a localsearch page.
    external_re = re.compile(r"^https?://(?!.*(localsearch))")
    addr_re = re.compile(r"(\d+\s+[\w\s]+(?:Rd|St|Ave|Dr|Cres|Ct|Pl|Blvd|Hwy)[,\s]+[\w\s]+QLD)")
    rating_re = re.compile(r"(\d+\.\d+)\s*(?:star|/5|out of)", re.IGNORECASE)
    review_re = re.compile(r"(\d+)\s+(?:review|rating)", re.IGNORECASE)

    records = []
    # A single card often carries two tel: anchors for the same number
    # (click-to-call button plus displayed text) — emit each phone once.
    seen_phones = set()

    for a in soup.find_all("a", href=re.compile(r"^tel:")):
        phone_raw = a.get("href", "").replace("tel:", "").strip()
        phone = non_phone_re.sub("", phone_raw)
        if not phone or len(phone) < 8 or phone in seen_phones:
            continue

        # Walk up the DOM — the immediate parent plus up to 5 more ancestors —
        # until we reach a container that holds a heading. (The previous loop
        # walked after checking, so the ancestor reached on the final hop was
        # never examined.)
        container = a.find_parent()
        for _ in range(6):
            if container is None or container.find(["h2", "h3", "h4"]):
                break
            container = container.find_parent()

        if container is None:
            continue

        # Business name; "strong" is a fallback for cards without headings.
        name_el = container.find(["h2", "h3", "h4", "strong"])
        name = name_el.get_text(strip=True) if name_el else ""
        if not name:
            continue

        # Profile/source URL: prefer the listing's own profile page.
        profile_link = container.find("a", href=profile_re)
        if profile_link:
            href = profile_link.get("href", "")
            source_url = f"https://www.localsearch.com.au{href}" if href.startswith("/") else href
        else:
            source_url = url

        # Website (external link, not localsearch).
        web_el = container.find("a", href=external_re)
        website = web_el.get("href", "") if web_el else ""

        container_text = container.get_text(separator=" ", strip=True)
        # QLD street-address pattern, star rating, and review count are all
        # scraped from the card's flattened text.
        addr_match = addr_re.search(container_text)
        rating_match = rating_re.search(container_text)
        review_match = review_re.search(container_text)

        suburb_clean = city_slug.replace("-qld", "").replace("-", " ").title()

        seen_phones.add(phone)
        records.append({
            "business_name": name,
            "trade_category": trade,
            "phone": phone,
            "website": website,
            "street_address": addr_match.group(1) if addr_match else "",
            "suburb": suburb_clean,
            "state": "QLD",
            "postcode": "",
            "email": "",
            "rating": rating_match.group(1) if rating_match else "",
            "review_count": review_match.group(1) if review_match else "",
            "source_url": source_url,
            "scraped_date": scraped_date,
        })

    return records


def run_scraper():
    """
    Scrape every TRADES x CITIES combination and append new rows to OUTPUT_FILE.

    Behaviour:
      * Dedupes by normalised phone number against rows already in the CSV.
      * Writes the header row when starting a fresh/empty file (previously
        missing, which left the first run's CSV header-less and made the next
        run's DictReader misparse the first data row).
      * Backs off with an escalating wait on 403/405/429 responses; after 3
        consecutive blocks the current city is skipped and the counter reset
        so the next city is not skipped on its very first blocked response.
      * Flushes after every page so partial progress survives a crash.
    """
    # Imported lazily so the module can be imported without these installed.
    import requests
    from bs4 import BeautifulSoup

    session = requests.Session()
    # Browser-like headers; the site blocks obvious bot user agents.
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-AU,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
    })

    # Seed the dedupe set with phones already present in the output CSV.
    existing_phones = set()
    out_path = Path(OUTPUT_FILE)
    if out_path.exists():
        with open(OUTPUT_FILE, encoding="utf-8", errors="replace") as f:
            for row in csv.DictReader(f):
                p = re.sub(r"[^\d+]", "", row.get("phone", ""))
                if p and len(p) >= 8:
                    existing_phones.add(p)
    log_progress(f"START: Loaded {len(existing_phones):,} existing phones")

    FIELDNAMES = ["business_name","trade_category","phone","website","street_address",
                  "suburb","state","postcode","email","rating","review_count","source_url","scraped_date"]

    total_new = 0
    consecutive_blocks = 0

    with open(OUTPUT_FILE, "a", newline="", encoding="utf-8") as fout:
        writer = csv.DictWriter(fout, fieldnames=FIELDNAMES)
        # Fix: a brand-new (or empty) file needs a header row, otherwise the
        # CSV is unreadable by csv.DictReader on subsequent runs. Append-mode
        # open creates the file, so only the size check is needed here.
        if out_path.stat().st_size == 0:
            writer.writeheader()
            fout.flush()

        for trade in TRADES:
            trade_new = 0
            log_progress(f"\n=== TRADE: {trade.upper()} ===")

            for city in CITIES:
                page = 1
                city_new = 0
                no_result_pages = 0

                while True:
                    url = f"https://www.localsearch.com.au/find/{trade}/{city}"
                    if page > 1:
                        url += f"?page={page}"

                    try:
                        resp = session.get(url, timeout=20)

                        if resp.status_code in (403, 405, 429):
                            consecutive_blocks += 1
                            logging.error(f"Blocked {resp.status_code}: {url}")
                            # Escalating backoff, capped at 3 minutes.
                            wait = min(60 * consecutive_blocks, 180)
                            log_progress(f"  BLOCKED ({resp.status_code}) {trade}/{city} p{page} - wait {wait}s")
                            if consecutive_blocks >= 3:
                                log_progress(f"  Too many blocks on {trade}/{city}, skipping city")
                                # Fix: reset so the next city gets a fresh
                                # chance instead of being skipped immediately.
                                consecutive_blocks = 0
                                break
                            time.sleep(wait)
                            continue  # retry the same page after backing off

                        if resp.status_code != 200:
                            logging.error(f"HTTP {resp.status_code}: {url}")
                            log_progress(f"  HTTP {resp.status_code}: {trade}/{city} p{page}")
                            break

                        consecutive_blocks = 0
                        soup = BeautifulSoup(resp.text, "html.parser")
                        records = extract_listings_from_page(soup, url, trade, city)

                        # Write only phones we have never seen before.
                        page_new = 0
                        for rec in records:
                            phone_key = re.sub(r"[^\d+]", "", rec["phone"])
                            if phone_key in existing_phones:
                                continue
                            existing_phones.add(phone_key)
                            writer.writerow(rec)
                            page_new += 1
                            city_new += 1
                            trade_new += 1
                            total_new += 1

                        fout.flush()

                        if page_new > 0:
                            no_result_pages = 0
                            log_progress(f"  {trade}/{city} p{page}: +{page_new} ({city_new} this city)")
                        else:
                            # Stop paging after 2 duplicate-only pages, or
                            # right away when page 1 has no listings at all.
                            no_result_pages += 1
                            if no_result_pages >= 2 or (page == 1 and len(records) == 0):
                                break

                        page += 1
                        time.sleep(2.5 + (0.5 * (page % 3)))  # slight variation

                    except Exception as e:
                        logging.error(f"Exception on {url}: {e}")
                        log_progress(f"  ERROR: {trade}/{city} p{page}: {e}")
                        break

                if city_new > 0:
                    log_progress(f"  DONE: {trade}/{city} = {city_new} new")

            log_progress(f"TRADE TOTAL: {trade} = {trade_new} new records")

    log_progress(f"\n{'='*50}")
    log_progress(f"SCRAPE COMPLETE: {total_new:,} new records added")
    log_progress(f"Total unique phones: {len(existing_phones):,}")
    log_progress(f"Output: {OUTPUT_FILE}")


# Entry point: run the full trade x city scrape when executed as a script.
if __name__ == "__main__":
    run_scraper()
