#!/usr/bin/env python3
"""
LocalSearch Brisbane Concreters Scraper
Scrapes concreters in Brisbane, QLD from localsearch.com.au.
"""
import csv, time, os, re, logging
from datetime import date
from pathlib import Path

# --- Configuration ----------------------------------------------------------
# Destination CSV for scraped leads, plus a companion error log.
OUTPUT_FILE = "/mnt/e/genesis-system/data/LEADS/localsearch_brisbane_concreters.csv"
ERROR_LOG = "/mnt/e/genesis-system/data/LEADS/localsearch_brisbane_concreters_errors.log"

# ISO-8601 run date (not referenced elsewhere in this script).
TODAY = date.today().isoformat()

# Trade categories and city slugs to crawl; each pair becomes a
# localsearch.com.au /find/<trade>/<city> URL.
TRADES = ["concreters"]
CITIES = ["brisbane-qld"]

# Errors are appended to the log file with a timestamp prefix.
logging.basicConfig(
    filename=ERROR_LOG,
    level=logging.ERROR,
    format="%(asctime)s %(message)s",
)

# Seed the dedupe set with phone numbers already present in the output CSV,
# so re-running the scraper only appends genuinely new listings.
existing_phones = set()
if Path(OUTPUT_FILE).exists():
    with open(OUTPUT_FILE, encoding="utf-8", errors="replace") as f:
        rows = csv.reader(f)
        header = next(rows, None)  # first row is the column header
        if not header:
            print("Existing CSV is empty or has no header. Starting fresh.")
        elif "phone" not in header:
            print("Phone column not found in existing CSV. Starting fresh.")
        else:
            phone_idx = header.index("phone")
            for row in rows:
                # Tolerate short/malformed rows left by interrupted runs.
                if len(row) > phone_idx:
                    number = row[phone_idx].strip()
                    if number:
                        existing_phones.add(number)

print(f"Existing unique phones: {len(existing_phones):,}")

try:
    # Third-party deps are imported lazily so a missing install prints
    # friendly instructions instead of a bare ImportError traceback.
    from bs4 import BeautifulSoup
    import requests

    # Browser-like headers: the site is more likely to serve normal HTML
    # (rather than a bot block) to something that looks like Chrome.
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-AU,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.localsearch.com.au/"
    })

    total_new = 0

    # Write header only if file does not exist or is empty
    if not Path(OUTPUT_FILE).exists() or Path(OUTPUT_FILE).stat().st_size == 0:
        with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as fout:
            writer = csv.writer(fout)
            writer.writerow([
                "business_name", "trade_category", "phone", "website",
                "street_address", "suburb", "state", "postcode", "email",
                "rating", "review_count", "source_url"
            ])

    with open(OUTPUT_FILE, "a", newline="", encoding="utf-8") as fout:
        writer = csv.writer(fout)

        for trade in TRADES:
            for city in CITIES:
                page = 1
                city_trade_count = 0

                # Walk result pages until the site stops returning listings,
                # a page adds nothing new, or a request fails.
                while True:
                    url = f"https://www.localsearch.com.au/find/{trade}/{city}"
                    if page > 1:
                        url += f"?page={page}"

                    try:
                        resp = session.get(url, timeout=15)
                        if resp.status_code == 405:
                            # Treated as the site's bot-block response; abandon this combo.
                            logging.error("405 blocked: %s", url)
                            print(f"  BLOCKED (405): {url}")
                            break
                        if resp.status_code != 200:
                            logging.error("HTTP %s: %s", resp.status_code, url)
                            break

                        soup = BeautifulSoup(resp.text, "html.parser")

                        # Find business listings. The markup varies, so try
                        # selectors from most specific (data-testid) down to
                        # generic class-name regexes.
                        listings = (soup.find_all("div", attrs={"data-testid": "listing-card"}) or 
                                   soup.find_all("article", class_=re.compile(r"listing|business|card")) or 
                                   soup.find_all("div", class_=re.compile(r"listing-item|business-card|search-result")))

                        if not listings:
                            # No recognisable listing cards - end pagination.
                            # (FIX: removed a dead `scripts = soup.find_all(...)`
                            # JSON-LD scan whose result was never used.)
                            if page == 1:
                                print(f"  No listings found: {trade}/{city}")
                            break

                        page_new = 0
                        for listing in listings:
                            # Extract each field with the same specific-first
                            # fallback strategy used for the listing cards.
                            name_el = (listing.find(attrs={"data-testid": "listing-name"}) or 
                                      listing.find(class_=re.compile(r"business-name|listing-name|company-name")))
                            phone_el = (listing.find(attrs={"data-testid": "listing-phone"}) or 
                                       listing.find("a", href=re.compile(r"^tel:")) or 
                                       listing.find(class_=re.compile(r"phone|contact-number")))
                            web_el = (listing.find("a", attrs={"data-testid": "website-link"}) or 
                                     listing.find("a", rel="nofollow", href=re.compile(r"^http")))
                            addr_el = (listing.find(attrs={"data-testid": "listing-address"}) or 
                                      listing.find(class_=re.compile(r"address|location")))
                            rating_el = (listing.find(attrs={"data-testid": "rating"}) or 
                                        listing.find(class_=re.compile(r"rating|stars")))
                            review_el = listing.find(class_=re.compile(r"review-count|reviews"))
                            link_el = listing.find("a", href=re.compile(r"/profile/"))

                            name = name_el.get_text(strip=True) if name_el else ""

                            # Prefer the machine-readable tel: href over the
                            # displayed text, then strip to digits and '+'.
                            phone = ""
                            if phone_el:
                                href = phone_el.get("href", "")
                                if href.startswith("tel:"):
                                    phone = href.replace("tel:", "").strip()
                                else:
                                    phone = phone_el.get_text(strip=True)
                            phone = re.sub(r"[^\d+]", "", phone)

                            website = web_el.get("href", "") if web_el else ""

                            address = addr_el.get_text(strip=True) if addr_el else ""
                            # Derive suburb from the URL slug, e.g. "brisbane-qld" -> "Brisbane".
                            suburb = city.replace("-qld","").replace("-"," ").title()
                            state = "QLD"

                            # Australian postcodes are 4 digits; take the first
                            # 4-digit token found in the address, if any.
                            postcode = ""
                            postcode_match = re.search(r'\b\d{4}\b', address)
                            if postcode_match:
                                postcode = postcode_match.group(0)

                            email = "" # Email extraction is complex, leaving blank for now

                            rating = ""
                            if rating_el:
                                m = re.search(r"[\d.]+", rating_el.get_text())
                                if m: rating = m.group()

                            reviews = ""
                            if review_el:
                                m = re.search(r"\d+", review_el.get_text())
                                if m: reviews = m.group()

                            # Absolute profile URL when available, else the search page.
                            source_url = ""
                            if link_el:
                                href = link_el.get("href","")
                                source_url = f"https://www.localsearch.com.au{href}" if href.startswith("/") else href
                            if not source_url:
                                source_url = url

                            if not name:
                                continue

                            # De-dupe on phone across runs; listings without a
                            # phone are always written.
                            if phone and phone in existing_phones:
                                continue

                            if phone:
                                existing_phones.add(phone)

                            writer.writerow([
                                name, trade, phone, website, address, suburb, state,
                                postcode, email, rating, reviews, source_url
                            ])
                            page_new += 1
                            city_trade_count += 1
                            total_new += 1

                        print(f"  {trade}/{city} page {page}: +{page_new} new (total this combo: {city_trade_count})")
                        fout.flush()  # keep partial results on disk in case of a crash

                        if page_new == 0:
                            break  # No new results, stop paginating

                        page += 1
                        time.sleep(2.5)  # polite crawl delay between pages

                    except Exception as e:
                        # Best-effort scraping: log, report, and move on.
                        logging.error("Error on %s: %s", url, e)
                        print(f"  ERROR: {url} -> {e}")
                        break

                if city_trade_count > 0:
                    print(f"  DONE: {trade}/{city} = {city_trade_count} new listings")

    print(f"\nTOTAL NEW RECORDS ADDED: {total_new:,}")
    print(f"Total unique phones now: {len(existing_phones):,}")

except ImportError as e:
    print(f"Missing dependency: {e}")
    print("Install with: pip install requests beautifulsoup4")
