#!/usr/bin/env python3
"""
LocalSearch Brisbane Concreters Scraper using Playwright
Scrapes concreters in Brisbane, QLD from localsearch.com.au.
"""
import csv, time, os, re, logging
from datetime import date
from pathlib import Path
from bs4 import BeautifulSoup
import sys
import random

# Add the scripts directory to the Python path
sys.path.append(str(Path(__file__).parent))

from browser_agent import BrowserAgent

OUTPUT_FILE = "/mnt/e/genesis-system/data/LEADS/localsearch_brisbane_concreters.csv"
ERROR_LOG = "/mnt/e/genesis-system/data/LEADS/localsearch_brisbane_concreters_errors.log"
TODAY = date.today().isoformat()

# Search slugs as they appear in localsearch.com.au URLs.
TRADES = ["concreters"]
CITIES = ["brisbane-qld"]

# Create the output directory up front: logging.basicConfig(filename=...) and
# the later open(OUTPUT_FILE, ...) both raise FileNotFoundError if it is missing.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

logging.basicConfig(filename=ERROR_LOG, level=logging.ERROR,
                    format="%(asctime)s %(message)s",
                    filemode='a')  # Append so errors from prior runs are kept

def _load_existing_phones(csv_path):
    """Return the set of non-empty phone strings stored in *csv_path*.

    Used for cross-run de-duplication: listings whose phone is already in
    the returned set are skipped later. Returns an empty set when the file
    is missing, has no header row, or the header lacks a "phone" column.
    """
    phones = set()
    path = Path(csv_path)
    if not path.exists():
        return phones
    with open(path, encoding="utf-8", errors="replace") as f:
        reader = csv.reader(f)
        header = next(reader, None)  # first row is the column header
        if not header:
            print("Existing CSV is empty or has no header. Starting fresh.")
            return phones
        try:
            # Only the column lookup can raise ValueError; keep the try minimal.
            phone_col_index = header.index("phone")
        except ValueError:
            print("Phone column not found in existing CSV. Starting fresh.")
            return phones
        for row in reader:
            # Rows may be ragged; guard against short rows before indexing.
            if len(row) > phone_col_index:
                p = row[phone_col_index].strip()
                if p:
                    phones.add(p)
    return phones


# Load existing phones to avoid dupes across runs.
existing_phones = _load_existing_phones(OUTPUT_FILE)

print(f"Existing unique phones: {len(existing_phones):,}")

total_new = 0  # rows appended across all trade/city combos this run

# Initialize BrowserAgent (project-local browser wrapper imported above).
agent = BrowserAgent(headless=True) # Run in headless mode
try:
    agent.start()
    agent.new_page()

    # Write header only if file does not exist or is empty
    if not Path(OUTPUT_FILE).exists() or Path(OUTPUT_FILE).stat().st_size == 0:
        with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as fout:
            writer = csv.writer(fout)
            writer.writerow([
                "business_name", "trade_category", "phone", "website",
                "street_address", "suburb", "state", "postcode", "email",
                "rating", "review_count", "source_url"
            ])

    # Open in append mode so rows from earlier runs are preserved.
    with open(OUTPUT_FILE, "a", newline="", encoding="utf-8") as fout:
        writer = csv.writer(fout)
        
        for trade in TRADES:
            for city in CITIES:
                page = 1
                city_trade_count = 0  # new rows written for this trade/city combo
                
                # Paginate until no listings are found or a page adds nothing new.
                while True:
                    url = f"https://www.localsearch.com.au/find/{trade}/{city}"
                    if page > 1:
                        url += f"?page={page}"
                    
                    try:
                        print(f"Navigating to {url}...")
                        if not agent.navigate(url):
                            logging.error(f"Failed to navigate to {url}")
                            break
                        
                        # Give the page some time to load, instead of waiting for a
                        # specific selector. NOTE(review): fixed 5 s wait assumes the
                        # results have rendered by then -- confirm on slow connections.
                        time.sleep(5) # Wait 5 seconds
                        
                        soup = BeautifulSoup(agent.get_html(), "html.parser")
                        
                        # Find business listings using more generic selectors.
                        # Look for common parent containers that hold a business name,
                        # phone, etc. (class-name heuristics, not exact site selectors).
                        listings = soup.find_all(re.compile(r"div|article"), class_=re.compile(r"card|listing|business|search-result"))
                        
                        if not listings:
                            # Fallback to even more generic div/article search if specific class names don't work
                            # This looks for elements that directly contain an <a> tag with a profile link, or a tel: link
                            listings = soup.find_all(lambda tag: tag.name in ["div", "article"] and (
                                tag.find("a", href=re.compile(r"/profile/")) or tag.find("a", href=re.compile(r"^tel:"))
                            ))

                        if not listings:
                            print(f"  No listings found on page {page} for {trade}/{city}")
                            if page == 1:
                                print(f"  No listings found: {trade}/{city}")
                            break
                        
                        page_new = 0  # rows written from this page only
                        for listing in listings:
                            # Extract fields. Each field tries several fallback
                            # selectors, since exact markup is not guaranteed.
                            name_el = listing.find(re.compile(r"h[1-6]"), class_=re.compile(r"name|title|business-name|company-name")) or \
                                      listing.find("a", class_=re.compile(r"name|title|business-name|company-name")) or \
                                      listing.find("span", class_=re.compile(r"name|title|business-name|company-name")) or \
                                      listing.find("div", class_=re.compile(r"name|title|business-name|company-name"))
                                      
                            # Prefer an explicit tel: link, then class-based elements.
                            phone_el = listing.find("a", href=re.compile(r"^tel:")) or \
                                       listing.find("span", class_=re.compile(r"phone|contact-number")) or \
                                       listing.find("div", class_=re.compile(r"phone|contact-number"))
                                       
                            # NOTE(review): first absolute http(s) link in the listing --
                            # may capture a non-business link (e.g. an internal absolute
                            # URL); verify against live markup.
                            web_el = listing.find("a", href=re.compile(r"^http"))
                                     
                            addr_el = listing.find("address") or \
                                      listing.find("p", class_=re.compile(r"address|location|locality")) or \
                                      listing.find("div", class_=re.compile(r"address|location|locality"))
                                      
                            rating_el = listing.find(class_=re.compile(r"rating|stars")) or \
                                        listing.find("span", class_=re.compile(r"rating|stars"))
                                        
                            review_el = listing.find(class_=re.compile(r"review-count|reviews")) or \
                                        listing.find("span", class_=re.compile(r"review-count|reviews"))
                                      
                            # Detail-page link, used as source_url when present.
                            link_el = listing.find("a", href=re.compile(r"/profile/"))
                            
                            name = name_el.get_text(strip=True) if name_el else ""
                            
                            # Phone: take the tel: href payload when available,
                            # otherwise the element's visible text.
                            phone = ""
                            if phone_el:
                                href = phone_el.get("href", "")
                                if href.startswith("tel:"):
                                    phone = href.replace("tel:", "").strip()
                                else:
                                    phone = phone_el.get_text(strip=True)
                            # Normalize: keep only digits and '+' (drops spaces, dashes,
                            # parentheses) so dedup keys compare consistently.
                            phone = re.sub(r"[^\d+]", "", phone)
                            
                            website = web_el.get("href", "") if web_el else ""
                            
                            address = addr_el.get_text(strip=True) if addr_el else ""
                            # Suburb is derived from the URL slug, not the page content.
                            suburb = city.replace("-qld","").replace("-"," ").title()
                            state = "QLD"
                            
                            # First standalone 4-digit group in the address is treated
                            # as the postcode (Australian postcodes are 4 digits).
                            postcode = ""
                            postcode_match = re.search(r'\b\d{4}\b', address)
                            if postcode_match:
                                postcode = postcode_match.group(0)

                            email = "" # Email extraction is complex, leaving blank for now
                            
                            # Rating: first numeric token (e.g. "4.5") in the element text.
                            rating = ""
                            if rating_el:
                                m = re.search(r"[\d.]+", rating_el.get_text())
                                if m: rating = m.group()
                            
                            # Review count: first integer in the element text.
                            reviews = ""
                            if review_el:
                                m = re.search(r"\d+", review_el.get_text())
                                if m: reviews = m.group()
                            
                            # Absolute profile URL when available; fall back to the
                            # search-results page URL.
                            source_url = ""
                            if link_el:
                                href = link_el.get("href","")
                                source_url = f"https://www.localsearch.com.au{href}" if href.startswith("/") else href
                            if not source_url:
                                source_url = url
                            
                            # Skip containers where no business name could be extracted.
                            if not name:
                                continue
                            
                            # Skip phones already seen (this run or prior CSV contents).
                            if phone and phone in existing_phones:
                                continue
                            
                            if phone:
                                existing_phones.add(phone)
                            
                            writer.writerow([
                                name, trade, phone, website, address, suburb, state,
                                postcode, email, rating, reviews, source_url
                            ])
                            page_new += 1
                            city_trade_count += 1
                            total_new += 1
                        
                        print(f"  {trade}/{city} page {page}: +{page_new} new (total this combo: {city_trade_count})")
                        fout.flush()  # persist progress in case a later page crashes
                        
                        # NOTE(review): a page consisting entirely of duplicates also
                        # stops pagination here -- confirm that is the intended tradeoff.
                        if page_new == 0:
                            break  # No new results, stop paginating
                        
                        page += 1
                        time.sleep(random.uniform(2.0, 3.0)) # Random politeness delay between pages
                        
                    except Exception as e:
                        # Log and abandon this trade/city combo; continue with the next.
                        logging.error(f"Error on {url}: {e}")
                        print(f"  ERROR: {url} -> {e}")
                        break
                
                if city_trade_count > 0:
                    print(f"  DONE: {trade}/{city} = {city_trade_count} new listings")
    
    print(f"\nTOTAL NEW RECORDS ADDED: {total_new:,}")
    print(f"Total unique phones now: {len(existing_phones):,}")
    
except Exception as e:
    logging.critical(f"Critical error during scraping: {e}")
    print(f"CRITICAL ERROR: {e}")
finally:
    # Always shut the browser down, even after a critical failure.
    agent.stop()