# scripts/localsearch_brisbane_electricians_scraper.py
import csv, time, re, logging
from datetime import date
from pathlib import Path
from playwright.sync_api import sync_playwright

# Output locations for the scraped leads CSV and the error/diagnostic log.
OUTPUT_FILE = "/mnt/e/genesis-system/data/LEADS/localsearch_brisbane_electricians.csv"
ERROR_LOG = "/mnt/e/genesis-system/data/LEADS/localsearch_brisbane_electricians_errors.log"
TODAY = date.today().isoformat()  # scrape date stamped onto every CSV row

TARGET_TRADE = "electricians"
TARGET_CITY = "brisbane-qld"
BASE_URL = f"https://www.localsearch.com.au/find/{TARGET_TRADE}/{TARGET_CITY}"

# Level is WARNING (not ERROR) so the scraper's logging.warning() diagnostics
# (e.g. "no listing card found") are actually recorded; at ERROR they were
# silently filtered out.
logging.basicConfig(filename=ERROR_LOG, level=logging.WARNING,
                    format="%(asctime)s %(message)s")

def extract_address_components(address_string):
    """Split an address string into (street_address, suburb, state, postcode).

    Expects text roughly of the form "<street>, <suburb> <STATE> <postcode>".
    Any component that cannot be identified is returned as an empty string.

    Args:
        address_string: Raw address text scraped from a listing card.

    Returns:
        Tuple of four strings: (street_address, suburb, state, postcode).
    """
    street_address = ""
    suburb = ""
    state = ""
    postcode = ""

    # Primary pattern: street, then suburb, a 2-3 letter uppercase state code,
    # and a 4-digit postcode at the end of the string.
    match = re.search(r"^(.*?),\s*([^,]+?)\s*([A-Z]{2,3})\s*(\d{4})$", address_string)
    if match:
        street_address = match.group(1).strip()
        suburb = match.group(2).strip()
        state = match.group(3).strip()
        postcode = match.group(4).strip()
    else:
        # Fallback: split on commas and use heuristics. Two-part addresses
        # ("12 Foo St, Brisbane") are accepted too — the previous >= 3
        # requirement returned nothing at all for them.
        parts = [p.strip() for p in address_string.split(',')]
        if len(parts) >= 2:
            street_address = parts[0]
            last_part = parts[-1]
            state_postcode_match = re.search(r"([A-Z]{2,3})\s*(\d{4})", last_part)
            if state_postcode_match:
                state = state_postcode_match.group(1)
                postcode = state_postcode_match.group(2)
                # Whatever precedes the state/postcode in the last part is the suburb.
                suburb_candidate = last_part.replace(state_postcode_match.group(0), '').strip()
                if suburb_candidate:
                    suburb = suburb_candidate
                elif len(parts) >= 3:
                    # Only borrow the second-to-last part when it isn't the
                    # street itself (guards the two-part case).
                    suburb = parts[-2]
            else:
                suburb = parts[-1]  # Assume last part is suburb if no state/postcode found

    return street_address, suburb, state, postcode


def _save_debug_artifacts(page, page_num):
    """Save a screenshot and the raw HTML of the current page for debugging."""
    screenshot_path = f"/home/authentic88/.gemini/tmp/localsearch_debug_page_{page_num}.png"
    page.screenshot(path=screenshot_path)
    print(f"Screenshot saved to: {screenshot_path}")

    html_content_path = f"/home/authentic88/.gemini/tmp/localsearch_debug_page_{page_num}.html"
    with open(html_content_path, "w", encoding="utf-8") as f:
        f.write(page.content())
    print(f"Page HTML content saved to: {html_content_path}")


def _parse_listing_card(listing_element):
    """Extract one CSV row dict from a listing-card element.

    Returns None when the card lacks a business name or phone number
    (the minimum needed for a usable lead).
    """
    business_name = listing_element.query_selector("[data-testid='listing-name']")
    phone_link = listing_element.query_selector("a[href^='tel:']")

    name = business_name.text_content().strip() if business_name else ""
    phone = phone_link.get_attribute("href").replace("tel:", "").strip() if phone_link else ""
    if not name or not phone:
        return None  # Basic check for valid listing

    website_link = listing_element.query_selector("a[data-testid='website-link']")
    address_span = listing_element.query_selector("[data-testid='listing-address']")
    rating_span = listing_element.query_selector("span[data-testid='rating']")
    review_count_span = listing_element.query_selector(".review-count")  # Common class
    source_url_link = listing_element.query_selector("a[href*='/profile/']")

    website = website_link.get_attribute("href").strip() if website_link else ""
    address_full = address_span.text_content().strip() if address_span else ""
    rating = rating_span.text_content().strip() if rating_span else ""
    review_count_text = review_count_span.text_content().strip() if review_count_span else ""
    # Single regex evaluation (previously the same search ran twice per card).
    count_match = re.search(r'\d+', review_count_text)
    review_count = count_match.group(0) if count_match else ""

    src_url = source_url_link.get_attribute("href").strip() if source_url_link else ""
    if src_url and not src_url.startswith("http"):
        src_url = f"https://www.localsearch.com.au{src_url}"

    street_address, suburb, state, postcode = extract_address_components(address_full)

    # Use the TARGET_CITY as a fallback for suburb if not found in address parsing
    if not suburb:
        suburb = TARGET_CITY.replace("-qld", "").replace("-", " ").title()
    if not state:
        state = "QLD"  # Hardcode QLD as per task

    return {
        "business_name": name,
        "trade_category": TARGET_TRADE,
        "phone": phone,
        "website": website,
        "street_address": street_address,
        "suburb": suburb,
        "state": state,
        "postcode": postcode,
        # Email is rarely directly visible on listing cards, often requires visiting profile
        "email": "",
        "rating": rating,
        "review_count": review_count,
        "source_url": src_url,
        "scraped_date": TODAY
    }


def scrape_localsearch():
    """Scrape electrician listings from LocalSearch Brisbane into OUTPUT_FILE.

    Paginates through BASE_URL until no further listing cards are found,
    writing one CSV row per card that has both a name and a phone number.

    Returns:
        int: total number of rows written to the CSV.
    """
    total_listings_extracted = 0

    # Define CSV headers
    fieldnames = [
        "business_name", "trade_category", "phone", "website",
        "street_address", "suburb", "state", "postcode", "email",
        "rating", "review_count", "source_url", "scraped_date"
    ]

    # Ensure the directory exists
    Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)

    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()

            page_num = 1
            while True:
                url = f"{BASE_URL}?page={page_num}" if page_num > 1 else BASE_URL
                print(f"Navigating to: {url}")

                try:
                    page.goto(url, wait_until="domcontentloaded", timeout=60000)
                    time.sleep(2.5)  # Rate limit delay

                    _save_debug_artifacts(page, page_num)

                    # Wait for at least one listing card to appear.
                    # NOTE: this warning only reaches the log file if the
                    # module's basicConfig level admits WARNING records.
                    try:
                        page.wait_for_selector("div[data-testid='listing-card']", timeout=10000)
                    except Exception as e:
                        logging.warning(f"No listing card found within timeout on {url}: {e}")
                        # Continue to check for "No results found" or break if it's the first page

                    # Check for "No results found" or similar indicators
                    no_results = page.locator("body").filter(has_text="No results found").count() > 0 or \
                                 page.locator("body").filter(has_text="Sorry, we couldn't find any results").count() > 0
                    if no_results:
                        if page_num == 1:
                            print(f"No listings found for {TARGET_TRADE} in {TARGET_CITY}.")
                        # On later pages this simply means the end of the listings.
                        break

                    listings = page.query_selector_all("div[data-testid='listing-card']")
                    if not listings:
                        # Fallback for other listing card structures
                        listings = page.query_selector_all("article.listing-card")
                    if not listings:
                        # If still no listings, might be end of pagination or a different layout
                        if page_num == 1:
                            print(f"Could not find listings on the first page for {TARGET_TRADE} in {TARGET_CITY}. Exiting.")
                        break

                    listings_on_page = 0
                    for listing_element in listings:
                        row = _parse_listing_card(listing_element)
                        if row is not None:
                            writer.writerow(row)
                            listings_on_page += 1
                            total_listings_extracted += 1

                    if listings_on_page == 0 and page_num > 1:
                        # No new listings found on this page, and it's not the first page, so stop.
                        print(f"No more new listings found on page {page_num}. Ending pagination.")
                        break

                    page_num += 1

                except Exception as e:
                    logging.error(f"Error on page {url}: {e}")
                    print(f"ERROR: Could not process {url} -> {e}")
                    break

            browser.close()

    print(f"Scraping complete. Total new records added: {total_listings_extracted}")
    return total_listings_extracted

# Script entry point: run the scraper and report how many rows were written.
if __name__ == "__main__":
    count = scrape_localsearch()
    print(f"Final summary: Extracted {count} electrician listings from LocalSearch Brisbane.")
