import asyncio
import csv
import re
import random
from playwright.async_api import async_playwright, TimeoutError
import os

def _parse_au_address(full_address_text):
    """Parse an Australian address string into its components.

    Expects text shaped like ``"123 Main St, Brisbane QLD 4000"`` (comma
    separated, with the last segment carrying suburb/state/postcode).

    Args:
        full_address_text: Raw text content of the listing's <address> tag.

    Returns:
        Tuple of (street_address, suburb, state, postcode); any component
        that cannot be determined is ''.
    """
    street_address, suburb, state, postcode = '', '', '', ''
    parts = [p.strip() for p in full_address_text.split(',')]

    if len(parts) >= 2:
        street_address = parts[0]
        last_part = parts[-1]
        # Typical form: "<Suburb> <STATE> <4-digit postcode>".
        # AU state abbreviations are 2-3 letters (QLD, NSW, ACT, ...).
        match = re.search(r'^(.*?)\s*([A-Za-z]{2,3})\s*(\d{4})$', last_part)
        if match:
            suburb = match.group(1).strip()
            state = match.group(2).strip()
            postcode = match.group(3).strip()
        else:
            # Fallback: split on whitespace when the regex does not match.
            words = last_part.split()
            if len(words) >= 3:
                postcode = words[-1]
                state = words[-2]
                suburb = ' '.join(words[:-2])
            elif len(words) == 2:
                state, postcode = words
            else:
                # If all else fails, put everything in suburb.
                suburb = last_part

    # Single-segment address: treat the whole string as the street address.
    if not street_address and parts:
        street_address = parts[0]

    return street_address, suburb, state, postcode


async def extract_listing_data(listing, page_url):
    """Extract structured business data from one listing card element.

    Args:
        listing: Playwright ElementHandle for a 'listing-card' div.
        page_url: URL of the results page this listing was found on.

    Returns:
        dict with keys: business_name, trade_category, phone, website,
        street_address, suburb, state, postcode, email, rating,
        review_count, source_url. Any field that cannot be extracted
        defaults to ''. Extraction is best-effort: per-field DOM errors
        are swallowed rather than aborting the whole listing.
    """
    # get_attribute returns None when the attribute is absent; normalize
    # to '' so every output field is a string.
    business_name = await listing.get_attribute('data-business-name') or ''
    trade_category = "Plumber"  # Hardcoded as per the task description

    phone = ''
    try:
        phone_element = await listing.query_selector('a[href^="tel:"]')
        if phone_element:
            href = await phone_element.get_attribute('href')
            if href:
                phone = href.replace('tel:', '')
    except Exception:
        pass  # Best-effort: leave phone empty on any DOM error.

    website = ''
    try:
        website_element = await listing.query_selector('a[data-ga-action="Website"]')
        if website_element:
            website = await website_element.get_attribute('href') or ''
    except Exception:
        pass

    street_address, suburb, state, postcode = '', '', '', ''
    try:
        address_element = await listing.query_selector('address')
        if address_element:
            full_address_text = await address_element.text_content()
            street_address, suburb, state, postcode = _parse_au_address(full_address_text)
    except Exception as e:
        print(f"Error parsing address: {e}")
        # Keep defaults.

    email = ''  # Email is not directly visible on localsearch listings without clicking through

    rating = ''
    try:
        # Look for the span with aria-label containing "average rating"
        rating_span = await listing.query_selector('span[aria-label$="average rating"]')
        if rating_span:
            aria_label = await rating_span.get_attribute('aria-label')
            if aria_label:
                rating_match = re.search(r'(\d+(\.\d+)?)\s+average rating', aria_label)
                if rating_match:
                    rating = rating_match.group(1)
    except Exception:
        pass

    review_count = ''
    try:
        # Find the element that often contains the review count, usually near the rating.
        # This might need refinement based on exact HTML structure.
        review_count_element = await listing.query_selector('p.sc-kpDqfm.cWCAF')
        if review_count_element:
            review_count_text = await review_count_element.text_content()
            # Matches "(X review" or "(X reviews"
            review_count_match = re.search(r'\((\d+)\s+review', review_count_text)
            if review_count_match:
                review_count = review_count_match.group(1)
    except Exception:
        pass

    return {
        "business_name": business_name,
        "trade_category": trade_category,
        "phone": phone,
        "website": website,
        "street_address": street_address,
        "suburb": suburb,
        "state": state,
        "postcode": postcode,
        "email": email,
        "rating": rating,
        "review_count": review_count,
        "source_url": page_url,
    }

async def scrape_localsearch(
    base_url="https://www.localsearch.com.au/find/plumbers/brisbane-qld",
    max_pages=24,
    output_dir="/mnt/e/genesis-system/data/LEADS",
    csv_filename="localsearch_brisbane_plumbers.csv",
):
    """Scrape paginated localsearch.com.au listing pages and save to CSV.

    Walks ``{base_url}?page=N`` until no listings remain, no "Next page"
    link exists, or ``max_pages`` is reached, extracting each listing via
    extract_listing_data and writing all rows to a CSV file.

    Args:
        base_url: Search-results URL without the ``?page=`` query string.
        max_pages: Stop once the current page indicator reaches this number.
        output_dir: Directory for the CSV output (created if missing).
        csv_filename: Name of the CSV file written inside output_dir.
    """
    all_listings_data = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[]  # No special args
        )
        context = await browser.new_context()  # No custom user agent or locale
        page = await context.new_page()

        page_num = 1
        while True:
            url = f"{base_url}?page={page_num}"
            print(f"Navigating to {url}")
            try:
                # domcontentloaded with long timeouts: listing cards render
                # after initial load, so wait for them explicitly.
                await page.goto(url, wait_until="domcontentloaded", timeout=120000)
                await page.wait_for_selector('div[data-testid="listing-card"]', timeout=90000)
            except TimeoutError:
                print(f"Timeout navigating to {url} or waiting for listing cards. It might be the end or an issue. Ending scrape.")
                break
            except Exception as e:
                print(f"An error occurred navigating to {url}: {e}")
                break

            listings = await page.query_selector_all('div[data-testid="listing-card"]')
            if not listings:
                print(f"No listings found on {url}. Ending scrape.")
                break

            for listing in listings:
                try:
                    data = await extract_listing_data(listing, url)
                    all_listings_data.append(data)
                except Exception as e:
                    print(f"Error extracting data from a listing on {url}: {e}")
                    continue

            next_page_button = await page.query_selector('a[aria-label="Next page"]')
            if not next_page_button:
                print("No 'Next page' button found. Assuming end of pagination.")
                break

            next_page_href = await next_page_button.get_attribute('href')
            # Anchored match: a plain substring test would let "page=1"
            # falsely match "page=12" and end the scrape too early.
            if not next_page_href or re.search(rf'[?&]page={page_num}(?:&|$)', next_page_href):
                print("Next page button is disabled or points to current page. Ending scrape.")
                break

            current_page_indicator = await page.query_selector('a[aria-current="page"]')
            if current_page_indicator:
                current_page_number_text = await current_page_indicator.text_content()
                try:
                    current_page_number = int(current_page_number_text)
                    if current_page_number >= max_pages:
                        print(f"Reached page {current_page_number}. Ending scrape as per ~{max_pages} pages limit.")
                        break
                except ValueError:
                    print("Could not parse current page number. Continuing.")

            page_num += 1
            await asyncio.sleep(random.uniform(2, 3))  # Rate limit

        await browser.close()

    os.makedirs(output_dir, exist_ok=True)
    csv_file_path = os.path.join(output_dir, csv_filename)

    if all_listings_data:
        # Field order comes from the dict returned by extract_listing_data.
        keys = all_listings_data[0].keys()
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(all_listings_data)
        print(f"Scraping complete. Extracted {len(all_listings_data)} listings.")
        print(f"Data saved to {csv_file_path}")
    else:
        print("No data extracted.")

# Script entry point: run the full scrape when executed directly.
if __name__ == "__main__":
    asyncio.run(scrape_localsearch())