import asyncio
import csv
import re
import random
import os
from playwright.async_api import async_playwright, TimeoutError

# Maps the URL path slug used by localsearch.com.au ("find/<slug>/<city>")
# to the normalized trade label written into the CSV's trade_category column.
TRADES = {
    "plumbers": "plumber",
    "electricians": "electrician",
    "air-conditioning": "air-conditioning",
    "builders": "builder",
    "concreters": "concreter",
    "roofing-contractors": "roofing-contractor",
    "landscapers": "landscaper",
    "painters": "painter",
    "tilers": "tiler",
    "fencing-contractors": "fencing-contractor",
    "carpenters": "carpenter",
}

# City slugs (Queensland, Australia) in localsearch.com.au URL format.
QLD_CITIES = [
    "brisbane-qld", "gold-coast-qld", "sunshine-coast-qld",
    "cairns-qld", "townsville-qld", "toowoomba-qld",
    "mackay-qld", "rockhampton-qld", "bundaberg-qld",
    "hervey-bay-qld", "gladstone-qld", "mount-isa-qld",
]

async def extract_listing_data(listing, page_url, trade_label):
    """Extract one business record from a single listing card.

    Args:
        listing: Playwright ElementHandle for a ``div[data-testid="listing-card"]``.
        page_url: URL of the results page the card was found on (recorded
            as ``source_url``).
        trade_label: Normalized trade name for the ``trade_category`` field.

    Returns:
        dict of string fields. Any datum missing from the card becomes
        ``''`` so every CSV row has a uniform shape; sub-element lookups
        never raise out of this function.
    """
    # Fix: get_attribute() returns None when the attribute is absent.
    # Default to '' so this field matches all the others (which default '').
    business_name = await listing.get_attribute('data-business-name') or ''
    trade_category = trade_label

    # Phone number comes from a tel: anchor; strip the scheme prefix.
    try:
        phone_element = await listing.query_selector('a[href^="tel:"]')
        phone = await phone_element.get_attribute('href') if phone_element else ''
        phone = phone.replace('tel:', '') if phone else ''
    except Exception:
        phone = ''

    # Outbound website link, identified by its analytics attribute.
    try:
        website_element = await listing.query_selector('a[data-ga-action="Website"]')
        website = await website_element.get_attribute('href') if website_element else ''
    except Exception:
        website = ''

    # Address text looks like "123 Main St, Suburb QLD 4000"; split on
    # commas, then pull suburb/state/postcode off the last segment.
    # NOTE(review): state defaults to 'QLD' even when parsing fails —
    # presumably intentional since only QLD cities are scraped.
    street_address, suburb, state, postcode = '', '', 'QLD', ''
    try:
        address_element = await listing.query_selector('address')
        if address_element:
            full_address_text = await address_element.text_content()
            parts = [p.strip() for p in full_address_text.split(',')]
            if len(parts) >= 2:
                street_address = parts[0]
                last_part = parts[-1]
                # "<suburb> <STATE> <4-digit postcode>" at end of string.
                match = re.search(r'^(.*?)\s*([A-Za-z]{2,3})\s*(\d{4})$', last_part)
                if match:
                    suburb = match.group(1).strip()
                    state = match.group(2).strip()
                    postcode = match.group(3).strip()
    except Exception:
        pass

    # Star rating is exposed via an aria-label like "4.5 average rating".
    rating = ''
    try:
        rating_span = await listing.query_selector('span[aria-label$="average rating"]')
        if rating_span:
            aria_label = await rating_span.get_attribute('aria-label')
            if aria_label:
                rating_match = re.search(r'(\d+(\.\d+)?)\s+average rating', aria_label)
                if rating_match:
                    rating = rating_match.group(1)
    except Exception:
        pass

    # Review count, e.g. "(12 reviews)". The class selector is brittle
    # (styled-components hash) — expect this to break on site redesigns.
    review_count = ''
    try:
        review_count_element = await listing.query_selector('p.sc-kpDqfm.cWCAF')
        if review_count_element:
            review_count_text = await review_count_element.text_content()
            review_count_match = re.search(r'\((\d+)\s+review', review_count_text)
            if review_count_match:
                review_count = review_count_match.group(1)
    except Exception:
        pass

    return {
        "business_name": business_name,
        "trade_category": trade_category,
        "phone": phone,
        "website": website,
        "street_address": street_address,
        "suburb": suburb,
        "state": state,
        "postcode": postcode,
        "email": '',
        "rating": rating,
        "review_count": review_count,
        "source_url": page_url,
    }

async def scrape_category_city(page, category, city, trade_label, csv_file_path):
    """Scrape every results page for one trade category in one city.

    Walks ``?page=1, 2, ...`` until navigation fails, the page has no
    listing cards, or no "Next page" link is present. Listings with a
    phone number are appended to ``csv_file_path`` after each page so a
    crash loses at most one page of work.

    Args:
        page: Playwright Page used for navigation.
        category: URL slug, e.g. ``"plumbers"``.
        city: City slug, e.g. ``"brisbane-qld"``.
        trade_label: Normalized label stored in each row's trade_category.
        csv_file_path: CSV file to append rows to (header auto-written).

    Returns:
        int: number of rows written for this category/city pair.
    """
    base_url = f"https://www.localsearch.com.au/find/{category}/{city}"
    page_num = 1
    total_extracted = 0

    while True:
        url = f"{base_url}?page={page_num}"
        print(f"Scraping: {url}")
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            await page.wait_for_selector('div[data-testid="listing-card"]', timeout=30000)
        except Exception as e:
            # Broad on purpose: a timeout here usually means we paged past
            # the last results page; treat any navigation failure as done.
            print(f"Finished or error at {url}: {e}")
            break

        listings = await page.query_selector_all('div[data-testid="listing-card"]')
        if not listings:
            break

        batch = []
        for listing in listings:
            try:
                data = await extract_listing_data(listing, url, trade_label)
                if data['phone']:  # Only keep if phone exists
                    batch.append(data)
            except Exception:
                # One broken card shouldn't abort the whole page.
                continue

        if batch:
            # Fix: write the header when the file is missing OR exists but
            # is empty (e.g. a prior run crashed right after creating it);
            # the old isfile()-only check left such files headerless.
            needs_header = (
                not os.path.isfile(csv_file_path)
                or os.path.getsize(csv_file_path) == 0
            )
            with open(csv_file_path, 'a', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=batch[0].keys())
                if needs_header:
                    writer.writeheader()
                writer.writerows(batch)
            total_extracted += len(batch)

        next_button = await page.query_selector('a[aria-label="Next page"]')
        if not next_button:
            break

        page_num += 1
        # Jittered delay between pages to avoid hammering the site.
        await asyncio.sleep(random.uniform(1, 2))

    print(f"Completed {category}/{city}: {total_extracted} listings.")
    return total_extracted

async def main():
    """Crawl every trade/city combination and append all leads to one CSV."""
    output_dir = "/mnt/e/genesis-system/data/LEADS"
    os.makedirs(output_dir, exist_ok=True)
    csv_file_path = os.path.join(output_dir, "localsearch_tradies.csv")

    grand_total = 0
    async with async_playwright() as pw:
        # One browser/page reused for the whole crawl.
        browser = await pw.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        for slug, label in TRADES.items():
            for locality in QLD_CITIES:
                grand_total += await scrape_category_city(
                    page, slug, locality, label, csv_file_path
                )
                # Randomized pause between category/city combinations.
                await asyncio.sleep(random.uniform(2, 5))

        await browser.close()
    print(f"MASTER SCRAPE COMPLETE. Total extracted: {grand_total}")

# Script entry point: run the full crawl when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
