
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import random

def fetch_page(url, delay=2):
    """Fetch *url* and return its HTML body, or None on any request error.

    Sleeps for ``delay`` seconds plus up to one extra second of random
    jitter before each request, to avoid hammering the server.
    """
    jitter = random.uniform(0, 1)
    time.sleep(delay + jitter)
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as exc:
        print(f"Error fetching {url}: {exc}")
        return None
    return resp.text

def parse_listing(listing_div, trade_category, source_url):
    """Extract one business record from a listing-card <div>.

    Parameters
    ----------
    listing_div : bs4.Tag
        The ``div.listing-card__content`` element for a single business.
    trade_category : str
        Category label attached to the record (e.g. 'Tilers').
    source_url : str
        URL of the results page this listing was scraped from.

    Returns
    -------
    dict
        Flat record with name, contact, address components, rating and
        metadata. Missing fields are None; review_count defaults to 0.
    """
    def _text(tag_name, css_class):
        # Stripped text of the first matching child element, or None.
        node = listing_div.find(tag_name, class_=css_class)
        return node.text.strip() if node else None

    business_name = _text('h3', 'listing-card__title')
    phone = _text('a', 'listing-card__phone-link')

    website_link = listing_div.find('a', class_='listing-card__website-link')
    website = website_link['href'] if website_link else None

    # Address components. Expected format: "Street, Suburb, State Postcode".
    # This is a bit brittle; may need more robust parsing for other formats.
    street_address = suburb = state = postcode = None
    address_div = listing_div.find('address', class_='listing-card__address')
    if address_div:
        address_parts = [part.strip() for part in address_div.get_text(separator=',').split(',')]
        if len(address_parts) >= 3:
            street_address = address_parts[0]
            suburb = address_parts[1]
            state_postcode = address_parts[2].split()
            if len(state_postcode) == 2:
                state, postcode = state_postcode
            elif len(state_postcode) == 1:
                # Lone token: a 4-digit AU postcode, otherwise a state code.
                # fullmatch (not match) so e.g. '12345' is not misread as a postcode.
                if re.fullmatch(r'\d{4}', state_postcode[0]):
                    postcode = state_postcode[0]
                else:
                    state = state_postcode[0]

    # Rating: guard the float() conversion against non-numeric text.
    rating = None
    rating_span = listing_div.find('span', class_='rating-static-v2__value')
    if rating_span:
        try:
            rating = float(rating_span.text.strip())
        except ValueError:
            rating = None

    # Review count: first run of digits in the span's text, else 0.
    review_count = 0
    review_count_span = listing_div.find('span', class_='listing-card__reviews-count')
    if review_count_span:
        digits = re.search(r'\d+', review_count_span.text)
        if digits:
            review_count = int(digits.group())

    # Email is not exposed on listing cards; kept None for schema completeness.
    return {
        'business_name': business_name,
        'trade_category': trade_category,
        'phone': phone,
        'website': website,
        'street_address': street_address,
        'suburb': suburb,
        'state': state,
        'postcode': postcode,
        'email': None,
        'rating': rating,
        'review_count': review_count,
        'source_url': source_url,
    }

def scrape_category(base_url, trade_category, max_pages=None):
    """Scrape every paginated results page for one trade category.

    Parameters
    ----------
    base_url : str
        URL of the first results page (pagination appends ``?page=N``).
    trade_category : str
        Label stored on each scraped record.
    max_pages : int or None
        Optional safety cap on the number of pages fetched. None (the
        default) scrapes until pagination runs out, matching the original
        unbounded behavior.

    Returns
    -------
    list[dict]
        One record per listing, as produced by parse_listing().
    """
    all_listings = []
    page_num = 1
    # max_pages bounds the loop; without it a pathological pagination
    # response could make this loop forever.
    while max_pages is None or page_num <= max_pages:
        url = f"{base_url}?page={page_num}" if page_num > 1 else base_url
        print(f"Scraping {trade_category} - Page {page_num}: {url}")
        page_content = fetch_page(url)
        if not page_content:
            break

        soup = BeautifulSoup(page_content, 'html.parser')
        listings = soup.find_all('div', class_='listing-card__content')
        if not listings:
            print(f"No listings found on page {page_num} for {trade_category}. Ending scrape for this category.")
            break

        all_listings.extend(
            parse_listing(listing_div, trade_category, url) for listing_div in listings
        )

        # Stop when pagination ends: no "next" link, or a disabled one.
        next_button = soup.find('a', class_='pagination-button--next')
        if not next_button:
            print(f"No next page button found for {trade_category}. Ending scrape for this category.")
            break
        if 'disabled' in next_button.get('class', []):
            print(f"Next page button is disabled for {trade_category}. Ending scrape for this category.")
            break

        page_num += 1
    return all_listings

if __name__ == "__main__":
    import os

    # Category label -> first results page URL.
    categories = {
        'Landscapers': 'https://www.localsearch.com.au/find/landscapers/brisbane-qld',
        'Tilers': 'https://www.localsearch.com.au/find/tilers/brisbane-qld'
    }

    all_scraped_data = []
    category_counts = {}

    # Scrape each category in turn and accumulate every record.
    for trade_category, url in categories.items():
        listings = scrape_category(url, trade_category)
        all_scraped_data.extend(listings)
        category_counts[trade_category] = len(listings)
        print(f"Finished scraping {trade_category}. Found {len(listings)} listings.")

    # Persist the combined results as a single CSV.
    output_path = '/mnt/e/genesis-system/data/LEADS/localsearch_brisbane_landscapers_tilers.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    pd.DataFrame(all_scraped_data).to_csv(output_path, index=False)

    print(f"\nScraping complete. Data saved to {output_path}")
    for category, count in category_counts.items():
        print(f"{category}: {count} listings")
