#!/usr/bin/env python3
"""
Tradie Lead Scraper v2 — ReceptionistAI Distribution Pipeline
============================================================
Scrapes LocalSearch.com.au for high-value Australian trade businesses.
Stores results in Elestio PostgreSQL (tradie_leads table).
Designed to run daily via scheduler.

Targets: plumbers, electricians, builders, HVAC in
Brisbane, Sydney, Melbourne, Cairns, Gold Coast, Perth, Adelaide

Usage:
    python tradie_lead_scraper_v2.py                  # Run full scrape
    python tradie_lead_scraper_v2.py --city brisbane  # Single city
    python tradie_lead_scraper_v2.py --trade plumber  # Single trade
    python tradie_lead_scraper_v2.py --setup-db       # Create table only
"""

import sys
import os
import time
import random
import logging
import argparse
import hashlib
from datetime import datetime, date
from typing import Optional

# Add genesis-memory to path for Elestio config (handles both WSL and Windows)
import platform
if platform.system() == 'Windows':
    sys.path.insert(0, r'E:\genesis-system\data\genesis-memory')
else:
    sys.path.insert(0, '/mnt/e/genesis-system/data/genesis-memory')

import requests
from bs4 import BeautifulSoup
import psycopg2
from psycopg2.extras import execute_values
from elestio_config import PostgresConfig

# ─── Logging ──────────────────────────────────────────────────────────────────

# Log file lives under genesis-system/logs; the path differs between Windows
# and WSL, mirroring the sys.path handling above.
_LOG_PATH = (
    r'E:\genesis-system\logs\tradie_scraper_v2.log' if platform.system() == 'Windows'
    else '/mnt/e/genesis-system/logs/tradie_scraper_v2.log'
)
# FileHandler opens the file at import time, so the directory must already
# exist — previously it was only created later inside run_scrape(), which
# crashed the import on a fresh machine.
os.makedirs(os.path.dirname(_LOG_PATH), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(_LOG_PATH, encoding='utf-8'),
    ]
)
log = logging.getLogger('tradie_scraper_v2')

# ─── Config ───────────────────────────────────────────────────────────────────

# Scrape matrix — every (city × trade) pair below is crawled each run.
# 'slug' values mirror LocalSearch URL path segments (joined under BASE_URL
# in scrape_city_trade); 'area_code' is stored verbatim with each lead.
TARGETS = {
    'cities': [
        {'name': 'Brisbane', 'slug': 'brisbane-city-qld-4000', 'state': 'QLD', 'area_code': '07'},
        {'name': 'Cairns',   'slug': 'cairns-qld-4870',        'state': 'QLD', 'area_code': '07'},
        {'name': 'Gold Coast','slug': 'gold-coast-qld-4217',   'state': 'QLD', 'area_code': '07'},
        {'name': 'Sydney',   'slug': 'sydney-nsw-2000',        'state': 'NSW', 'area_code': '02'},
        {'name': 'Melbourne','slug': 'melbourne-vic-3000',     'state': 'VIC', 'area_code': '03'},
        {'name': 'Perth',    'slug': 'perth-wa-6000',          'state': 'WA',  'area_code': '08'},
        {'name': 'Adelaide', 'slug': 'adelaide-sa-5000',       'state': 'SA',  'area_code': '08'},
    ],
    'trades': [
        {'name': 'plumber',     'slug': 'plumbers'},
        {'name': 'electrician', 'slug': 'electricians'},
        {'name': 'builder',     'slug': 'builders'},
        {'name': 'hvac',        'slug': 'air-conditioning-installation-service'},
        {'name': 'concreter',   'slug': 'concreters'},
    ]
}

# Desktop-Chrome-like request headers plus an on-site Referer so requests
# resemble ordinary browser navigation rather than a script.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/121.0.0.0 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-AU,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Referer': 'https://www.localsearch.com.au/',
}

BASE_URL = 'https://www.localsearch.com.au/find'
MAX_PAGES = 5            # pages per city/trade combo
DELAY_MIN = 2.5          # seconds between requests (min)
DELAY_MAX = 6.0          # seconds between requests (max)
REQUEST_TIMEOUT = 20     # seconds per HTTP request
RETRY_ATTEMPTS = 3       # attempts per URL in make_request()


# ─── Database ─────────────────────────────────────────────────────────────────

# DDL for the leads table. Idempotent (IF NOT EXISTS throughout) so it can be
# executed on every run. lead_hash is the SHA-256 dedup key produced by
# compute_lead_hash(); outreach_* columns are filled later by the outreach
# pipeline, not by this scraper.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS tradie_leads (
    id                  SERIAL PRIMARY KEY,
    lead_hash           VARCHAR(64) UNIQUE NOT NULL,  -- dedup key: name+phone
    business_name       TEXT NOT NULL,
    trade               VARCHAR(100),
    city                VARCHAR(100),
    state               VARCHAR(10),
    area_code           VARCHAR(5),
    suburb              VARCHAR(200),
    phone               VARCHAR(50),
    email               VARCHAR(255),
    website             VARCHAR(500),
    rating              NUMERIC(3,1),
    review_count        INTEGER DEFAULT 0,
    listing_url         TEXT,
    scrape_source       VARCHAR(100) DEFAULT 'localsearch.com.au',
    outreach_status     VARCHAR(50)  DEFAULT 'not_contacted',
    outreach_sent_at    TIMESTAMP,
    outreach_reply_at   TIMESTAMP,
    outreach_channel    VARCHAR(50),  -- email / sms / manual
    notes               TEXT,
    priority_score      INTEGER DEFAULT 0,  -- computed score for outreach ordering
    created_at          TIMESTAMP DEFAULT NOW(),
    updated_at          TIMESTAMP DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_tradie_leads_city    ON tradie_leads(city);
CREATE INDEX IF NOT EXISTS idx_tradie_leads_trade   ON tradie_leads(trade);
CREATE INDEX IF NOT EXISTS idx_tradie_leads_status  ON tradie_leads(outreach_status);
CREATE INDEX IF NOT EXISTS idx_tradie_leads_hash    ON tradie_leads(lead_hash);
CREATE INDEX IF NOT EXISTS idx_tradie_leads_score   ON tradie_leads(priority_score DESC);
"""

# Trigger that keeps updated_at current on any UPDATE. CREATE OR REPLACE plus
# DROP TRIGGER IF EXISTS make re-installation safe on every run.
UPDATE_TIMESTAMP_SQL = """
CREATE OR REPLACE FUNCTION update_tradie_leads_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS tradie_leads_updated_at ON tradie_leads;
CREATE TRIGGER tradie_leads_updated_at
    BEFORE UPDATE ON tradie_leads
    FOR EACH ROW
    EXECUTE FUNCTION update_tradie_leads_updated_at();
"""


def get_db_connection():
    """Open and return a fresh psycopg2 connection to the Elestio PostgreSQL instance."""
    return psycopg2.connect(**PostgresConfig.get_connection_params())


def setup_database():
    """Create the tradie_leads table, indexes, and updated_at trigger.

    Idempotent — the DDL uses IF NOT EXISTS / CREATE OR REPLACE, so this is
    safe to call on every run.

    The table DDL and the trigger DDL now run in separate transactions.
    Previously a trigger failure was swallowed with a bare pass while the
    shared transaction stayed aborted, so the final commit() silently rolled
    back the table creation as well. A trigger failure is logged and is
    non-fatal (upserts set updated_at explicitly anyway).
    """
    log.info("Setting up tradie_leads table...")
    conn = get_db_connection()
    try:
        # Transaction 1: table + indexes. Must succeed.
        with conn.cursor() as cur:
            cur.execute(CREATE_TABLE_SQL)
        conn.commit()

        # Transaction 2: trigger. Best-effort, but never silent.
        try:
            with conn.cursor() as cur:
                cur.execute(UPDATE_TIMESTAMP_SQL)
            conn.commit()
        except Exception as e:
            conn.rollback()
            log.warning(f"Could not install updated_at trigger: {e}")

        log.info("Database setup complete.")
    finally:
        conn.close()


def compute_lead_hash(business_name: str, phone: str) -> str:
    """Return a stable SHA-256 dedup key built from normalised name + phone.

    The name is lower-cased and stripped, the phone only stripped, so minor
    whitespace/case variations of the same business hash identically.
    """
    normalised = '|'.join((business_name.lower().strip(), phone.strip()))
    return hashlib.sha256(normalised.encode()).hexdigest()[:64]


def compute_priority_score(rating: Optional[float], review_count: int, has_email: bool) -> int:
    """
    Return a 0-100 outreach-priority score; higher = better prospect.

    Rationale: a solid rating plus a healthy review count signals a busy
    business — exactly the kind that misses calls and needs ReceptionistAI.
    Components: rating band (up to 30) + review band (up to 30) + contactable
    email (20) + very-high-volume bonus for 100+ reviews (20), capped at 100.
    """
    rating_pts = 0
    if rating is not None:
        for floor, pts in ((4.5, 30), (4.0, 20), (3.5, 10)):
            if rating >= floor:
                rating_pts = pts
                break

    review_pts = next(
        (pts for floor, pts in ((50, 30), (20, 20), (10, 10), (5, 5))
         if review_count >= floor),
        0,
    )

    email_pts = 20 if has_email else 0
    # Proven business — high review count = high call volume = ideal prospect
    volume_bonus = 20 if review_count >= 100 else 0

    return min(rating_pts + review_pts + email_pts + volume_bonus, 100)


# ─── Scraping ─────────────────────────────────────────────────────────────────

def make_request(url: str, session: requests.Session) -> Optional[BeautifulSoup]:
    """GET *url* through *session* and return the parsed page.

    Retries up to RETRY_ATTEMPTS times on timeouts, connection errors, and
    retryable HTTP statuses; backs off extra long on 429. Returns None
    immediately for 403/404 and after the final failed attempt.
    """
    for attempt in range(1, RETRY_ATTEMPTS + 1):
        try:
            resp = session.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.Timeout:
            log.warning(f"Timeout on attempt {attempt} for {url}")
        except requests.RequestException as e:
            log.warning(f"Request error: {e}")
        else:
            status = resp.status_code
            if status == 200:
                return BeautifulSoup(resp.text, 'html.parser')
            if status == 429:
                # Rate limited — wait well beyond the normal retry delay.
                wait = 30 + random.uniform(10, 30)
                log.warning(f"Rate limited. Waiting {wait:.0f}s...")
                time.sleep(wait)
            elif status in (403, 404):
                # Blocked or gone — retrying won't help.
                log.warning(f"HTTP {status} for {url}")
                return None
            else:
                log.warning(f"HTTP {status} for {url}")

        if attempt < RETRY_ATTEMPTS:
            time.sleep(random.uniform(5, 12))

    log.error(f"All {RETRY_ATTEMPTS} attempts failed for {url}")
    return None


def parse_listing_card(card) -> Optional[dict]:
    """
    Extract one business lead from a LocalSearch listing card element.

    Several CSS selectors are tried per field, in priority order, to survive
    layout variations across page templates. Returns a dict with keys
    business_name / phone / email / website / suburb / rating / review_count /
    listing_url, or None when no usable business name can be found (or any
    unexpected parse error occurs).
    """
    def pick(*selectors):
        # First truthy match wins (empty BS4 tags are falsy, matching the
        # original `or`-chain semantics).
        for sel in selectors:
            el = card.select_one(sel)
            if el:
                return el
        return None

    try:
        name_el = pick(
            'h2.listing-name',
            '[class*="business-name"]',
            'h2',
            '[data-business-name]',
        )
        if not name_el:
            return None
        name = name_el.get_text(strip=True)
        if not name:
            return None

        lead = {
            'business_name': name,
            'phone': '',
            'email': '',
            'website': '',
            'suburb': '',
            'rating': None,
            'review_count': 0,
            'listing_url': '',
        }

        # Phone — prefer the href (tel: link) over visible text.
        phone_el = pick('[class*="phone"]', '[href^="tel:"]', '[data-phone]')
        if phone_el:
            raw_phone = phone_el.get('href', '') or phone_el.get_text(strip=True)
            lead['phone'] = raw_phone.replace('tel:', '').strip()

        # Email — only accept something that at least contains '@'.
        email_el = card.select_one('[href^="mailto:"]')
        if email_el:
            addr = email_el.get('href', '').replace('mailto:', '').strip()
            if '@' in addr:
                lead['email'] = addr

        website_el = pick('[class*="website"]', '[data-website]')
        if website_el:
            lead['website'] = website_el.get('href', '').strip()

        addr_el = pick('[class*="address"]', '[class*="suburb"]', '[itemprop="streetAddress"]')
        if addr_el:
            lead['suburb'] = addr_el.get_text(strip=True)

        # Rating — microdata 'content' attribute beats display text.
        rating_el = pick('[class*="rating"]', '[itemprop="ratingValue"]', '[class*="star"]')
        if rating_el:
            rating_text = rating_el.get('content') or rating_el.get_text(strip=True)
            try:
                lead['rating'] = float(rating_text.split()[0])
            except (ValueError, IndexError):
                pass

        # Review count — strip everything but digits (e.g. "(42 reviews)").
        review_el = pick('[class*="review-count"]', '[itemprop="reviewCount"]', '[class*="reviews"]')
        if review_el:
            count_text = review_el.get('content') or review_el.get_text(strip=True)
            digits = ''.join(ch for ch in count_text if ch.isdigit())
            if digits:
                lead['review_count'] = int(digits)

        # Canonical listing URL, absolutised when relative.
        link_el = card.select_one('a[href*="/find/"]') or card.select_one('a')
        if link_el:
            href = link_el.get('href', '')
            if href.startswith('http'):
                lead['listing_url'] = href
            elif href.startswith('/'):
                lead['listing_url'] = f"https://www.localsearch.com.au{href}"

        return lead

    except Exception as e:
        log.debug(f"Card parse error: {e}")
        return None


def scrape_city_trade(city: dict, trade: dict, session: requests.Session) -> list:
    """Walk the paginated results for one city/trade pair.

    Stops at MAX_PAGES, on a failed request, or when a page yields no cards
    or no parseable leads. Returns the collected lead dicts, each tagged with
    the city/trade metadata.
    """
    collected = []
    base = f"{BASE_URL}/{trade['slug']}/{city['slug']}"

    for page_num in range(1, MAX_PAGES + 1):
        page_url = base if page_num == 1 else f"{base}?page={page_num}"

        log.info(f"Scraping: {city['name']} / {trade['name']} — page {page_num}")
        soup = make_request(page_url, session)
        if not soup:
            log.warning(f"No response for {page_url}, stopping pagination.")
            break

        # LocalSearch has shipped several card markups; probe selectors in
        # priority order and keep the first that matches anything.
        cards = []
        for selector in (
            '[class*="listing-card"]',
            '[class*="business-card"]',
            'article[class*="listing"]',
            '.listing-result',
            '[data-listing-id]',
        ):
            cards = soup.select(selector)
            if cards:
                break

        if not cards:
            log.info(f"No cards found on page {page_num} — end of results.")
            break

        found_here = 0
        for card in cards:
            parsed = parse_listing_card(card)
            if not parsed:
                continue
            parsed['trade'] = trade['name']
            parsed['city'] = city['name']
            parsed['state'] = city['state']
            parsed['area_code'] = city['area_code']
            collected.append(parsed)
            found_here += 1

        log.info(f"  Found {found_here} leads on page {page_num}")
        if found_here == 0:
            break

        # Polite delay between pages
        pause = random.uniform(DELAY_MIN, DELAY_MAX)
        log.debug(f"  Sleeping {pause:.1f}s...")
        time.sleep(pause)

    return collected


# ─── Database Upsert ──────────────────────────────────────────────────────────

def upsert_leads(leads: list, conn) -> tuple:
    """
    Insert leads into tradie_leads, updating existing rows on hash conflict.

    Leads missing a business name or phone are dropped (both are needed for
    the dedup hash). Duplicate hashes WITHIN the batch are also dropped:
    PostgreSQL raises "ON CONFLICT DO UPDATE command cannot affect row a
    second time" when the same key appears twice in one INSERT, which
    previously aborted the entire batch whenever a business showed up on
    more than one results page.

    Args:
        leads: lead dicts as produced by scrape_city_trade().
        conn:  open psycopg2 connection (committed/rolled back here).

    Returns:
        (inserted_count, updated_count); (0, 0) when nothing usable was
        supplied or the upsert failed (error is logged, transaction rolled
        back).
    """
    inserted = 0
    updated = 0

    UPSERT_SQL = """
    INSERT INTO tradie_leads (
        lead_hash, business_name, trade, city, state, area_code,
        suburb, phone, email, website, rating, review_count,
        listing_url, priority_score
    ) VALUES %s
    ON CONFLICT (lead_hash) DO UPDATE SET
        rating        = EXCLUDED.rating,
        review_count  = EXCLUDED.review_count,
        email         = CASE WHEN EXCLUDED.email != '' THEN EXCLUDED.email ELSE tradie_leads.email END,
        website       = CASE WHEN EXCLUDED.website != '' THEN EXCLUDED.website ELSE tradie_leads.website END,
        suburb        = CASE WHEN EXCLUDED.suburb != '' THEN EXCLUDED.suburb ELSE tradie_leads.suburb END,
        priority_score = EXCLUDED.priority_score,
        updated_at    = NOW()
    RETURNING (xmax = 0) AS is_insert
    """

    rows = []
    seen_hashes = set()
    for lead in leads:
        if not lead.get('business_name') or not lead.get('phone'):
            continue
        lead_hash = compute_lead_hash(lead['business_name'], lead.get('phone', ''))
        if lead_hash in seen_hashes:
            # Same business scraped twice in this run — keep the first copy.
            continue
        seen_hashes.add(lead_hash)
        priority = compute_priority_score(
            lead.get('rating'),
            lead.get('review_count', 0),
            bool(lead.get('email', ''))
        )
        rows.append((
            lead_hash,
            lead['business_name'],
            lead.get('trade', ''),
            lead.get('city', ''),
            lead.get('state', ''),
            lead.get('area_code', ''),
            lead.get('suburb', ''),
            lead.get('phone', ''),
            lead.get('email', ''),
            lead.get('website', ''),
            lead.get('rating'),
            lead.get('review_count', 0),
            lead.get('listing_url', ''),
            priority,
        ))

    if not rows:
        return 0, 0

    try:
        with conn.cursor() as cur:
            # (xmax = 0) distinguishes fresh inserts from conflict-updates.
            results = execute_values(cur, UPSERT_SQL, rows, fetch=True)
            for (is_insert,) in results:
                if is_insert:
                    inserted += 1
                else:
                    updated += 1
        conn.commit()
    except Exception as e:
        conn.rollback()
        log.error(f"DB upsert error: {e}")

    return inserted, updated


# ─── Main Orchestration ────────────────────────────────────────────────────────

def run_scrape(city_filter: Optional[str] = None, trade_filter: Optional[str] = None):
    """
    Main entry point: scrape all configured city/trade combos and persist leads.

    Args:
        city_filter:  case-insensitive city name; limits the run to one city.
        trade_filter: case-insensitive trade name; limits the run to one trade.

    An unknown filter value logs an error and aborts without scraping.
    A failure in one city/trade combo is logged and the run continues.
    """
    log.info("=" * 60)
    log.info("ReceptionistAI Tradie Lead Scraper v2 — STARTING")
    log.info(f"Date: {date.today()}")
    log.info("=" * 60)

    # Ensure logs dir exists — path differs between Windows and WSL.
    # (Previously the Windows path was used unconditionally, which on WSL
    # created a literal directory named 'E:\genesis-system\logs' in the CWD.)
    logs_dir = (
        r'E:\genesis-system\logs' if platform.system() == 'Windows'
        else '/mnt/e/genesis-system/logs'
    )
    os.makedirs(logs_dir, exist_ok=True)

    # Setup DB (idempotent)
    setup_database()

    cities = TARGETS['cities']
    trades = TARGETS['trades']

    if city_filter:
        cities = [c for c in cities if c['name'].lower() == city_filter.lower()]
        if not cities:
            log.error(f"Unknown city: {city_filter}")
            return

    if trade_filter:
        trades = [t for t in trades if t['name'].lower() == trade_filter.lower()]
        if not trades:
            log.error(f"Unknown trade: {trade_filter}")
            return

    # One session for the whole run so cookies / keep-alive persist.
    session = requests.Session()
    session.headers.update(HEADERS)

    total_inserted = 0
    total_skipped = 0
    total_scraped = 0

    conn = get_db_connection()
    try:
        for city in cities:
            for trade in trades:
                try:
                    leads = scrape_city_trade(city, trade, session)
                    total_scraped += len(leads)
                    ins, skp = upsert_leads(leads, conn)
                    total_inserted += ins
                    total_skipped += skp
                    log.info(
                        f"  {city['name']} / {trade['name']}: "
                        f"scraped={len(leads)}, inserted={ins}, updated={skp}"
                    )
                except Exception as e:
                    # One failed combo must not kill the whole run.
                    log.error(f"Error scraping {city['name']}/{trade['name']}: {e}")

                # Delay between city/trade combos
                time.sleep(random.uniform(3, 8))
    finally:
        conn.close()

    log.info("=" * 60)
    log.info(f"COMPLETE: scraped={total_scraped}, inserted={total_inserted}, updated={total_skipped}")
    log.info("=" * 60)


if __name__ == '__main__':
    # CLI: run everything by default, or narrow to a city/trade, or just
    # provision the database schema.
    cli = argparse.ArgumentParser(description='ReceptionistAI Tradie Lead Scraper v2')
    cli.add_argument('--city', help='Filter to single city (e.g. Brisbane)')
    cli.add_argument('--trade', help='Filter to single trade (e.g. plumber)')
    cli.add_argument('--setup-db', action='store_true', help='Create DB table only')
    opts = cli.parse_args()

    if opts.setup_db:
        setup_database()
        print("Database setup complete.")
    else:
        run_scrape(city_filter=opts.city, trade_filter=opts.trade)
