#!/usr/bin/env python3
"""
LEAD ENRICHER — Genesis ReceptionistAI Pipeline
For each tradie lead in PostgreSQL:
  - Checks if they have a working website
  - Scores website quality (0-10)
  - Estimates revenue potential
  - Calculates priority score (high = no website + many reviews + phone)
  - Updates tradie_leads table with enrichment columns

Usage:
    python lead_enricher.py [--limit N] [--all]

(Unenriched leads only by default; pass --all to re-enrich everything.)
"""

import sys
import re
import time
import random
import argparse
import logging
from datetime import datetime
from urllib.parse import urlparse

# Make the shared genesis-memory package (provides elestio_config) importable.
sys.path.append('E:/genesis-system/data/genesis-memory')

# Optional: requests for website checking
try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False  # website checks become no-ops (see check_website)

# All diagnostics go to a fixed log file; stdout only carries progress prints.
logging.basicConfig(
    filename='E:/genesis-system/data/LEADS/lead_enricher.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s'
)

# ─── DB ─────────────────────────────────────────────────────────────────────────

def get_db_conn():
    """Open and return a PostgreSQL connection using the shared Elestio config."""
    import psycopg2

    from elestio_config import PostgresConfig

    params = PostgresConfig.get_connection_params()
    return psycopg2.connect(**params)

def ensure_enrichment_columns(conn):
    """Add enrichment columns if they don't exist.

    Idempotent: uses ADD COLUMN IF NOT EXISTS, and rolls back any individual
    failure so the remaining columns can still be attempted.
    """
    columns = (
        ("has_website",     "BOOLEAN"),
        ("website_score",   "INTEGER"),
        ("priority_score",  "FLOAT"),
        ("estimated_revenue_band", "TEXT"),
        ("enriched_at",     "TIMESTAMP"),
        ("outreach_status", "TEXT DEFAULT 'not_contacted'"),
    )
    cur = conn.cursor()
    for col_name, col_type in columns:
        ddl = f"ALTER TABLE tradie_leads ADD COLUMN IF NOT EXISTS {col_name} {col_type}"
        try:
            cur.execute(ddl)
        except Exception as e:
            # Shouldn't normally fire thanks to IF NOT EXISTS; keep going.
            logging.debug(f"Column {col_name} may exist: {e}")
            conn.rollback()
    conn.commit()
    logging.info("Enrichment columns ready")

# ─── WEBSITE CHECKS ─────────────────────────────────────────────────────────────

def check_website(url: str) -> dict:
    """
    Check if a website is alive and score its quality.

    Args:
        url: Website URL; scheme is optional (https:// is assumed if missing).
            Falsy values short-circuit to the inert default result.

    Returns:
        dict with keys:
            alive (bool): True if the site responded with status < 400.
            status_code (int | None): Final HTTP status, None if unreachable.
            score (int): Quality score clamped to 0-10.
            signals (list[str]): Names of the signals that contributed.
            load_time_s (float | None): Request duration in seconds.

    Score 0-10:
      +3 if website loads (status < 400)
      +2 if has phone number on page
      +1 if has contact form / 'contact' link
      +1 if has Google Analytics / tag manager
      +1 if SSL (https)
      -2 if loads to generic parking page
      -1 if very slow (>5s)
    """
    result = {
        'alive': False,
        'status_code': None,
        'score': 0,
        'signals': [],
        'load_time_s': None,
    }

    # Nothing to check, or requests isn't installed: return the inert result.
    if not url or not HAS_REQUESTS:
        return result

    # Normalise URL: default to https when no scheme is given.
    if not url.startswith('http'):
        url = 'https://' + url

    try:
        start = time.time()
        resp = requests.get(
            url, timeout=8,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/121.0'},
            allow_redirects=True
        )
        load_time = time.time() - start
        result['load_time_s'] = round(load_time, 2)
        result['status_code'] = resp.status_code
        result['alive'] = resp.status_code < 400

        if not result['alive']:
            return result

        html = resp.text.lower()
        score = 0

        # Base: website loads
        score += 3
        result['signals'].append('website_live')

        # SSL: we requested https, or a redirect landed on https.
        if url.startswith('https://') or resp.url.startswith('https://'):
            score += 1
            result['signals'].append('ssl')

        # Australian phone number somewhere on the page (mobile or landline).
        if re.search(r'(\+61|0[4-9])\s*[\d\s\-]{7,12}', html):
            score += 2
            result['signals'].append('phone_on_page')

        # Contact link or form
        if 'contact' in html or '<form' in html:
            score += 1
            result['signals'].append('contact_present')

        # Google Analytics / GTM
        if 'google-analytics' in html or 'googletagmanager' in html or 'gtag' in html:
            score += 1
            result['signals'].append('analytics')

        # Parking / placeholder pages (negative signals)
        parking_signals = ['this domain is for sale', 'domain parking', 'coming soon',
                           'under construction', 'godaddy', 'namecheap parking']
        if any(p in html for p in parking_signals):
            score -= 2
            result['signals'].append('parking_page')

        # Slow load
        if load_time > 5:
            score -= 1
            result['signals'].append('slow_load')

        result['score'] = max(0, min(10, score))

    except requests.exceptions.SSLError:
        # Broken/missing certificate: retry once over plain HTTP.
        try:
            http_url = url.replace('https://', 'http://')
            start = time.time()
            resp = requests.get(http_url, timeout=6, allow_redirects=True,
                                headers={'User-Agent': 'Mozilla/5.0'})
            result['load_time_s'] = round(time.time() - start, 2)
            result['status_code'] = resp.status_code
            result['alive'] = resp.status_code < 400
            if result['alive']:
                result['score'] = 2  # Low score due to SSL issues
                result['signals'].append('no_ssl')
        except Exception:
            pass  # Best-effort fallback: site counts as dead.
    except Exception as e:
        # Timeouts, DNS failures, connection resets: treat as unreachable.
        logging.debug(f"Website check failed for {url}: {e}")

    return result

# ─── SCORING ────────────────────────────────────────────────────────────────────

def calculate_priority_score(lead: dict, website_check: dict) -> tuple:
    """
    Calculate outreach priority score (0-100), revenue band, and signals.
    High priority = tradies with NO website + many reviews + phone visible.
    These are exactly who needs ReceptionistAI most.

    Args:
        lead: Row dict from tradie_leads (website, review_count, rating,
            phone, suburb, source_url — all optional / nullable).
        website_check: Result dict from check_website() ('alive', 'score').

    Returns:
        (score, rev_band, signals) where score is int 0-100, rev_band is one
        of HIGH/MEDIUM/EMERGING/UNKNOWN, signals is a list of reason strings.

    Score breakdown:
      - No website:             +40 pts  (biggest pain point)
      - Has website but bad:    +20 pts  (needs upgrade)
      - Has website, good:      +5 pts   (still a lead, lower priority)
      - Reviews >= 20:          +20 pts
      - Reviews 10-19:          +15 pts
      - Reviews 1-9:            +8 pts
      - Rating >= 4.5:          +10 pts
      - Rating >= 4.0:          +5 pts
      - Has phone:              +10 pts  (can call them)
      - Cairns/regional:        +5 pts   (less competition)
    """
    score = 0
    signals = []

    # Website factor
    if not website_check.get('alive'):
        if not lead.get('website'):
            score += 40
            signals.append('NO_WEBSITE')
        else:
            score += 25  # Has URL but dead = urgent fix needed
            signals.append('DEAD_WEBSITE')
    else:
        ws = website_check.get('score', 0)
        if ws <= 3:
            score += 20
            signals.append('POOR_WEBSITE')
        elif ws <= 6:
            score += 10
            signals.append('AVERAGE_WEBSITE')
        else:
            score += 5
            signals.append('GOOD_WEBSITE')

    # Review count (may arrive as str/None from the DB)
    review_count = 0
    try:
        review_count = int(lead.get('review_count') or 0)
    except (ValueError, TypeError):
        pass

    if review_count >= 20:
        score += 20
        signals.append(f'HIGH_REVIEWS_{review_count}')
    elif review_count >= 10:
        score += 15
        signals.append(f'MED_REVIEWS_{review_count}')
    elif review_count >= 1:
        score += 8
        signals.append(f'LOW_REVIEWS_{review_count}')

    # Rating
    try:
        rating = float(lead.get('rating') or 0)
        if rating >= 4.5:
            score += 10
            signals.append(f'HIGH_RATING_{rating}')
        elif rating >= 4.0:
            score += 5
    except (ValueError, TypeError):
        pass

    # Phone present
    if lead.get('phone'):
        score += 10
        signals.append('HAS_PHONE')

    # Regional bonus (less competition from digital agencies).
    # The list uses hyphenated slugs; normalise spaces to hyphens so that
    # DB suburbs like "Hervey Bay" match 'hervey-bay' too.
    regional_cities = ['cairns', 'townsville', 'toowoomba', 'bundaberg', 'mackay',
                       'rockhampton', 'hervey-bay', 'gladstone', 'geelong', 'ballarat']
    suburb = (lead.get('suburb') or '').lower().replace(' ', '-')
    city = (lead.get('source_url') or '').lower().replace(' ', '-')
    if any(r in suburb or r in city for r in regional_cities):
        score += 5
        signals.append('REGIONAL')

    score = min(100, score)

    # Revenue band estimation (review count as a proxy for business size)
    if review_count >= 50:
        rev_band = 'HIGH'        # Established business, $500K+/yr
    elif review_count >= 15:
        rev_band = 'MEDIUM'      # Growing business, $200-500K/yr
    elif review_count >= 5:
        rev_band = 'EMERGING'    # $50-200K/yr
    else:
        rev_band = 'UNKNOWN'

    return score, rev_band, signals


# ─── MAIN ───────────────────────────────────────────────────────────────────────

def enrich_leads(limit: int = None, unenriched_only: bool = True):
    """
    Enrich tradie_leads rows with website, priority, and revenue data.

    For each lead: checks the website, computes priority score + revenue
    band, and writes the results back. Commits per lead so a crash loses at
    most the in-flight row; per-lead errors are logged and skipped.

    Args:
        limit: Max number of leads to process (None = all).
        unenriched_only: Only process rows where enriched_at IS NULL.

    Returns:
        Number of leads successfully enriched.
    """
    cols = ['id', 'business_name', 'trade_category', 'phone', 'website',
            'suburb', 'state', 'rating', 'review_count', 'source_url']

    conn = get_db_conn()
    try:  # ensure the connection is closed even if enrichment blows up
        ensure_enrichment_columns(conn)
        cur = conn.cursor()

        # Build the select; LIMIT is passed as a bound parameter instead of
        # being interpolated into the SQL string.
        query = f"SELECT {', '.join(cols)} FROM tradie_leads"
        if unenriched_only:
            query += " WHERE enriched_at IS NULL"
        query += " ORDER BY id"
        params = ()
        if limit:
            query += " LIMIT %s"
            params = (limit,)

        cur.execute(query, params)
        leads = cur.fetchall()

        print(f"Enriching {len(leads)} leads...")
        enriched = 0
        errors = 0

        for row in leads:
            lead = dict(zip(cols, row))
            lead_id = lead['id']

            try:
                # Website check (handles None/empty URL gracefully)
                ws_result = check_website(lead.get('website') or '')
                has_website = ws_result['alive']
                website_score = ws_result['score']

                # Priority scoring
                priority, rev_band, signals = calculate_priority_score(lead, ws_result)

                # Update DB
                cur.execute("""
                    UPDATE tradie_leads SET
                        has_website = %s,
                        website_score = %s,
                        priority_score = %s,
                        estimated_revenue_band = %s,
                        enriched_at = %s
                    WHERE id = %s
                """, (has_website, website_score, priority, rev_band,
                      datetime.now(), lead_id))
                conn.commit()
                enriched += 1

                if enriched % 10 == 0:
                    print(f"  Enriched {enriched}/{len(leads)} | "
                          f"Last: {lead['business_name'][:30]} | "
                          f"Score: {priority} | Signals: {signals}")

                # Polite delay so we don't hammer the checked websites
                time.sleep(random.uniform(0.3, 0.8))

            except Exception as e:
                conn.rollback()
                errors += 1
                logging.error(f"Enrichment error for lead {lead_id}: {e}")
    finally:
        conn.close()

    print(f"\nEnrichment complete: {enriched} enriched, {errors} errors")
    return enriched


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Lead Enricher')
    parser.add_argument('--limit', type=int, default=None,
                        help='Max leads to enrich')
    parser.add_argument('--all', action='store_true',
                        help='Re-enrich already enriched leads')
    args = parser.parse_args()
    enrich_leads(limit=args.limit, unenriched_only=not args.all)
