"""
Genesis Discovery Intelligence Extractor
=========================================
Extracts business intelligence from AgileAdapt discovery call transcripts
using Gemini API and stores results in PostgreSQL.

Features:
- Gemini-powered extraction with structured prompts
- Business metadata extraction (name, type, location, size)
- Pain point identification and categorization
- Current tools/software detection
- Budget signal analysis
- Lead scoring algorithm (hot/warm/cold)
- PostgreSQL storage (discovery_intelligence table)

Usage:
    from core.discovery.intelligence_extractor import IntelligenceExtractor

    extractor = IntelligenceExtractor()

    # Extract from transcript
    intel = extractor.extract_from_transcript(transcript_id=123)

    # Score a lead
    score = extractor.score_lead(intel)

IMPORTANT: NO SQLITE - Uses PostgreSQL (Elestio) only per Genesis rules.

VERIFICATION_STAMP
Story: STORY-1.2
Verified By: Claude Agent
Verified At: 2026-01-24
Tests: See test_intelligence_extractor.py
Coverage: Black box + White box tests included
"""

import os
import sys
import json
import hashlib
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional, Tuple, Union
from dataclasses import dataclass, asdict, field
from enum import Enum
import logging

# Add genesis-memory to path for Elestio config
sys.path.insert(0, '/mnt/e/genesis-system/data/genesis-memory')

# Elestio infrastructure imports
try:
    from elestio_config import PostgresConfig
except ImportError:
    from dataclasses import dataclass as fallback_dataclass

    @fallback_dataclass
    class PostgresConfig:
        """Fallback PostgreSQL configuration used when elestio_config cannot be imported."""
        # SECURITY: production credentials are hardcoded in source.  This password
        # is now exposed anywhere the file is shared -- move it to an environment
        # variable or secrets store and rotate the credential.
        host: str = "postgresql-genesis-u50607.vm.elestio.app"
        port: int = 25432
        user: str = "postgres"
        password: str = "etY0eog17tD-dDuj--IRH"
        database: str = "postgres"

        @classmethod
        def get_connection_params(cls) -> dict:
            # Mirrors the real elestio_config.PostgresConfig interface so callers
            # (e.g. IntelligenceExtractor._init_database) work with either class.
            config = cls()
            return {
                "host": config.host,
                "port": config.port,
                "user": config.user,
                "password": config.password,
                "database": config.database
            }

# Database imports
import psycopg2
from psycopg2 import pool
from psycopg2.extras import RealDictCursor

# Gemini imports
# The Gemini SDK is optional at import time; GENAI_AVAILABLE gates every
# extraction call (see IntelligenceExtractor._extract_with_gemini).
try:
    import google.generativeai as genai
    GENAI_AVAILABLE = True
except ImportError:
    GENAI_AVAILABLE = False

# Configure logging
# NOTE(review): basicConfig at import time configures the root logger for the
# whole process -- consider leaving this to the application entry point.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# =============================================================================
# ENUMS AND DATA CLASSES
# =============================================================================

class LeadScore(Enum):
    """Qualification tier assigned to a discovery-call lead."""

    # Strong buying signals: budget confirmed, urgent timeline, high intent.
    HOT = "hot"
    # Reasonable fit with moderate interest; timeline not yet firm.
    WARM = "warm"
    # Weak interest, no budget signal, or a poor solution fit.
    COLD = "cold"


class PainPointCategory(Enum):
    """Buckets used to classify pain points found in discovery calls."""

    EFFICIENCY = "efficiency"        # manual, time-wasting processes
    COST = "cost"                    # overspending, wasted resources
    GROWTH = "growth"                # unable to scale, missed opportunities
    QUALITY = "quality"              # errors, inconsistency, complaints
    INTEGRATION = "integration"      # disconnected systems, data silos
    COMMUNICATION = "communication"  # missed calls, slow response
    STAFFING = "staffing"            # hiring and training problems
    TECHNOLOGY = "technology"        # outdated systems, lack of automation
    OTHER = "other"                  # anything that fits no bucket above


@dataclass
class PainPoint:
    """A specific pain point mentioned in discovery.

    Attributes:
        description: Clear description of the problem.
        category: Bucketed PainPointCategory for the problem.
        severity: 1-5, with 5 being most severe.
        quote: Direct quote from the transcript, if captured.
    """
    description: str
    category: PainPointCategory
    severity: int  # 1-5, with 5 being most severe
    quote: Optional[str] = None  # Direct quote from transcript

    def to_dict(self) -> Dict:
        """Serialize to a JSON-friendly dict (enum flattened to its value)."""
        return {
            "description": self.description,
            "category": self.category.value,
            "severity": self.severity,
            "quote": self.quote
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'PainPoint':
        """Deserialize from a dict, tolerating imperfect data.

        Unknown or null categories fall back to OTHER and severity is
        clamped to 1-5, mirroring the leniency of the extraction parser
        (_parse_extraction) so DB round-trips never raise on bad values.
        """
        try:
            category = PainPointCategory(data.get("category") or "other")
        except ValueError:
            category = PainPointCategory.OTHER
        return cls(
            description=data.get("description", ""),
            category=category,
            severity=min(5, max(1, data.get("severity") or 1)),
            quote=data.get("quote")
        )


@dataclass
class BudgetSignal:
    """A budget or pricing signal detected in discovery."""
    signal_type: str  # "explicit_budget", "roi_focus", "price_sensitive", "value_focused"
    description: str
    amount: Optional[float] = None  # If specific amount mentioned
    quote: Optional[str] = None

    def to_dict(self) -> Dict:
        """Serialize to a JSON-friendly dict."""
        return {key: getattr(self, key)
                for key in ("signal_type", "description", "amount", "quote")}

    @classmethod
    def from_dict(cls, data: Dict) -> 'BudgetSignal':
        """Build a BudgetSignal from a dict, supplying safe defaults."""
        return cls(
            signal_type=data.get("signal_type", "unknown"),
            description=data.get("description", ""),
            amount=data.get("amount"),
            quote=data.get("quote")
        )


@dataclass
class CurrentTool:
    """A tool or system currently used by the prospect."""
    name: str
    category: str  # CRM, scheduling, accounting, etc.
    satisfaction: Optional[str] = None  # satisfied, neutral, dissatisfied
    notes: Optional[str] = None

    def to_dict(self) -> Dict:
        """Serialize to a JSON-friendly dict."""
        return {attr: getattr(self, attr)
                for attr in ("name", "category", "satisfaction", "notes")}

    @classmethod
    def from_dict(cls, data: Dict) -> 'CurrentTool':
        """Build a CurrentTool from a dict, supplying safe defaults."""
        return cls(
            name=data.get("name", ""),
            category=data.get("category", "other"),
            satisfaction=data.get("satisfaction"),
            notes=data.get("notes")
        )


@dataclass
class DiscoveryIntelligence:
    """
    Complete intelligence extracted from a discovery call.

    This is the main data class containing all extracted business intelligence
    ready for storage in PostgreSQL.
    """
    # Core identifiers
    id: Optional[int] = None
    transcript_id: Optional[int] = None

    # Business metadata
    business_name: Optional[str] = None
    business_type: Optional[str] = None  # e.g., "plumbing", "electrical", "HVAC"
    location: Optional[str] = None  # City, state, or region
    business_size: Optional[str] = None  # "solo", "small" (2-10), "medium" (11-50), "large" (50+)
    employee_count: Optional[int] = None
    annual_revenue_estimate: Optional[str] = None

    # Pain points (detailed)
    pain_points: List[PainPoint] = field(default_factory=list)

    # Current technology stack
    current_tools: List[CurrentTool] = field(default_factory=list)

    # Budget signals
    budget_signals: List[BudgetSignal] = field(default_factory=list)

    # Lead scoring
    lead_score: LeadScore = LeadScore.COLD
    lead_score_reasons: List[str] = field(default_factory=list)

    # Summary and metadata
    summary: Optional[str] = None
    key_quotes: List[str] = field(default_factory=list)
    next_steps: List[str] = field(default_factory=list)
    urgency_level: int = 1  # 1-5
    decision_timeline: Optional[str] = None

    # Extraction metadata
    extracted_at: Optional[datetime] = None
    extraction_confidence: float = 0.0
    raw_extraction: Optional[Dict] = None

    def to_dict(self) -> Dict:
        """Convert to dictionary for JSON storage."""
        # Plain scalar attributes copy straight through.
        scalar_names = (
            "id", "transcript_id", "business_name", "business_type",
            "location", "business_size", "employee_count",
            "annual_revenue_estimate",
        )
        payload = {name: getattr(self, name) for name in scalar_names}
        # Nested objects and the enum need explicit serialization.
        payload.update({
            "pain_points": [pp.to_dict() for pp in self.pain_points],
            "current_tools": [ct.to_dict() for ct in self.current_tools],
            "budget_signals": [bs.to_dict() for bs in self.budget_signals],
            "lead_score": self.lead_score.value,
            "lead_score_reasons": self.lead_score_reasons,
            "summary": self.summary,
            "key_quotes": self.key_quotes,
            "next_steps": self.next_steps,
            "urgency_level": self.urgency_level,
            "decision_timeline": self.decision_timeline,
            "extracted_at": self.extracted_at.isoformat() if self.extracted_at else None,
            "extraction_confidence": self.extraction_confidence,
            "raw_extraction": self.raw_extraction,
        })
        return payload

    @classmethod
    def from_dict(cls, data: Dict) -> 'DiscoveryIntelligence':
        """Create from dictionary."""
        raw_score = data.get("lead_score", "cold")
        score = LeadScore(raw_score) if raw_score else LeadScore.COLD

        when = data.get("extracted_at")
        if isinstance(when, str):
            when = datetime.fromisoformat(when)

        return cls(
            id=data.get("id"),
            transcript_id=data.get("transcript_id"),
            business_name=data.get("business_name"),
            business_type=data.get("business_type"),
            location=data.get("location"),
            business_size=data.get("business_size"),
            employee_count=data.get("employee_count"),
            annual_revenue_estimate=data.get("annual_revenue_estimate"),
            pain_points=[PainPoint.from_dict(d) for d in data.get("pain_points", [])],
            current_tools=[CurrentTool.from_dict(d) for d in data.get("current_tools", [])],
            budget_signals=[BudgetSignal.from_dict(d) for d in data.get("budget_signals", [])],
            lead_score=score,
            lead_score_reasons=data.get("lead_score_reasons", []),
            summary=data.get("summary"),
            key_quotes=data.get("key_quotes", []),
            next_steps=data.get("next_steps", []),
            urgency_level=data.get("urgency_level", 1),
            decision_timeline=data.get("decision_timeline"),
            extracted_at=when,
            extraction_confidence=data.get("extraction_confidence", 0.0),
            raw_extraction=data.get("raw_extraction")
        )


# =============================================================================
# DATABASE SCHEMA
# =============================================================================

# Idempotent DDL (every statement uses IF NOT EXISTS) executed once by
# IntelligenceExtractor._init_database at startup.
# NOTE(review): the transcript_id FK assumes discovery_transcripts already
# exists in this database -- confirm schema creation order.
INTELLIGENCE_SCHEMA_SQL = """
-- Discovery Intelligence Table
-- Stores extracted business intelligence from discovery call transcripts
CREATE TABLE IF NOT EXISTS discovery_intelligence (
    id SERIAL PRIMARY KEY,
    transcript_id INTEGER REFERENCES discovery_transcripts(transcript_id) ON DELETE CASCADE,

    -- Business metadata
    business_name TEXT,
    business_type TEXT,
    location TEXT,
    business_size TEXT,
    employee_count INTEGER,
    annual_revenue_estimate TEXT,

    -- Extracted data (JSONB for flexibility)
    pain_points JSONB DEFAULT '[]',
    current_tools JSONB DEFAULT '[]',
    budget_signals JSONB DEFAULT '[]',

    -- Lead scoring
    lead_score TEXT DEFAULT 'cold',  -- hot/warm/cold
    lead_score_reasons JSONB DEFAULT '[]',

    -- Summary and metadata
    summary TEXT,
    key_quotes JSONB DEFAULT '[]',
    next_steps JSONB DEFAULT '[]',
    urgency_level INTEGER DEFAULT 1,
    decision_timeline TEXT,

    -- Extraction metadata
    extraction_confidence FLOAT DEFAULT 0.0,
    raw_extraction JSONB,
    extracted_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),

    -- Ensure one intelligence record per transcript
    CONSTRAINT unique_transcript_intel UNIQUE (transcript_id)
);

-- Indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_intel_lead_score ON discovery_intelligence(lead_score);
CREATE INDEX IF NOT EXISTS idx_intel_business_type ON discovery_intelligence(business_type);
CREATE INDEX IF NOT EXISTS idx_intel_location ON discovery_intelligence(location);
CREATE INDEX IF NOT EXISTS idx_intel_urgency ON discovery_intelligence(urgency_level DESC);
CREATE INDEX IF NOT EXISTS idx_intel_extracted ON discovery_intelligence(extracted_at DESC);
CREATE INDEX IF NOT EXISTS idx_intel_pain_points ON discovery_intelligence USING GIN (pain_points);
CREATE INDEX IF NOT EXISTS idx_intel_budget_signals ON discovery_intelligence USING GIN (budget_signals);
"""


# =============================================================================
# EXTRACTION PROMPTS
# =============================================================================

# First-pass Gemini prompt; filled via .format(transcript=...) in
# IntelligenceExtractor._extract_with_gemini.  Literal JSON braces are
# escaped as {{ }} so str.format leaves them intact.
INTELLIGENCE_EXTRACTION_PROMPT = """You are an expert business analyst extracting intelligence from a discovery call transcript for AgileAdapt, an AI Voice Agent company targeting Australian tradies and service businesses.

TRANSCRIPT:
{transcript}

Analyze this transcript and extract detailed business intelligence. Be thorough and accurate.
Extract ONLY information that is explicitly stated or strongly implied in the transcript.

Return a JSON object with this exact structure:

{{
    "business_name": "Name of the business (string or null if not mentioned)",
    "business_type": "Type of trade/service (e.g., 'plumbing', 'electrical', 'HVAC', 'building', 'landscaping', null if unknown)",
    "location": "City, suburb, or region mentioned (string or null)",
    "business_size": "One of: 'solo', 'small' (2-10 employees), 'medium' (11-50), 'large' (50+), or null",
    "employee_count": number or null,
    "annual_revenue_estimate": "Revenue range if mentioned (string or null)",

    "pain_points": [
        {{
            "description": "Clear description of the pain point",
            "category": "One of: efficiency, cost, growth, quality, integration, communication, staffing, technology, other",
            "severity": 1-5 (5 being most severe based on how much emphasis was placed on it),
            "quote": "Direct quote from transcript if available, or null"
        }}
    ],

    "current_tools": [
        {{
            "name": "Name of the tool/software/system",
            "category": "Type of tool (CRM, scheduling, accounting, phone system, etc.)",
            "satisfaction": "One of: satisfied, neutral, dissatisfied, or null",
            "notes": "Any relevant notes about their use of this tool"
        }}
    ],

    "budget_signals": [
        {{
            "signal_type": "One of: explicit_budget, roi_focus, price_sensitive, value_focused, comparison_shopping, investment_ready",
            "description": "What was said or implied about budget",
            "amount": number or null (if specific amount mentioned),
            "quote": "Direct quote if available"
        }}
    ],

    "urgency_level": 1-5 (1=no urgency, 5=extremely urgent, based on timeline and language used),
    "decision_timeline": "When they need/want to make a decision (string or null)",

    "key_quotes": [
        "Important direct quotes that reveal buying intent, pain, or requirements"
    ],

    "next_steps": [
        "Any agreed or suggested next steps from the call"
    ],

    "summary": "2-3 sentence summary of this prospect's situation and potential fit",

    "extraction_confidence": 0.0-1.0 (how confident you are in the overall extraction quality)
}}

IMPORTANT GUIDELINES:
1. Pain points should be SPECIFIC problems, not generic statements
2. For budget signals, listen for: pricing discussions, ROI questions, competitor pricing mentions, willingness to invest
3. Urgency indicators: "ASAP", "yesterday", specific dates, busy season mentions, lost jobs
4. Business size clues: number of vehicles, job volume per week, employee mentions
5. Be conservative with confidence - lower it if the transcript is unclear or short

Return ONLY valid JSON, no other text."""


# Second-pass (LLM-based) lead-scoring prompt.
# NOTE(review): not referenced anywhere in the visible portion of this file --
# _score_lead uses a deterministic point heuristic instead.  Confirm whether
# this prompt is used elsewhere or is dead code.
LEAD_SCORING_PROMPT = """Based on this extracted intelligence, provide a lead score analysis.

INTELLIGENCE:
{intelligence_json}

Score this lead as HOT, WARM, or COLD based on these criteria:

HOT Lead Criteria (needs 3+ of these):
- Explicit budget mentioned or confirmed
- Urgent timeline (within 30 days)
- Decision maker on the call
- Multiple high-severity pain points
- Dissatisfied with current solution
- Asked specific implementation questions
- Requested pricing or proposal

WARM Lead Criteria:
- Some pain points identified
- General interest expressed
- Timeline within 90 days
- At least one budget signal
- Fits target customer profile (Australian tradie/service business)

COLD Lead Criteria:
- No clear pain points
- No budget signals
- No timeline
- Just gathering information
- Poor fit for AI voice agents

Return a JSON object:
{{
    "lead_score": "hot", "warm", or "cold",
    "reasons": [
        "List specific reasons for this score based on the extracted data"
    ],
    "confidence": 0.0-1.0
}}

Return ONLY valid JSON."""


# =============================================================================
# INTELLIGENCE EXTRACTOR CLASS
# =============================================================================

class IntelligenceExtractor:
    """
    Extracts and scores business intelligence from discovery transcripts.

    Uses Gemini API for intelligent extraction and stores results in PostgreSQL.

    Attributes:
        db_pool: PostgreSQL connection pool
        gemini_model: Gemini model name to use
    """

    DEFAULT_MODEL = "gemini-2.0-flash"

    def __init__(
        self,
        gemini_api_key: Optional[str] = None,
        gemini_model: str = DEFAULT_MODEL,
        init_schema: bool = True
    ):
        """
        Initialize the intelligence extractor.

        Configures the Gemini SDK (when available) and opens the PostgreSQL
        connection pool.

        Args:
            gemini_api_key: Gemini API key (falls back to env/file)
            gemini_model: Gemini model to use for extraction
            init_schema: Whether to initialize the database schema
        """
        self.gemini_model = gemini_model
        self.api_key = gemini_api_key or self._load_gemini_key()

        # Configure the SDK only when both the library and a key are present;
        # otherwise extraction calls will raise at use time.
        if not (GENAI_AVAILABLE and self.api_key):
            logger.warning("Gemini API not available - extraction will fail")
        else:
            genai.configure(api_key=self.api_key)
            logger.info(f"Gemini configured with model: {gemini_model}")

        self.db_pool = self._init_database(init_schema)
        logger.info("IntelligenceExtractor initialized")

    def _load_gemini_key(self) -> Optional[str]:
        """Load Gemini API key from environment or the credentials file.

        Checks GEMINI_API_KEY then GOOGLE_API_KEY; falls back to the
        AgileAdapt credentials file, which may contain either a bare key
        or a KEY=value line.  Returns None when nothing is found.
        """
        for env_var in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
            env_value = os.environ.get(env_var)
            if env_value:
                return env_value

        key_file = Path("/mnt/e/genesis-system/Credentials/GoogleAIStudio-Gemini-AgileAdapt-API-KEY.txt")
        if not key_file.exists():
            return None

        content = key_file.read_text().strip()
        if "=" in content:
            # "SOMETHING=<key>" format: take everything after the first '='.
            return content.split("=", 1)[1].strip()
        return content

    def _init_database(self, init_schema: bool = True) -> pool.ThreadedConnectionPool:
        """Create the PostgreSQL connection pool and optionally apply the schema.

        Args:
            init_schema: When True, execute INTELLIGENCE_SCHEMA_SQL (idempotent DDL).

        Returns:
            A ThreadedConnectionPool holding 2-10 connections.

        Raises:
            Exception: Any pool-creation or DDL failure is logged and re-raised.
        """
        try:
            db_pool = pool.ThreadedConnectionPool(
                minconn=2,
                maxconn=10,
                **PostgresConfig.get_connection_params()
            )
            if init_schema:
                conn = db_pool.getconn()
                try:
                    with conn.cursor() as cur:
                        cur.execute(INTELLIGENCE_SCHEMA_SQL)
                    conn.commit()
                    logger.info("Intelligence schema initialized")
                finally:
                    # Always return the connection to the pool.
                    db_pool.putconn(conn)
            return db_pool
        except Exception as e:
            logger.error(f"Database initialization failed: {e}")
            raise

    # =========================================================================
    # EXTRACTION METHODS
    # =========================================================================

    def extract_from_transcript(
        self,
        transcript_id: int,
        transcript_text: Optional[str] = None
    ) -> DiscoveryIntelligence:
        """
        Run the full pipeline for one transcript: fetch, extract, score, store.

        Args:
            transcript_id: ID of the transcript in discovery_transcripts table
            transcript_text: Optional transcript text (fetched from DB if not provided)

        Returns:
            DiscoveryIntelligence with extracted data and lead score

        Raises:
            ValueError: If no text was supplied and the transcript row is missing.
        """
        text = transcript_text
        if text is None:
            text = self._fetch_transcript(transcript_id)
            if text is None:
                raise ValueError(f"Transcript {transcript_id} not found")

        raw = self._extract_with_gemini(text)
        intel = self._parse_extraction(raw, transcript_id)
        intel = self._score_lead(intel)

        # Persist and record the new database row id on the returned object.
        intel.id = self._store_intelligence(intel)
        return intel

    def extract_from_text(
        self,
        transcript_text: str,
        transcript_id: Optional[int] = None
    ) -> DiscoveryIntelligence:
        """
        Extract and score intelligence from raw transcript text.

        Unlike extract_from_transcript, the result is NOT stored in the
        database -- callers persist it themselves if needed.

        Args:
            transcript_text: The transcript text to analyze
            transcript_id: Optional transcript ID for linking

        Returns:
            DiscoveryIntelligence with extracted data and lead score
        """
        raw = self._extract_with_gemini(transcript_text)
        intel = self._parse_extraction(raw, transcript_id)
        return self._score_lead(intel)

    def _fetch_transcript(self, transcript_id: int) -> Optional[str]:
        """Return the transcript text for transcript_id, or None if absent."""
        query = "SELECT transcript_text FROM discovery_transcripts WHERE transcript_id = %s"
        conn = self.db_pool.getconn()
        try:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute(query, (transcript_id,))
                row = cur.fetchone()
        finally:
            # Return the connection to the pool even if the query fails.
            self.db_pool.putconn(conn)
        if not row:
            return None
        return row['transcript_text']

    def _extract_with_gemini(self, transcript_text: str) -> Dict:
        """
        Use Gemini to extract intelligence from transcript.

        Args:
            transcript_text: The transcript to analyze

        Returns:
            Dictionary with extracted data, or {} on any API or JSON-parse
            failure (callers receive an empty extraction, not an exception).

        Raises:
            RuntimeError: If the Gemini SDK or API key is unavailable.
        """
        if not GENAI_AVAILABLE or not self.api_key:
            raise RuntimeError("Gemini API not available")

        # Build prompt
        prompt = INTELLIGENCE_EXTRACTION_PROMPT.format(transcript=transcript_text)

        response_text = ""  # ensure defined for the JSONDecodeError log below
        try:
            model = genai.GenerativeModel(self.gemini_model)
            response = model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(
                    temperature=0.2,  # Lower temperature for more consistent extraction
                    max_output_tokens=4096
                )
            )

            # Parse JSON response
            response_text = response.text.strip()

            # Strip an optional markdown code fence (```json ... ```).
            if response_text.startswith("```"):
                lines = response_text.split("\n")
                lines = lines[1:]  # drop the opening fence line (e.g. ```json)
                # Only drop the last line when it really is the closing fence;
                # the previous unconditional lines[1:-1] chopped valid JSON
                # whenever the model omitted the trailing ```.
                if lines and lines[-1].strip() == "```":
                    lines = lines[:-1]
                response_text = "\n".join(lines)

            return json.loads(response_text)

        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse Gemini response as JSON: {e}")
            logger.debug(f"Response was: {response_text[:500]}")
            return {}
        except Exception as e:
            logger.error(f"Gemini extraction failed: {e}")
            return {}

    def _parse_extraction(
        self,
        raw_extraction: Dict,
        transcript_id: Optional[int]
    ) -> DiscoveryIntelligence:
        """Parse raw Gemini extraction into DiscoveryIntelligence.

        Defensive against imperfect model output: individual pain points /
        tools / signals that fail to parse are logged and skipped, and JSON
        nulls in list or numeric fields are coerced to safe defaults.  (A
        null "urgency_level" previously raised an uncaught TypeError inside
        min/max, and a null list field would have crashed iteration.)

        Args:
            raw_extraction: Dict parsed from the Gemini JSON response.
            transcript_id: Transcript row this extraction belongs to, if any.

        Returns:
            A populated (not yet scored or stored) DiscoveryIntelligence.
        """

        # Parse pain points
        pain_points = []
        for pp_data in raw_extraction.get("pain_points") or []:
            try:
                # Coalesce a null category before .lower(); unknown values
                # fall back to OTHER.
                category_str = (pp_data.get("category") or "other").lower()
                try:
                    category = PainPointCategory(category_str)
                except ValueError:
                    category = PainPointCategory.OTHER

                pain_points.append(PainPoint(
                    description=pp_data.get("description", ""),
                    category=category,
                    severity=min(5, max(1, pp_data.get("severity") or 1)),
                    quote=pp_data.get("quote")
                ))
            except Exception as e:
                logger.warning(f"Failed to parse pain point: {e}")

        # Parse current tools
        current_tools = []
        for ct_data in raw_extraction.get("current_tools") or []:
            try:
                current_tools.append(CurrentTool(
                    name=ct_data.get("name", ""),
                    category=ct_data.get("category", "other"),
                    satisfaction=ct_data.get("satisfaction"),
                    notes=ct_data.get("notes")
                ))
            except Exception as e:
                logger.warning(f"Failed to parse current tool: {e}")

        # Parse budget signals
        budget_signals = []
        for bs_data in raw_extraction.get("budget_signals") or []:
            try:
                budget_signals.append(BudgetSignal(
                    signal_type=bs_data.get("signal_type", "unknown"),
                    description=bs_data.get("description", ""),
                    amount=bs_data.get("amount"),
                    quote=bs_data.get("quote")
                ))
            except Exception as e:
                logger.warning(f"Failed to parse budget signal: {e}")

        return DiscoveryIntelligence(
            transcript_id=transcript_id,
            business_name=raw_extraction.get("business_name"),
            business_type=raw_extraction.get("business_type"),
            location=raw_extraction.get("location"),
            business_size=raw_extraction.get("business_size"),
            employee_count=raw_extraction.get("employee_count"),
            annual_revenue_estimate=raw_extraction.get("annual_revenue_estimate"),
            pain_points=pain_points,
            current_tools=current_tools,
            budget_signals=budget_signals,
            summary=raw_extraction.get("summary"),
            key_quotes=raw_extraction.get("key_quotes") or [],
            next_steps=raw_extraction.get("next_steps") or [],
            urgency_level=min(5, max(1, raw_extraction.get("urgency_level") or 1)),
            decision_timeline=raw_extraction.get("decision_timeline"),
            extracted_at=datetime.now(timezone.utc),
            extraction_confidence=raw_extraction.get("extraction_confidence") or 0.0,
            raw_extraction=raw_extraction
        )

    # =========================================================================
    # LEAD SCORING
    # =========================================================================

    def _score_lead(self, intel: DiscoveryIntelligence) -> DiscoveryIntelligence:
        """
        Score a lead based on extracted intelligence.

        Uses a weighted scoring algorithm considering:
        - Pain point severity and count
        - Budget signals
        - Urgency level
        - Business fit (type, size, location)
        - Tool dissatisfaction

        Thresholds: 60+ points => HOT, 30-59 => WARM, below 30 => COLD.

        Args:
            intel: DiscoveryIntelligence to score

        Returns:
            Updated DiscoveryIntelligence with lead score
        """
        score_points = 0
        reasons = []

        # 1. Pain Points Analysis (max 30 points, plus a 10-point alignment bonus)
        if intel.pain_points:
            pain_point_count = len(intel.pain_points)
            avg_severity = sum(pp.severity for pp in intel.pain_points) / pain_point_count

            # Tiers are mutually exclusive; the strongest matching tier wins.
            if pain_point_count >= 3 and avg_severity >= 3.5:
                score_points += 30
                reasons.append(f"Multiple high-severity pain points ({pain_point_count} pain points, avg severity {avg_severity:.1f})")
            elif pain_point_count >= 2:
                score_points += 20
                reasons.append(f"Multiple pain points identified ({pain_point_count})")
            elif pain_point_count >= 1:
                score_points += 10
                reasons.append("At least one pain point identified")

            # Check for communication/efficiency pain points (key for AI voice agents)
            key_categories = {PainPointCategory.COMMUNICATION, PainPointCategory.EFFICIENCY, PainPointCategory.STAFFING}
            key_pain_points = [pp for pp in intel.pain_points if pp.category in key_categories]
            if key_pain_points:
                score_points += 10
                reasons.append(f"Pain points align with AI voice agent solution ({len(key_pain_points)} relevant)")

        # 2. Budget Signals Analysis (max 25 points; strongest signal type wins)
        if intel.budget_signals:
            has_explicit_budget = any(
                bs.signal_type in ["explicit_budget", "investment_ready"]
                for bs in intel.budget_signals
            )
            has_roi_focus = any(
                bs.signal_type == "roi_focus"
                for bs in intel.budget_signals
            )

            if has_explicit_budget:
                score_points += 25
                reasons.append("Explicit budget mentioned or investment ready")
            elif has_roi_focus:
                score_points += 15
                reasons.append("ROI-focused discussion indicates budget consideration")
            elif intel.budget_signals:
                score_points += 8
                reasons.append("Some budget signals detected")

        # 3. Urgency Analysis (max 20 points, plus a 10-point timeline bonus below)
        if intel.urgency_level >= 4:
            score_points += 20
            reasons.append(f"High urgency level ({intel.urgency_level}/5)")
        elif intel.urgency_level >= 3:
            score_points += 12
            reasons.append(f"Moderate urgency level ({intel.urgency_level}/5)")
        elif intel.urgency_level >= 2:
            score_points += 5
            reasons.append("Some urgency indicated")

        # Timeline bonus (keyword match on the free-text decision timeline)
        if intel.decision_timeline:
            timeline_lower = intel.decision_timeline.lower()
            if any(word in timeline_lower for word in ["week", "asap", "immediately", "urgent"]):
                score_points += 10
                reasons.append(f"Short-term timeline: {intel.decision_timeline}")
            elif any(word in timeline_lower for word in ["month", "soon", "quarter"]):
                score_points += 5
                reasons.append(f"Medium-term timeline: {intel.decision_timeline}")

        # 4. Business Fit Analysis (type 10 + location 5 + size 5 points)
        if intel.business_type:
            target_types = {"plumbing", "electrical", "hvac", "building", "construction",
                          "landscaping", "cleaning", "pest control", "locksmith", "roofing"}
            # Substring match so e.g. "commercial plumbing" still qualifies.
            if any(t in intel.business_type.lower() for t in target_types):
                score_points += 10
                reasons.append(f"Target business type: {intel.business_type}")

        if intel.location:
            if "australia" in intel.location.lower() or any(
                city in intel.location.lower()
                for city in ["sydney", "melbourne", "brisbane", "perth", "adelaide", "cairns"]
            ):
                score_points += 5
                reasons.append(f"Australian location: {intel.location}")

        if intel.business_size in ["small", "medium"]:
            score_points += 5
            reasons.append(f"Ideal business size: {intel.business_size}")

        # 5. Tool Dissatisfaction (max 10 points)
        if intel.current_tools:
            dissatisfied_tools = [
                ct for ct in intel.current_tools
                if ct.satisfaction == "dissatisfied"
            ]
            if dissatisfied_tools:
                score_points += 10
                reasons.append(f"Dissatisfied with {len(dissatisfied_tools)} current tool(s)")

        # Determine lead score based on points
        if score_points >= 60:
            intel.lead_score = LeadScore.HOT
        elif score_points >= 30:
            intel.lead_score = LeadScore.WARM
        else:
            intel.lead_score = LeadScore.COLD

        # Add summary reason
        reasons.insert(0, f"Total score: {score_points} points")
        intel.lead_score_reasons = reasons

        logger.info(f"Lead scored as {intel.lead_score.value} ({score_points} points)")

        return intel

    def score_lead(self, intel: DiscoveryIntelligence) -> DiscoveryIntelligence:
        """
        Score (or re-score) a lead.

        Thin public entry point over the private scoring algorithm so
        callers outside the class never touch a private method.

        Args:
            intel: DiscoveryIntelligence to score

        Returns:
            The DiscoveryIntelligence with lead score fields populated
        """
        scored = self._score_lead(intel)
        return scored

    # =========================================================================
    # STORAGE METHODS
    # =========================================================================

    def _store_intelligence(self, intel: DiscoveryIntelligence) -> int:
        """
        Persist extracted intelligence to PostgreSQL.

        Upserts on transcript_id: a repeat extraction for the same transcript
        overwrites every column and refreshes updated_at.

        Args:
            intel: DiscoveryIntelligence to store

        Returns:
            The id of the stored record

        Raises:
            Exception: re-raised after rollback when the write fails
        """
        upsert_sql = """
                    INSERT INTO discovery_intelligence (
                        transcript_id, business_name, business_type, location,
                        business_size, employee_count, annual_revenue_estimate,
                        pain_points, current_tools, budget_signals,
                        lead_score, lead_score_reasons, summary, key_quotes,
                        next_steps, urgency_level, decision_timeline,
                        extraction_confidence, raw_extraction, extracted_at
                    ) VALUES (
                        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
                    )
                    ON CONFLICT (transcript_id) DO UPDATE SET
                        business_name = EXCLUDED.business_name,
                        business_type = EXCLUDED.business_type,
                        location = EXCLUDED.location,
                        business_size = EXCLUDED.business_size,
                        employee_count = EXCLUDED.employee_count,
                        annual_revenue_estimate = EXCLUDED.annual_revenue_estimate,
                        pain_points = EXCLUDED.pain_points,
                        current_tools = EXCLUDED.current_tools,
                        budget_signals = EXCLUDED.budget_signals,
                        lead_score = EXCLUDED.lead_score,
                        lead_score_reasons = EXCLUDED.lead_score_reasons,
                        summary = EXCLUDED.summary,
                        key_quotes = EXCLUDED.key_quotes,
                        next_steps = EXCLUDED.next_steps,
                        urgency_level = EXCLUDED.urgency_level,
                        decision_timeline = EXCLUDED.decision_timeline,
                        extraction_confidence = EXCLUDED.extraction_confidence,
                        raw_extraction = EXCLUDED.raw_extraction,
                        updated_at = NOW()
                    RETURNING id
                """

        # Nested dataclass lists are serialized to JSON strings for storage.
        params = (
            intel.transcript_id,
            intel.business_name,
            intel.business_type,
            intel.location,
            intel.business_size,
            intel.employee_count,
            intel.annual_revenue_estimate,
            json.dumps([pp.to_dict() for pp in intel.pain_points]),
            json.dumps([ct.to_dict() for ct in intel.current_tools]),
            json.dumps([bs.to_dict() for bs in intel.budget_signals]),
            intel.lead_score.value,
            json.dumps(intel.lead_score_reasons),
            intel.summary,
            json.dumps(intel.key_quotes),
            json.dumps(intel.next_steps),
            intel.urgency_level,
            intel.decision_timeline,
            intel.extraction_confidence,
            json.dumps(intel.raw_extraction) if intel.raw_extraction else None,
            intel.extracted_at,
        )

        connection = self.db_pool.getconn()
        try:
            with connection.cursor() as cursor:
                cursor.execute(upsert_sql, params)
                inserted = cursor.fetchone()
                connection.commit()
            record_id = inserted[0]
            logger.info(f"Stored intelligence record {record_id}")
            return record_id
        except Exception as exc:
            connection.rollback()
            logger.error(f"Failed to store intelligence: {exc}")
            raise
        finally:
            self.db_pool.putconn(connection)

    def get_intelligence(self, transcript_id: int) -> Optional[DiscoveryIntelligence]:
        """
        Retrieve stored intelligence for a transcript.

        Args:
            transcript_id: ID of the transcript

        Returns:
            DiscoveryIntelligence, or None when no record exists
        """
        connection = self.db_pool.getconn()
        try:
            with connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.execute("""
                    SELECT * FROM discovery_intelligence
                    WHERE transcript_id = %s
                """, (transcript_id,))

                record = cursor.fetchone()
                if record is None:
                    return None

                # Scalar columns pass through unchanged.
                payload = {key: record[key] for key in (
                    "id", "transcript_id", "business_name", "business_type",
                    "location", "business_size", "employee_count",
                    "annual_revenue_estimate", "lead_score", "summary",
                    "urgency_level", "decision_timeline",
                    "extraction_confidence", "raw_extraction",
                )}
                # JSON-list columns fall back to empty lists when NULL.
                for key in ("pain_points", "current_tools", "budget_signals",
                            "lead_score_reasons", "key_quotes", "next_steps"):
                    payload[key] = record[key] or []
                stamp = record["extracted_at"]
                payload["extracted_at"] = stamp.isoformat() if stamp else None

                return DiscoveryIntelligence.from_dict(payload)

        finally:
            self.db_pool.putconn(connection)

    def get_hot_leads(self, limit: int = 20) -> List[DiscoveryIntelligence]:
        """Return up to ``limit`` hot leads, most urgent first."""
        return self._get_leads_by_score(score="hot", limit=limit)

    def get_warm_leads(self, limit: int = 50) -> List[DiscoveryIntelligence]:
        """Return up to ``limit`` warm leads, most urgent first."""
        return self._get_leads_by_score(score="warm", limit=limit)

    def _get_leads_by_score(self, score: str, limit: int) -> List[DiscoveryIntelligence]:
        """Fetch intelligence records with the given lead score.

        Ordered by urgency level (desc), then extraction time (desc).
        """

        def row_to_intel(record) -> DiscoveryIntelligence:
            # Scalar columns pass through unchanged.
            payload = {key: record[key] for key in (
                "id", "transcript_id", "business_name", "business_type",
                "location", "business_size", "employee_count",
                "annual_revenue_estimate", "lead_score", "summary",
                "urgency_level", "decision_timeline",
                "extraction_confidence", "raw_extraction",
            )}
            # JSON-list columns fall back to empty lists when NULL.
            for key in ("pain_points", "current_tools", "budget_signals",
                        "lead_score_reasons", "key_quotes", "next_steps"):
                payload[key] = record[key] or []
            stamp = record["extracted_at"]
            payload["extracted_at"] = stamp.isoformat() if stamp else None
            return DiscoveryIntelligence.from_dict(payload)

        connection = self.db_pool.getconn()
        try:
            with connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.execute("""
                    SELECT * FROM discovery_intelligence
                    WHERE lead_score = %s
                    ORDER BY urgency_level DESC, extracted_at DESC
                    LIMIT %s
                """, (score, limit))

                return [row_to_intel(record) for record in cursor.fetchall()]

        finally:
            self.db_pool.putconn(connection)

    def get_statistics(self) -> Dict[str, Any]:
        """Summarize extraction volume, lead-score mix, average confidence and
        urgency, and the ten most common business types."""
        connection = self.db_pool.getconn()
        try:
            with connection.cursor(cursor_factory=RealDictCursor) as cursor:
                # Single-pass aggregates via FILTER clauses.
                cursor.execute("""
                    SELECT
                        COUNT(*) as total_extractions,
                        COUNT(*) FILTER (WHERE lead_score = 'hot') as hot_leads,
                        COUNT(*) FILTER (WHERE lead_score = 'warm') as warm_leads,
                        COUNT(*) FILTER (WHERE lead_score = 'cold') as cold_leads,
                        AVG(extraction_confidence) as avg_confidence,
                        AVG(urgency_level) as avg_urgency
                    FROM discovery_intelligence
                """)
                totals = cursor.fetchone()

                # Top-10 business types by record count.
                cursor.execute("""
                    SELECT business_type, COUNT(*) as count
                    FROM discovery_intelligence
                    WHERE business_type IS NOT NULL
                    GROUP BY business_type
                    ORDER BY count DESC
                    LIMIT 10
                """)
                distribution = {
                    entry["business_type"]: entry["count"]
                    for entry in cursor.fetchall()
                }

                stats = {key: totals[key] for key in
                         ("total_extractions", "hot_leads", "warm_leads", "cold_leads")}
                # AVG() yields NULL on an empty table; report 0 instead.
                stats["avg_confidence"] = round(totals["avg_confidence"] or 0, 2)
                stats["avg_urgency"] = round(totals["avg_urgency"] or 0, 2)
                stats["business_type_distribution"] = distribution
                return stats

        finally:
            self.db_pool.putconn(connection)

    def close(self):
        """Shut down the connection pool, closing every pooled connection."""
        pool = self.db_pool
        if not pool:
            return
        pool.closeall()
        logger.info("Database connections closed")


# =============================================================================
# STANDALONE FUNCTIONS
# =============================================================================

def calculate_lead_score(intel: DiscoveryIntelligence) -> Tuple[LeadScore, List[str]]:
    """
    Standalone function to calculate lead score without extractor instance.

    Applies a simplified subset of the extractor's scoring rules:
    pain points contribute up to 30 points, budget signals up to 25,
    urgency up to 20. Thresholds: 60+ => HOT, 30+ => WARM, else COLD.

    Args:
        intel: DiscoveryIntelligence to score

    Returns:
        Tuple of (LeadScore, list of reasons)
    """
    # Previously this logic was wrapped in a throwaway inner class that
    # existed only to be instantiated and called once; flattened to plain code.
    score_points = 0
    reasons: List[str] = []

    # 1. Pain points: several high-severity issues score highest.
    if intel.pain_points:
        pain_point_count = len(intel.pain_points)
        avg_severity = sum(pp.severity for pp in intel.pain_points) / pain_point_count

        if pain_point_count >= 3 and avg_severity >= 3.5:
            score_points += 30
            reasons.append(f"Multiple high-severity pain points ({pain_point_count})")
        elif pain_point_count >= 2:
            score_points += 20
        else:  # exactly one pain point (list is non-empty here)
            score_points += 10

    # 2. Budget: explicit commitment signals outweigh generic mentions.
    #    (The original re-checked `intel.budget_signals` in an elif that was
    #    always true inside this branch; collapsed to a plain else.)
    if intel.budget_signals:
        has_explicit = any(
            bs.signal_type in ("explicit_budget", "investment_ready")
            for bs in intel.budget_signals
        )
        score_points += 25 if has_explicit else 8

    # 3. Urgency: only moderate-or-higher levels contribute.
    if intel.urgency_level >= 4:
        score_points += 20
    elif intel.urgency_level >= 3:
        score_points += 12

    if score_points >= 60:
        lead_score = LeadScore.HOT
    elif score_points >= 30:
        lead_score = LeadScore.WARM
    else:
        lead_score = LeadScore.COLD

    reasons.insert(0, f"Total score: {score_points} points")
    return lead_score, reasons


# =============================================================================
# CLI AND TESTING
# =============================================================================

def test_extractor() -> None:
    """Test the intelligence extractor with a sample transcript.

    Integration smoke test: runs extraction, scoring, storage, and the
    statistics query end-to-end, printing each result section. Requires
    the real extractor dependencies (database, and — per the module
    header — the Gemini API; confirm before running in CI). Any failure
    is caught and printed with a traceback rather than re-raised.
    """

    print("=" * 60)
    print("INTELLIGENCE EXTRACTOR TEST")
    print("=" * 60)

    # Sample transcript for testing.
    # Crafted to contain every signal the extractor looks for: business
    # metadata (name/type/location/size), pain points (missed calls, lost
    # revenue), a current tool (ServiceM8), budget discussion, and a
    # short decision timeline.
    sample_transcript = """
    Agent: Hi, thanks for calling AgileAdapt. How can I help you today?

    Caller: Yeah, g'day mate. I'm Dave from Dave's Plumbing in Brisbane.
    Been hearing a lot about these AI receptionists and wondering if it might
    help us out.

    Agent: Great to hear from you Dave! Tell me a bit about your business.
    How many jobs are you handling?

    Caller: We're a team of 6 plumbers, been running for about 12 years now.
    We get maybe 30-40 calls a day, but the problem is we miss heaps of them
    when we're on the tools. My missus used to answer the phones but she's
    gone back to work now. We tried using a call answering service but they
    were charging us $300 a month and half the time they'd get the job details
    wrong.

    Agent: That sounds frustrating. What kind of issues are you seeing from
    missed calls?

    Caller: Mate, I reckon we're losing 5-10 jobs a week easy. That's probably
    $3-4k we're leaving on the table. And the ones that do get through to
    voicemail, half of them don't leave a message. Then I've got to call them
    back after a 10 hour day when I'm knackered.

    Agent: What systems are you using currently for managing jobs and scheduling?

    Caller: We've got ServiceM8 for job management which is alright, nothing
    fancy. Phone's just my mobile really. Been meaning to get a proper system
    but haven't had time.

    Agent: How soon are you looking to get something in place?

    Caller: Look, summer's coming up and that's our busy season. Ideally I'd
    want something running by end of October, so the next few weeks really.
    What would something like this cost?

    Agent: For a business your size, we're typically looking at $199-299 per
    month depending on the features you need.

    Caller: That's actually not bad if it works properly. Less than half what
    we were paying for the answering service. Can you send me some info? My
    email is dave@davesplumbing.com.au
    """

    try:
        # Initialize extractor (will create schema if needed)
        extractor = IntelligenceExtractor(init_schema=True)

        # Run the full pipeline on the raw transcript text.
        print("\n1. Extracting intelligence from sample transcript...")
        intel = extractor.extract_from_text(sample_transcript)

        # Business metadata.
        print(f"\n2. Extraction Results:")
        print(f"   Business Name: {intel.business_name}")
        print(f"   Business Type: {intel.business_type}")
        print(f"   Location: {intel.location}")
        print(f"   Business Size: {intel.business_size}")
        print(f"   Employee Count: {intel.employee_count}")

        # Categorized pain points with severity.
        print(f"\n3. Pain Points ({len(intel.pain_points)}):")
        for i, pp in enumerate(intel.pain_points, 1):
            print(f"   {i}. [{pp.category.value}] {pp.description} (severity: {pp.severity})")

        print(f"\n4. Current Tools ({len(intel.current_tools)}):")
        for ct in intel.current_tools:
            print(f"   - {ct.name} ({ct.category})")

        print(f"\n5. Budget Signals ({len(intel.budget_signals)}):")
        for bs in intel.budget_signals:
            print(f"   - [{bs.signal_type}] {bs.description}")

        # Scoring output (hot/warm/cold plus the contributing reasons).
        print(f"\n6. Lead Score: {intel.lead_score.value.upper()}")
        print(f"   Reasons:")
        for reason in intel.lead_score_reasons:
            print(f"   - {reason}")

        print(f"\n7. Summary: {intel.summary}")
        print(f"\n8. Urgency Level: {intel.urgency_level}/5")
        print(f"   Timeline: {intel.decision_timeline}")
        print(f"   Confidence: {intel.extraction_confidence}")

        # Get statistics
        print("\n9. Database Statistics:")
        stats = extractor.get_statistics()
        for key, value in stats.items():
            print(f"   {key}: {value}")

        extractor.close()
        print("\nTest completed successfully!")

    except Exception as e:
        # Smoke-test style: report the failure but never raise to the caller.
        print(f"\nTest failed: {e}")
        import traceback
        traceback.print_exc()


# Allow running this module directly as a manual smoke test (hits live
# services — see test_extractor's docstring/comments for prerequisites).
if __name__ == "__main__":
    test_extractor()
