"""
Genesis Audio Ingestion Pipeline
================================
Complete pipeline for processing AgileAdapt discovery call recordings.

Features:
- Audio file ingestion (MP3, M4A, WAV)
- Whisper transcription (local or API)
- Gemini-powered entity extraction
- PostgreSQL storage (discovery_transcripts table)
- Qdrant vector embeddings for semantic search

Usage:
    from core.discovery.audio_ingestion import AudioIngestionPipeline

    pipeline = AudioIngestionPipeline()
    result = pipeline.ingest("/path/to/recording.mp3")

    # Search similar calls
    similar = pipeline.search_similar("needs automation for scheduling")

IMPORTANT: NO SQLITE - Uses PostgreSQL (Elestio) only per Genesis rules.
"""

import os
import sys
import json
import hashlib
import tempfile
import uuid
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, asdict, field
import logging

# Add genesis-memory to path for Elestio config
sys.path.insert(0, '/mnt/e/genesis-system/data/genesis-memory')

# Elestio infrastructure imports
# Elestio infrastructure imports
try:
    from elestio_config import PostgresConfig, QdrantConfig
except ImportError:
    # Fallback for testing when genesis-memory is not on sys.path.
    #
    # SECURITY NOTE(review): these fallback classes previously hardcoded live
    # credentials with no way to override them. Environment variables now take
    # precedence; the literals remain only as last-resort defaults for
    # backward compatibility and should be rotated and removed.
    from dataclasses import dataclass as fallback_dataclass

    @fallback_dataclass
    class PostgresConfig:
        # Elestio PostgreSQL connection defaults (env vars win when set).
        host: str = os.environ.get("GENESIS_PG_HOST", "postgresql-genesis-u50607.vm.elestio.app")
        port: int = int(os.environ.get("GENESIS_PG_PORT", "25432"))
        user: str = os.environ.get("GENESIS_PG_USER", "postgres")
        password: str = os.environ.get("GENESIS_PG_PASSWORD", "etY0eog17tD-dDuj--IRH")
        database: str = os.environ.get("GENESIS_PG_DATABASE", "postgres")

        @classmethod
        def get_connection_params(cls) -> dict:
            """Return keyword arguments suitable for psycopg2 connections/pools."""
            config = cls()
            return {
                "host": config.host,
                "port": config.port,
                "user": config.user,
                "password": config.password,
                "database": config.database
            }

    @fallback_dataclass
    class QdrantConfig:
        # Elestio Qdrant endpoint defaults (env vars win when set).
        host: str = os.environ.get("GENESIS_QDRANT_HOST", "qdrant-b3knu-u50607.vm.elestio.app")
        port: int = int(os.environ.get("GENESIS_QDRANT_PORT", "6333"))
        api_key: str = os.environ.get("GENESIS_QDRANT_API_KEY", "7b74e6621bd0e6650789f6662bca4cbf4143d3d1d710a0002b3b563973ca6876")

        @property
        def url(self) -> str:
            """HTTPS base URL of the Qdrant REST endpoint."""
            return f"https://{self.host}:{self.port}"

        @classmethod
        def get_client_params(cls) -> dict:
            """Return keyword arguments for qdrant_client.QdrantClient(...)."""
            config = cls()
            return {"url": config.url, "api_key": config.api_key}

# Database imports
import psycopg2
from psycopg2 import pool
from psycopg2.extras import RealDictCursor

# Vector store imports
try:
    from qdrant_client import QdrantClient
    from qdrant_client.http import models
    from qdrant_client.http.models import (
        Distance, VectorParams, PointStruct,
        Filter, FieldCondition, MatchValue, Range
    )
    QDRANT_AVAILABLE = True
except ImportError:
    QDRANT_AVAILABLE = False

# Whisper imports
try:
    import whisper
    WHISPER_LOCAL_AVAILABLE = True
except ImportError:
    WHISPER_LOCAL_AVAILABLE = False

try:
    import openai
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False

# Gemini for entity extraction
try:
    import google.generativeai as genai
    GENAI_AVAILABLE = True
except ImportError:
    GENAI_AVAILABLE = False

# Embedding generation
try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False


# Configure logging
# NOTE(review): basicConfig at import time configures the root logger for the
# whole process; embedding applications that want their own logging setup
# should configure it before importing this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# =============================================================================
# DATA CLASSES
# =============================================================================

@dataclass
class ExtractedIntel:
    """Structured intelligence pulled out of a discovery-call transcript."""
    business_name: Optional[str] = None
    contact_name: Optional[str] = None
    contact_email: Optional[str] = None
    contact_phone: Optional[str] = None
    industry: Optional[str] = None
    business_size: Optional[str] = None
    pain_points: List[str] = field(default_factory=list)
    current_tools: List[str] = field(default_factory=list)
    budget_signals: List[str] = field(default_factory=list)
    timeline: Optional[str] = None
    decision_makers: List[str] = field(default_factory=list)
    competitors_mentioned: List[str] = field(default_factory=list)
    key_requirements: List[str] = field(default_factory=list)
    objections: List[str] = field(default_factory=list)
    next_steps: List[str] = field(default_factory=list)
    sentiment: Optional[str] = None  # positive, neutral, negative
    buying_stage: Optional[str] = None  # awareness, consideration, decision
    confidence_score: float = 0.0
    raw_extraction: Optional[Dict] = None

    def to_dict(self) -> Dict:
        """Serialize to a plain dict suitable for storage."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict) -> 'ExtractedIntel':
        """Build an instance from *data*, silently dropping unknown keys."""
        known_fields = cls.__dataclass_fields__
        accepted = {name: value for name, value in data.items() if name in known_fields}
        return cls(**accepted)


@dataclass
class DiscoveryTranscript:
    """A discovery call transcript with storage and vector-store metadata."""
    transcript_id: str        # UUID string (PK in discovery_transcripts)
    file_path: str            # original audio file location
    file_hash: str            # SHA-256 of the audio file (dedup key)
    transcript_text: str
    duration_seconds: float
    language: str             # ISO code detected by Whisper (e.g. 'en')
    extracted_intel: Optional[ExtractedIntel] = None
    embedding_id: Optional[str] = None  # Qdrant point id, when embedded
    created_at: Optional[str] = None    # ISO-8601 UTC timestamp
    status: str = "active"
    source: str = "agileadapt"

    def to_dict(self) -> Dict:
        """Convert to a dictionary for storage.

        dataclasses.asdict already recurses into nested dataclasses, so
        extracted_intel comes back as a plain dict; the previous extra
        manual re-conversion of extracted_intel was redundant.
        """
        return asdict(self)


@dataclass
class IngestResult:
    """Result of audio ingestion (returned by AudioIngestionPipeline.ingest)."""
    success: bool                                       # overall pipeline outcome
    transcript_id: Optional[str] = None                 # DB id (set on success and on duplicate hits)
    transcript: Optional[DiscoveryTranscript] = None    # full record (None for duplicate short-circuit)
    intel: Optional[ExtractedIntel] = None              # Gemini-extracted intelligence
    error: Optional[str] = None                         # failure reason, or informational duplicate notice
    processing_time: float = 0.0                        # wall-clock seconds for the whole pipeline
    steps_completed: List[str] = field(default_factory=list)  # names of pipeline steps that finished


# =============================================================================
# DATABASE SCHEMA
# =============================================================================

# Idempotent DDL executed by _init_database() at pipeline start-up. Every
# statement uses IF NOT EXISTS, so re-running is safe. The string below is
# sent to PostgreSQL verbatim -- keep schema changes additive.
SCHEMA_SQL = """
-- Discovery Transcripts Table
CREATE TABLE IF NOT EXISTS discovery_transcripts (
    transcript_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    file_path TEXT NOT NULL,
    file_hash VARCHAR(64) UNIQUE NOT NULL,
    transcript_text TEXT NOT NULL,
    duration_seconds FLOAT DEFAULT 0,
    language VARCHAR(10) DEFAULT 'en',
    embedding_id UUID,
    status VARCHAR(20) DEFAULT 'active',
    source VARCHAR(50) DEFAULT 'agileadapt',
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Extracted Intel Table (normalized)
CREATE TABLE IF NOT EXISTS discovery_intel (
    intel_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    transcript_id UUID REFERENCES discovery_transcripts(transcript_id) ON DELETE CASCADE,
    business_name TEXT,
    contact_name TEXT,
    contact_email TEXT,
    contact_phone TEXT,
    industry TEXT,
    business_size TEXT,
    pain_points JSONB DEFAULT '[]',
    current_tools JSONB DEFAULT '[]',
    budget_signals JSONB DEFAULT '[]',
    timeline TEXT,
    decision_makers JSONB DEFAULT '[]',
    competitors_mentioned JSONB DEFAULT '[]',
    key_requirements JSONB DEFAULT '[]',
    objections JSONB DEFAULT '[]',
    next_steps JSONB DEFAULT '[]',
    sentiment VARCHAR(20),
    buying_stage VARCHAR(30),
    confidence_score FLOAT DEFAULT 0,
    raw_extraction JSONB,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Indexes for fast lookup
CREATE INDEX IF NOT EXISTS idx_transcripts_file_hash ON discovery_transcripts(file_hash);
CREATE INDEX IF NOT EXISTS idx_transcripts_status ON discovery_transcripts(status);
CREATE INDEX IF NOT EXISTS idx_transcripts_created ON discovery_transcripts(created_at DESC);
CREATE INDEX IF NOT EXISTS idx_intel_business ON discovery_intel(business_name);
CREATE INDEX IF NOT EXISTS idx_intel_industry ON discovery_intel(industry);
CREATE INDEX IF NOT EXISTS idx_intel_sentiment ON discovery_intel(sentiment);
CREATE INDEX IF NOT EXISTS idx_intel_buying_stage ON discovery_intel(buying_stage);
"""


# =============================================================================
# ENTITY EXTRACTION PROMPT
# =============================================================================

# Prompt template for Gemini entity extraction (used by _extract_intel).
# The single {transcript} placeholder is filled via str.format(); the literal
# JSON braces are escaped as {{ }} so .format() leaves them intact.
EXTRACTION_PROMPT = """Analyze this discovery call transcript and extract business intelligence.

TRANSCRIPT:
{transcript}

Extract the following information in JSON format. Be thorough and accurate.
If information is not present, use null or empty arrays.

{{
    "business_name": "Name of the business/prospect (string or null)",
    "contact_name": "Name of the main contact (string or null)",
    "contact_email": "Email address mentioned (string or null)",
    "contact_phone": "Phone number mentioned (string or null)",
    "industry": "Business industry/vertical (string or null)",
    "business_size": "Size indicator (e.g., '10-50 employees', 'SMB', 'enterprise')",
    "pain_points": ["List of pain points and problems mentioned"],
    "current_tools": ["Tools, software, or systems they currently use"],
    "budget_signals": ["Any mentions of budget, pricing concerns, or investment willingness"],
    "timeline": "When they need a solution (string or null)",
    "decision_makers": ["Names or roles of decision makers mentioned"],
    "competitors_mentioned": ["Competitor products or services mentioned"],
    "key_requirements": ["Must-have features or requirements"],
    "objections": ["Concerns, hesitations, or objections raised"],
    "next_steps": ["Agreed next steps or follow-up actions"],
    "sentiment": "Overall sentiment: 'positive', 'neutral', or 'negative'",
    "buying_stage": "Stage in buying journey: 'awareness', 'consideration', or 'decision'",
    "confidence_score": 0.0 to 1.0 indicating how confident you are in this extraction
}}

IMPORTANT:
- Extract actual information from the transcript, don't invent details
- Pain points should be specific problems, not generic statements
- Budget signals include any monetary discussions, ROI concerns, or price sensitivity
- Be conservative with confidence score - lower if transcript is unclear

Respond with ONLY the JSON object, no other text."""


# =============================================================================
# AUDIO INGESTION PIPELINE
# =============================================================================

class AudioIngestionPipeline:
    """
    Complete pipeline for ingesting discovery call audio files.

    Pipeline Steps:
    1. Validate and hash audio file
    2. Transcribe with Whisper (local or API)
    3. Extract entities with Gemini
    4. Store in PostgreSQL (discovery_transcripts + discovery_intel)
    5. Generate and store embeddings in Qdrant

    Attributes:
        db_pool: PostgreSQL connection pool (mandatory; init raises without it)
        qdrant_client: Qdrant vector client (None when unavailable)
        whisper_model: Local Whisper model (None when unavailable)
        embedding_model: Sentence transformer model (None when unavailable)
    """

    # Accepted audio containers; ingest() rejects any other file suffix.
    SUPPORTED_FORMATS = {'.mp3', '.m4a', '.wav', '.webm', '.ogg', '.flac'}
    # Qdrant collection holding one point per ingested transcript.
    COLLECTION_NAME = "discovery_embeddings"
    VECTOR_SIZE = 384  # all-MiniLM-L6-v2 default embedding dimension

    def __init__(
        self,
        use_local_whisper: bool = True,
        whisper_model_size: str = "base",
        gemini_api_key: Optional[str] = None,
        openai_api_key: Optional[str] = None
    ):
        """
        Initialize the audio ingestion pipeline.

        Args:
            use_local_whisper: Use local Whisper model (vs OpenAI API)
            whisper_model_size: Whisper model size (tiny, base, small, medium, large)
            gemini_api_key: Optional Gemini API key (falls back to env)
            openai_api_key: Optional OpenAI API key for Whisper API

        Raises:
            Exception: re-raised from _init_database when PostgreSQL is
                unreachable. Qdrant/Whisper/embedding failures are logged
                and degrade gracefully instead of raising.
        """
        # If the local whisper package is missing, this flag becomes False and
        # the OpenAI API fallback below activates regardless of the caller's
        # preference.
        self.use_local_whisper = use_local_whisper and WHISPER_LOCAL_AVAILABLE
        self.whisper_model_size = whisper_model_size

        # Initialize database pool (mandatory -- raises on failure)
        self._init_database()

        # Initialize Qdrant (optional -- failure leaves qdrant_client = None)
        self._init_qdrant()

        # Initialize Whisper (optional -- failure leaves whisper_model = None)
        self.whisper_model = None
        if self.use_local_whisper:
            self._init_whisper()

        # Initialize OpenAI for Whisper API fallback
        self.openai_client = None
        if OPENAI_AVAILABLE and not self.use_local_whisper:
            api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
            if api_key:
                self.openai_client = openai.OpenAI(api_key=api_key)

        # Initialize Gemini for entity extraction (key: arg > env > key file)
        self.gemini_api_key = gemini_api_key or self._load_gemini_key()
        if GENAI_AVAILABLE and self.gemini_api_key:
            genai.configure(api_key=self.gemini_api_key)

        # Initialize embedding model (optional -- needed for Qdrant embeddings)
        self.embedding_model = None
        if SENTENCE_TRANSFORMERS_AVAILABLE:
            self._init_embedding_model()

        logger.info("AudioIngestionPipeline initialized")

    def _load_gemini_key(self) -> Optional[str]:
        """Locate the Gemini API key: environment variables first, then the
        credentials file. Returns None when no key can be found."""
        # Environment takes precedence.
        for env_name in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
            env_value = os.environ.get(env_name)
            if env_value:
                return env_value

        # Fall back to the on-disk credentials file.
        credentials_path = Path("E:/genesis-system/Credentials/GoogleAIStudio-Gemini-AgileAdapt-API-KEY.txt")
        if not credentials_path.exists():
            return None

        raw = credentials_path.read_text().strip()
        # The file may hold either the bare key or a KEY=value line.
        if "=" in raw:
            return raw.split("=", 1)[1].strip()
        return raw

    def _init_database(self):
        """Initialize PostgreSQL connection pool and schema.

        Creates a ThreadedConnectionPool (2-10 connections) against the
        configured PostgreSQL instance and runs the idempotent SCHEMA_SQL DDL.

        Raises:
            Exception: any pool/DDL failure is logged and re-raised -- the
                pipeline cannot operate without PostgreSQL.
        """
        try:
            self.db_pool = pool.ThreadedConnectionPool(
                2, 10,
                **PostgresConfig.get_connection_params()
            )

            # Create schema (all statements are IF NOT EXISTS, safe to re-run)
            conn = self.db_pool.getconn()
            try:
                with conn.cursor() as cur:
                    cur.execute(SCHEMA_SQL)
                conn.commit()
                logger.info("Database schema initialized")
            finally:
                self.db_pool.putconn(conn)

        except Exception as e:
            logger.error(f"Database initialization failed: {e}")
            raise

    def _init_qdrant(self):
        """Initialize Qdrant client and ensure the embeddings collection exists.

        Qdrant is optional: any failure (package missing, network, auth)
        leaves self.qdrant_client = None and the pipeline skips embeddings.
        """
        if not QDRANT_AVAILABLE:
            logger.warning("Qdrant client not available")
            self.qdrant_client = None
            return

        try:
            config = QdrantConfig()
            self.qdrant_client = QdrantClient(
                url=config.url,
                api_key=config.api_key,
                timeout=30  # seconds
            )

            # Ensure collection exists (create only when absent; idempotent)
            existing = {c.name for c in self.qdrant_client.get_collections().collections}
            if self.COLLECTION_NAME not in existing:
                self.qdrant_client.create_collection(
                    collection_name=self.COLLECTION_NAME,
                    vectors_config=VectorParams(
                        size=self.VECTOR_SIZE,  # must match embedding model output dim
                        distance=Distance.COSINE
                    )
                )
                logger.info(f"Created Qdrant collection: {self.COLLECTION_NAME}")

        except Exception as e:
            logger.error(f"Qdrant initialization failed: {e}")
            self.qdrant_client = None

    def _init_whisper(self):
        """Load the local Whisper model; on failure leave whisper_model as None."""
        if not WHISPER_LOCAL_AVAILABLE:
            logger.warning("Local Whisper not available")
            return

        logger.info(f"Loading Whisper model: {self.whisper_model_size}")
        try:
            self.whisper_model = whisper.load_model(self.whisper_model_size)
        except Exception as e:
            logger.error(f"Whisper initialization failed: {e}")
            self.whisper_model = None
        else:
            logger.info("Whisper model loaded")

    def _init_embedding_model(self):
        """Load the sentence-transformer used to embed transcripts for Qdrant."""
        try:
            model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            logger.error(f"Embedding model initialization failed: {e}")
        else:
            self.embedding_model = model
            logger.info("Embedding model loaded")

    # =========================================================================
    # MAIN INGESTION METHOD
    # =========================================================================

    def ingest(
        self,
        file_path: str,
        source: str = "agileadapt",
        force_reprocess: bool = False
    ) -> IngestResult:
        """
        Ingest an audio file through the complete pipeline.

        Args:
            file_path: Path to the audio file
            source: Source identifier (e.g., 'agileadapt')
            force_reprocess: Reprocess even if file was already ingested

        Returns:
            IngestResult with transcript and extracted intel.

            Note: a duplicate file (same SHA-256) short-circuits with
            success=True, the existing transcript_id, and an informational
            message in `error` -- no re-transcription happens unless
            force_reprocess=True. Embedding storage is best-effort: a
            Qdrant/embedding failure does not fail the ingest.
        """
        import time
        start_time = time.time()
        steps_completed = []

        try:
            # Step 1: Validate file (must exist and have a supported suffix)
            path = Path(file_path)
            if not path.exists():
                return IngestResult(
                    success=False,
                    error=f"File not found: {file_path}",
                    processing_time=time.time() - start_time
                )

            if path.suffix.lower() not in self.SUPPORTED_FORMATS:
                return IngestResult(
                    success=False,
                    error=f"Unsupported format: {path.suffix}. Supported: {self.SUPPORTED_FORMATS}",
                    processing_time=time.time() - start_time
                )

            steps_completed.append("validate_file")

            # Step 2: Check for duplicate via content hash
            file_hash = self._compute_file_hash(path)
            existing = self._check_existing(file_hash)

            if existing and not force_reprocess:
                logger.info(f"File already ingested: {existing['transcript_id']}")
                # Deliberate: success=True with an informational `error` message
                return IngestResult(
                    success=True,
                    transcript_id=existing['transcript_id'],
                    error="File already ingested (use force_reprocess=True to reprocess)",
                    processing_time=time.time() - start_time,
                    steps_completed=steps_completed
                )

            steps_completed.append("check_duplicate")

            # Step 3: Transcribe (local Whisper or OpenAI API; raises if neither)
            transcript_text, duration, language = self._transcribe(str(path))
            if not transcript_text:
                return IngestResult(
                    success=False,
                    error="Transcription failed or produced empty result",
                    processing_time=time.time() - start_time,
                    steps_completed=steps_completed
                )

            steps_completed.append("transcribe")
            logger.info(f"Transcribed {duration:.1f}s of audio")

            # Step 4: Extract intel (returns empty intel on Gemini failure)
            intel = self._extract_intel(transcript_text)
            steps_completed.append("extract_intel")
            logger.info(f"Extracted intel: {intel.business_name or 'Unknown'}")

            # Step 5: Store in PostgreSQL
            transcript_id = self._store_transcript(
                file_path=str(path),
                file_hash=file_hash,
                transcript_text=transcript_text,
                duration=duration,
                language=language,
                source=source
            )

            self._store_intel(transcript_id, intel)
            steps_completed.append("store_postgres")
            logger.info(f"Stored in PostgreSQL: {transcript_id}")

            # Step 6: Generate and store embeddings (best-effort)
            embedding_id = None
            if self.qdrant_client and self.embedding_model:
                embedding_id = self._store_embedding(
                    transcript_id=transcript_id,
                    text=transcript_text,
                    intel=intel
                )
                steps_completed.append("store_embedding")
                logger.info(f"Stored embedding: {embedding_id}")

            # Update transcript with embedding ID
            if embedding_id:
                self._update_embedding_id(transcript_id, embedding_id)

            # Build result
            transcript = DiscoveryTranscript(
                transcript_id=transcript_id,
                file_path=str(path),
                file_hash=file_hash,
                transcript_text=transcript_text,
                duration_seconds=duration,
                language=language,
                extracted_intel=intel,
                embedding_id=embedding_id,
                created_at=datetime.now(timezone.utc).isoformat(),
                status="active",
                source=source
            )

            return IngestResult(
                success=True,
                transcript_id=transcript_id,
                transcript=transcript,
                intel=intel,
                processing_time=time.time() - start_time,
                steps_completed=steps_completed
            )

        except Exception as e:
            # Catch-all boundary: any uncaught step failure becomes an error result
            logger.error(f"Ingestion failed: {e}")
            return IngestResult(
                success=False,
                error=str(e),
                processing_time=time.time() - start_time,
                steps_completed=steps_completed
            )

    # =========================================================================
    # PIPELINE STEPS
    # =========================================================================

    def _compute_file_hash(self, path: Path) -> str:
        """Return the SHA-256 hex digest of the file at *path* (dedup key)."""
        digest = hashlib.sha256()
        with open(path, 'rb') as handle:
            while True:
                chunk = handle.read(8192)
                if not chunk:
                    break
                digest.update(chunk)
        return digest.hexdigest()

    def _check_existing(self, file_hash: str) -> Optional[Dict]:
        """Return {'transcript_id': ...} if this hash was ingested before, else None."""
        conn = self.db_pool.getconn()
        try:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute(
                    "SELECT transcript_id FROM discovery_transcripts WHERE file_hash = %s",
                    (file_hash,)
                )
                row = cur.fetchone()
        finally:
            self.db_pool.putconn(conn)
        return row

    def _transcribe(self, file_path: str) -> Tuple[str, float, str]:
        """
        Transcribe audio file using Whisper.

        Prefers the local Whisper model; falls back to the OpenAI
        transcription API when no local model is loaded.

        Returns:
            Tuple of (transcript_text, duration_seconds, language)

        Raises:
            RuntimeError: if neither transcription backend is available.
        """
        # Try local Whisper first
        if self.whisper_model:
            result = self.whisper_model.transcribe(
                file_path,
                language=None,  # Auto-detect
                task="transcribe"
            )
            # NOTE(review): openai-whisper's result dict does not normally
            # carry a "duration" key, so the size-based estimate is likely
            # what actually gets used here -- confirm against the installed
            # whisper version.
            return (
                result.get("text", "").strip(),
                result.get("duration", 0) or self._estimate_duration(file_path),
                result.get("language", "en")
            )

        # Fallback to OpenAI Whisper API
        if self.openai_client:
            with open(file_path, "rb") as audio_file:
                result = self.openai_client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    response_format="verbose_json"  # verbose form includes duration/language
                )
                return (
                    result.text.strip(),
                    getattr(result, 'duration', 0) or self._estimate_duration(file_path),
                    getattr(result, 'language', 'en')
                )

        raise RuntimeError("No transcription method available (need Whisper local or OpenAI API)")

    def _estimate_duration(self, file_path: str) -> float:
        """Rough duration guess from file size; 0.0 when the file is unreadable."""
        try:
            megabytes = os.path.getsize(file_path) / 1_000_000
        except Exception:
            return 0.0
        # Rough estimate: ~1MB per minute for MP3 at 128kbps
        return megabytes * 60

    def _extract_intel(self, transcript_text: str) -> ExtractedIntel:
        """
        Extract business intelligence from transcript using Gemini.

        Args:
            transcript_text: The transcribed text

        Returns:
            ExtractedIntel with extracted information. An empty intel object
            (confidence_score=0.0) is returned when Gemini is unavailable or
            the model response cannot be parsed as JSON.
        """
        if not GENAI_AVAILABLE or not self.gemini_api_key:
            logger.warning("Gemini not available, returning empty intel")
            return ExtractedIntel(confidence_score=0.0)

        try:
            model = genai.GenerativeModel('gemini-2.0-flash')

            prompt = EXTRACTION_PROMPT.format(
                transcript=transcript_text[:15000]  # Limit to ~15k chars
            )

            response = model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(
                    temperature=0.2,  # Lower for more consistent extraction
                    max_output_tokens=2048
                )
            )

            # Parse JSON response
            response_text = response.text.strip()

            # Strip markdown code fences if present. Only drop the trailing
            # line when it really is a closing fence -- the previous approach
            # blindly removed the last line, which destroyed valid JSON
            # whenever the model omitted the closing ```.
            if response_text.startswith("```"):
                lines = response_text.split("\n")[1:]
                if lines and lines[-1].strip().startswith("```"):
                    lines = lines[:-1]
                response_text = "\n".join(lines)

            data = json.loads(response_text)

            intel = ExtractedIntel(
                business_name=data.get("business_name"),
                contact_name=data.get("contact_name"),
                contact_email=data.get("contact_email"),
                contact_phone=data.get("contact_phone"),
                industry=data.get("industry"),
                business_size=data.get("business_size"),
                pain_points=data.get("pain_points", []),
                current_tools=data.get("current_tools", []),
                budget_signals=data.get("budget_signals", []),
                timeline=data.get("timeline"),
                decision_makers=data.get("decision_makers", []),
                competitors_mentioned=data.get("competitors_mentioned", []),
                key_requirements=data.get("key_requirements", []),
                objections=data.get("objections", []),
                next_steps=data.get("next_steps", []),
                sentiment=data.get("sentiment"),
                buying_stage=data.get("buying_stage"),
                confidence_score=float(data.get("confidence_score", 0.0)),
                raw_extraction=data
            )

            return intel

        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse Gemini response as JSON: {e}")
            return ExtractedIntel(confidence_score=0.0)
        except Exception as e:
            logger.error(f"Intel extraction failed: {e}")
            return ExtractedIntel(confidence_score=0.0)

    def _store_transcript(
        self,
        file_path: str,
        file_hash: str,
        transcript_text: str,
        duration: float,
        language: str,
        source: str
    ) -> str:
        """Insert (or refresh) a transcript row; returns transcript_id as str.

        On a file_hash conflict (re-ingest with force_reprocess=True) the row
        is now updated in place. Previously only updated_at was touched, so a
        re-transcription silently discarded the new text/duration/language.
        """
        conn = self.db_pool.getconn()
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO discovery_transcripts
                    (file_path, file_hash, transcript_text, duration_seconds, language, source)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (file_hash) DO UPDATE SET
                        file_path = EXCLUDED.file_path,
                        transcript_text = EXCLUDED.transcript_text,
                        duration_seconds = EXCLUDED.duration_seconds,
                        language = EXCLUDED.language,
                        source = EXCLUDED.source,
                        updated_at = NOW()
                    RETURNING transcript_id
                """, (file_path, file_hash, transcript_text, duration, language, source))

                row = cur.fetchone()
            conn.commit()
            return str(row[0])
        except Exception:
            conn.rollback()  # never return an aborted transaction to the pool
            raise
        finally:
            self.db_pool.putconn(conn)

    def _store_intel(self, transcript_id: str, intel: ExtractedIntel):
        """Store extracted intel as one discovery_intel row (lists as JSONB).

        NOTE(review): discovery_intel has no unique constraint on
        transcript_id, so the ON CONFLICT clause can never fire and a
        force-reprocessed transcript accumulates duplicate intel rows --
        consider UNIQUE(transcript_id) or delete-before-insert.
        """
        conn = self.db_pool.getconn()
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO discovery_intel (
                        transcript_id, business_name, contact_name, contact_email,
                        contact_phone, industry, business_size, pain_points,
                        current_tools, budget_signals, timeline, decision_makers,
                        competitors_mentioned, key_requirements, objections,
                        next_steps, sentiment, buying_stage, confidence_score, raw_extraction
                    ) VALUES (
                        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
                    )
                    ON CONFLICT DO NOTHING
                """, (
                    transcript_id,
                    intel.business_name,
                    intel.contact_name,
                    intel.contact_email,
                    intel.contact_phone,
                    intel.industry,
                    intel.business_size,
                    json.dumps(intel.pain_points),
                    json.dumps(intel.current_tools),
                    json.dumps(intel.budget_signals),
                    intel.timeline,
                    json.dumps(intel.decision_makers),
                    json.dumps(intel.competitors_mentioned),
                    json.dumps(intel.key_requirements),
                    json.dumps(intel.objections),
                    json.dumps(intel.next_steps),
                    intel.sentiment,
                    intel.buying_stage,
                    intel.confidence_score,
                    json.dumps(intel.raw_extraction) if intel.raw_extraction else None
                ))
            conn.commit()
        except Exception:
            conn.rollback()  # never return an aborted transaction to the pool
            raise
        finally:
            self.db_pool.putconn(conn)

    def _store_embedding(
        self,
        transcript_id: str,
        text: str,
        intel: ExtractedIntel
    ) -> Optional[str]:
        """Embed the transcript and upsert one point into Qdrant.

        Returns the new point's UUID string, or None when Qdrant or the
        embedding model is unavailable or the upsert fails (failure is logged,
        not raised).
        """
        if not self.qdrant_client or not self.embedding_model:
            return None

        try:
            # Build the enriched text, embed it, and mint a point id.
            enriched_text = self._create_embedding_text(text, intel)
            vector = self.embedding_model.encode(enriched_text).tolist()
            point_id = str(uuid.uuid4())

            payload = {
                "transcript_id": transcript_id,
                "business_name": intel.business_name,
                "industry": intel.industry,
                "pain_points": intel.pain_points,
                "sentiment": intel.sentiment,
                "buying_stage": intel.buying_stage,
                "confidence_score": intel.confidence_score,
                "text_preview": text[:500],
                "created_at": datetime.now(timezone.utc).isoformat()
            }

            self.qdrant_client.upsert(
                collection_name=self.COLLECTION_NAME,
                points=[PointStruct(id=point_id, vector=vector, payload=payload)]
            )
            return point_id

        except Exception as e:
            logger.error(f"Embedding storage failed: {e}")
            return None

    def _create_embedding_text(self, transcript: str, intel: ExtractedIntel) -> str:
        """Create rich text for embedding generation."""
        parts = [transcript[:2000]]  # First 2000 chars of transcript

        if intel.business_name:
            parts.append(f"Business: {intel.business_name}")
        if intel.industry:
            parts.append(f"Industry: {intel.industry}")
        if intel.pain_points:
            parts.append(f"Pain points: {', '.join(intel.pain_points)}")
        if intel.key_requirements:
            parts.append(f"Requirements: {', '.join(intel.key_requirements)}")

        return " | ".join(parts)

    def _update_embedding_id(self, transcript_id: str, embedding_id: str):
        """Record the Qdrant point ID on the matching transcript row."""
        conn = self.db_pool.getconn()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    "UPDATE discovery_transcripts SET embedding_id = %s WHERE transcript_id = %s",
                    (embedding_id, transcript_id)
                )
                conn.commit()
        finally:
            # Always hand the connection back to the pool, even on error.
            self.db_pool.putconn(conn)

    # =========================================================================
    # SEARCH METHODS
    # =========================================================================

    def search_similar(
        self,
        query: str,
        limit: int = 10,
        min_score: float = 0.5,
        filters: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Semantic search over previously ingested discovery calls.

        Args:
            query: Free-text search query.
            limit: Maximum number of hits to return.
            min_score: Drop hits scoring below this similarity (0-1).
            filters: Optional exact-match payload filters
                (e.g. industry, sentiment).

        Returns:
            Matching transcripts with similarity scores; empty list when
            search is unavailable or fails (errors are logged, not raised).
        """
        if self.qdrant_client is None or self.embedding_model is None:
            logger.warning("Semantic search not available")
            return []

        try:
            vector = self.embedding_model.encode(query).tolist()

            # Translate the filters dict into Qdrant exact-match conditions.
            qdrant_filter = None
            if filters:
                must = [
                    FieldCondition(key=key, match=MatchValue(value=value))
                    for key, value in filters.items()
                ]
                if must:
                    qdrant_filter = Filter(must=must)

            hits = self.qdrant_client.search(
                collection_name=self.COLLECTION_NAME,
                query_vector=vector,
                query_filter=qdrant_filter,
                limit=limit,
                score_threshold=min_score,
                with_payload=True
            )

            # Flatten each hit's payload into a plain result dict.
            matches = []
            for hit in hits:
                payload = hit.payload
                matches.append({
                    "transcript_id": payload.get("transcript_id"),
                    "business_name": payload.get("business_name"),
                    "industry": payload.get("industry"),
                    "pain_points": payload.get("pain_points", []),
                    "sentiment": payload.get("sentiment"),
                    "buying_stage": payload.get("buying_stage"),
                    "text_preview": payload.get("text_preview"),
                    "score": hit.score
                })
            return matches

        except Exception as e:
            logger.error(f"Search failed: {e}")
            return []

    def get_transcript(self, transcript_id: str) -> Optional[DiscoveryTranscript]:
        """
        Retrieve a transcript (plus any extracted intel) by ID.

        Returns:
            The reconstructed DiscoveryTranscript, or None when no
            transcript row matches.
        """
        conn = self.db_pool.getconn()
        try:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # FIX: list i.* FIRST so t.* wins RealDictCursor's
                # duplicate-key collapse (later columns overwrite earlier
                # ones). With "t.*, i.*", a transcript without an intel row
                # had its transcript_id shadowed by the NULL i.transcript_id
                # from the LEFT JOIN, producing transcript_id == 'None'.
                cur.execute("""
                    SELECT i.*, t.*
                    FROM discovery_transcripts t
                    LEFT JOIN discovery_intel i ON t.transcript_id = i.transcript_id
                    WHERE t.transcript_id = %s
                """, (transcript_id,))

                row = cur.fetchone()
                if not row:
                    return None

                # "or []" / "or default": NULL columns (e.g. when the LEFT
                # JOIN found no intel row) exist in the dict as None, so
                # .get()'s default would never apply.
                intel = ExtractedIntel(
                    business_name=row.get('business_name'),
                    contact_name=row.get('contact_name'),
                    contact_email=row.get('contact_email'),
                    contact_phone=row.get('contact_phone'),
                    industry=row.get('industry'),
                    business_size=row.get('business_size'),
                    pain_points=row.get('pain_points') or [],
                    current_tools=row.get('current_tools') or [],
                    budget_signals=row.get('budget_signals') or [],
                    timeline=row.get('timeline'),
                    decision_makers=row.get('decision_makers') or [],
                    competitors_mentioned=row.get('competitors_mentioned') or [],
                    key_requirements=row.get('key_requirements') or [],
                    objections=row.get('objections') or [],
                    next_steps=row.get('next_steps') or [],
                    sentiment=row.get('sentiment'),
                    buying_stage=row.get('buying_stage'),
                    confidence_score=row.get('confidence_score') or 0.0
                )

                return DiscoveryTranscript(
                    transcript_id=str(row['transcript_id']),
                    file_path=row['file_path'],
                    file_hash=row['file_hash'],
                    transcript_text=row['transcript_text'],
                    duration_seconds=row.get('duration_seconds') or 0,
                    language=row.get('language') or 'en',
                    extracted_intel=intel,
                    embedding_id=str(row['embedding_id']) if row.get('embedding_id') else None,
                    created_at=row['created_at'].isoformat() if row.get('created_at') else None,
                    status=row.get('status') or 'active',
                    source=row.get('source') or 'agileadapt'
                )

        finally:
            self.db_pool.putconn(conn)

    def list_transcripts(
        self,
        limit: int = 50,
        offset: int = 0,
        source: Optional[str] = None,
        status: str = "active"
    ) -> List[Dict]:
        """Return summary rows for stored transcripts, newest first.

        Args:
            limit: Page size.
            offset: Rows to skip (for pagination).
            source: Optional source filter (e.g. "agileadapt").
            status: Transcript status to include (default "active").
        """
        conn = self.db_pool.getconn()
        try:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                sql = """
                    SELECT
                        t.transcript_id, t.file_path, t.duration_seconds,
                        t.created_at, t.status, t.source,
                        i.business_name, i.industry, i.sentiment, i.buying_stage
                    FROM discovery_transcripts t
                    LEFT JOIN discovery_intel i ON t.transcript_id = i.transcript_id
                    WHERE t.status = %s
                """
                args = [status]

                # Optional source filter is appended before pagination clauses.
                if source:
                    sql += " AND t.source = %s"
                    args.append(source)

                sql += " ORDER BY t.created_at DESC LIMIT %s OFFSET %s"
                args += [limit, offset]

                cur.execute(sql, args)
                rows = cur.fetchall()

            return [dict(r) for r in rows]

        finally:
            self.db_pool.putconn(conn)

    # =========================================================================
    # STATISTICS & HEALTH
    # =========================================================================

    def get_stats(self) -> Dict[str, Any]:
        """
        Get pipeline statistics.

        Returns:
            Dict with transcript counts/durations, a top-10 industry
            breakdown, a sentiment breakdown, and availability flags for
            Qdrant / Whisper / Gemini.
        """
        conn = self.db_pool.getconn()
        try:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # FIX: use float literals (3600.0 / 60.0) so Postgres cannot
                # perform truncating integer division — duration_seconds
                # appears to be integer-typed (its default elsewhere is 0),
                # and int / int truncates toward zero in Postgres.
                cur.execute("""
                    SELECT
                        COUNT(*) as total_transcripts,
                        COUNT(*) FILTER (WHERE status = 'active') as active,
                        SUM(duration_seconds) / 3600.0 as total_hours,
                        AVG(duration_seconds) / 60.0 as avg_minutes
                    FROM discovery_transcripts
                """)
                transcript_stats = cur.fetchone()

                cur.execute("""
                    SELECT
                        industry, COUNT(*) as count
                    FROM discovery_intel
                    WHERE industry IS NOT NULL
                    GROUP BY industry
                    ORDER BY count DESC
                    LIMIT 10
                """)
                by_industry = {row['industry']: row['count'] for row in cur.fetchall()}

                cur.execute("""
                    SELECT
                        sentiment, COUNT(*) as count
                    FROM discovery_intel
                    WHERE sentiment IS NOT NULL
                    GROUP BY sentiment
                """)
                by_sentiment = {row['sentiment']: row['count'] for row in cur.fetchall()}

                # FIX: convert to float before rounding — psycopg2 returns
                # numeric results (AVG, numeric division) as Decimal, which
                # json.dumps in the CLI cannot serialize.
                return {
                    "transcripts": {
                        "total": transcript_stats['total_transcripts'],
                        "active": transcript_stats['active'],
                        "total_hours": round(float(transcript_stats['total_hours'] or 0), 1),
                        "avg_minutes": round(float(transcript_stats['avg_minutes'] or 0), 1)
                    },
                    "by_industry": by_industry,
                    "by_sentiment": by_sentiment,
                    "qdrant_available": self.qdrant_client is not None,
                    "whisper_available": self.whisper_model is not None,
                    "gemini_available": GENAI_AVAILABLE and self.gemini_api_key is not None
                }

        finally:
            self.db_pool.putconn(conn)

    def health_check(self) -> Dict[str, bool]:
        """
        Check health of all components.

        Returns:
            Mapping of component name -> availability flag. Never raises;
            probe failures simply leave the flag False.
        """
        health = {
            "postgresql": False,
            "qdrant": False,
            "whisper": self.whisper_model is not None,
            "gemini": GENAI_AVAILABLE and self.gemini_api_key is not None,
            "embeddings": self.embedding_model is not None
        }

        # Test PostgreSQL with a trivial round-trip query.
        # FIX: return the connection in a finally block — previously a
        # failure inside cursor()/execute() skipped putconn() and leaked
        # the pooled connection.
        conn = None
        try:
            conn = self.db_pool.getconn()
            with conn.cursor() as cur:
                cur.execute("SELECT 1")
            health["postgresql"] = True
        except Exception:
            pass
        finally:
            if conn is not None:
                try:
                    self.db_pool.putconn(conn)
                except Exception:
                    pass  # pool may already be closed; health stays best-effort

        # Test Qdrant reachability with a lightweight metadata call.
        if self.qdrant_client:
            try:
                self.qdrant_client.get_collections()
                health["qdrant"] = True
            except Exception:
                pass

        return health

    def close(self):
        """Release all pooled database connections and log shutdown."""
        pool = getattr(self, 'db_pool', None)
        if pool:
            pool.closeall()
        logger.info("AudioIngestionPipeline closed")


# =============================================================================
# CLI INTERFACE
# =============================================================================

def main():
    """Command-line interface for audio ingestion.

    Runs exactly one of four modes per invocation: --health, --stats,
    --search QUERY, or ingestion of the positional audio file. The
    pipeline is always closed on exit.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Ingest discovery call recordings",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python audio_ingestion.py /path/to/call.mp3
  python audio_ingestion.py /path/to/call.m4a --source agileadapt
  python audio_ingestion.py --health
  python audio_ingestion.py --stats
        """
    )

    parser.add_argument("file", nargs="?", help="Audio file to ingest")
    parser.add_argument("--source", default="agileadapt", help="Source identifier")
    parser.add_argument("--force", action="store_true", help="Force reprocessing")
    parser.add_argument("--health", action="store_true", help="Run health check")
    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("--search", help="Search query")
    # FIX: the lone store_true flag with default=True could never be turned
    # off; pair it with --api-whisper writing False to the same dest so the
    # API path is actually reachable from the CLI (backward compatible).
    parser.add_argument("--local-whisper", dest="local_whisper",
                        action="store_true", default=True,
                        help="Use local Whisper model (default)")
    parser.add_argument("--api-whisper", dest="local_whisper",
                        action="store_false",
                        help="Use the Whisper API instead of a local model")
    parser.add_argument("--whisper-size", default="base",
                        choices=["tiny", "base", "small", "medium", "large"],
                        help="Whisper model size")

    args = parser.parse_args()

    pipeline = AudioIngestionPipeline(
        use_local_whisper=args.local_whisper,
        whisper_model_size=args.whisper_size
    )

    try:
        if args.health:
            print("Health Check:")
            health = pipeline.health_check()
            for component, status in health.items():
                label = "OK" if status else "FAIL"
                print(f"  {component}: {label}")
            return

        if args.stats:
            print("Statistics:")
            stats = pipeline.get_stats()
            print(json.dumps(stats, indent=2))
            return

        if args.search:
            print(f"Searching: {args.search}")
            results = pipeline.search_similar(args.search)
            for r in results:
                print(f"\n  [{r['score']:.2f}] {r.get('business_name', 'Unknown')}")
                print(f"    Industry: {r.get('industry', 'N/A')}")
                print(f"    Pain points: {', '.join(r.get('pain_points', []))[:100]}")
            return

        # No mode flag and no file: show usage instead of failing.
        if not args.file:
            parser.print_help()
            return

        print(f"Ingesting: {args.file}")
        result = pipeline.ingest(
            file_path=args.file,
            source=args.source,
            force_reprocess=args.force
        )

        if result.success:
            print(f"\nSuccess! Transcript ID: {result.transcript_id}")
            print(f"Processing time: {result.processing_time:.1f}s")
            print(f"Steps: {', '.join(result.steps_completed)}")

            if result.intel:
                print("\nExtracted Intel:")
                print(f"  Business: {result.intel.business_name or 'Unknown'}")
                print(f"  Industry: {result.intel.industry or 'Unknown'}")
                print(f"  Sentiment: {result.intel.sentiment or 'Unknown'}")
                print(f"  Pain points: {result.intel.pain_points}")
                print(f"  Confidence: {result.intel.confidence_score:.0%}")
        else:
            print(f"\nFailed: {result.error}")

    finally:
        # Always release pooled DB connections, even on error paths.
        pipeline.close()


# Allow running this module directly as a CLI script.
if __name__ == "__main__":
    main()
