"""
Business Idea Extractor - Stories 10A-10E
==========================================
Mines 706,592 words of Kinan's conversations for revenue opportunities.

Stories:
- 10A: Business Idea Detector
- 10B: Idea Context Extractor
- 10C: Idea Deduplication & Clustering
- 10D: Business Ideas Master File
- 10E: Idea Synthesis Detector
"""

import json
import re
import hashlib
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional, Set, Tuple
from dataclasses import dataclass, field, asdict
from collections import defaultdict
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("business_idea_extractor")

# =============================================================================
# STORY 10A: Business Idea Detection Patterns
# =============================================================================

# Regexes that signal a business idea. Each pattern has exactly one capture
# group (10-200 chars of idea text), which is what findall() returns to the
# detector; patterns are compiled with IGNORECASE | DOTALL in __init__.
IDEA_PATTERNS = [
    # Direct idea expressions
    r"we could build\s+(.{10,200})",
    r"we should build\s+(.{10,200})",
    r"what if we\s+(.{10,200})",
    r"idea[:\s]+(.{10,200})",
    r"business idea[:\s]+(.{10,200})",
    r"product idea[:\s]+(.{10,200})",
    r"revenue stream[:\s]+(.{10,200})",
    r"we can monetize\s+(.{10,200})",
    r"we could sell\s+(.{10,200})",
    r"saas\s+(?:for|to|that)\s+(.{10,200})",
    r"platform\s+(?:for|to|that)\s+(.{10,200})",

    # Opportunity signals
    r"there's an opportunity\s+(.{10,200})",
    r"huge opportunity\s+(.{10,200})",
    r"untapped market\s+(.{10,200})",
    r"nobody is doing\s+(.{10,200})",
    r"gap in the market\s+(.{10,200})",

    # Revenue discussions
    r"could charge\s+(.{10,200})",
    r"pricing model\s+(.{10,200})",
    r"subscription\s+(?:for|model|service)\s+(.{10,200})",
    r"recurring revenue\s+(.{10,200})",
    r"monthly fee\s+(.{10,200})",

    # Product concepts
    r"ai agent\s+(?:for|that|to)\s+(.{10,200})",
    r"voice ai\s+(?:for|that|to)\s+(.{10,200})",
    r"automation\s+(?:for|that|to)\s+(.{10,200})",
    r"tool\s+(?:for|that|to)\s+(.{10,200})",
    r"app\s+(?:for|that|to)\s+(.{10,200})",
]

# Tools/platforms often associated with ideas. One capture group each, so
# findall() yields the tool name itself (input is lowercased by the caller).
TOOL_PATTERNS = [
    r"\b(ghl|gohighlevel)\b",
    r"\b(vapi)\b",
    r"\b(telnyx)\b",
    r"\b(n8n)\b",
    r"\b(instantly)\b",
    r"\b(make\.com|zapier)\b",
    r"\b(claude|gpt|gemini)\b",
    r"\b(supabase|firebase)\b",
    r"\b(stripe)\b",
]

# Target-market indicators; same single-capture-group convention.
MARKET_PATTERNS = [
    r"\b(tradies?|tradespeople|plumbers?|electricians?)\b",
    r"\b(small business|smb)\b",
    r"\b(agencies?|agency)\b",
    r"\b(saas|software)\b",
    r"\b(healthcare|medical)\b",
    r"\b(real estate|realtors?)\b",
    r"\b(e-?commerce)\b",
]

# =============================================================================
# STORY 10D: Data Structures
# =============================================================================

@dataclass
class BusinessIdea:
    """A deduplicated business idea mined from conversations (Story 10D).

    One record per unique idea: repeat detections are merged in by bumping
    `mentions` and extending the source lists rather than creating new rows.
    """
    id: str                 # "idea_NNNN", assigned in creation order
    title: str              # first sentence of the description, capped at 60 chars
    description: str        # full matched idea text
    problem_solved: str = ""        # problem statement pulled from context (often empty)
    target_market: str = ""         # comma-separated market keywords from MARKET_PATTERNS
    mentions: int = 1               # how many times this idea was detected
    first_mentioned: str = ""       # conversation created_at of the first detection
    last_mentioned: str = ""        # conversation created_at of the latest detection
    related_tools: List[str] = field(default_factory=list)          # tool names seen near the idea
    source_conversations: List[str] = field(default_factory=list)   # conversation UUIDs (deduped)
    source_quotes: List[str] = field(default_factory=list)          # raw quotes, truncated to 200 chars
    status: str = "raw"  # raw | validated | archived
    revenue_potential: Optional[str] = None     # free-form estimate; not set by the extractor itself
    synergy_candidates: List[str] = field(default_factory=list)     # ids of ideas that could combine with this one
    confidence: float = 0.0         # detection confidence; 0.5 base for pattern hits

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)

@dataclass
class IdeaCluster:
    """A group of related ideas (Story 10C).

    NOTE(review): nothing in this file populates clusters yet — the
    extractor's `clusters` dict stays empty and CLUSTERS_FILE is never
    written; kept for the planned clustering step.
    """
    cluster_id: str         # unique cluster identifier
    theme: str              # human-readable theme label
    ideas: List[str]        # member idea IDs
    total_mentions: int     # sum of mentions across member ideas
    first_seen: str         # earliest mention timestamp among members
    last_seen: str          # latest mention timestamp among members

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)

# =============================================================================
# MAIN EXTRACTOR CLASS
# =============================================================================

class BusinessIdeaExtractor:
    """
    Mines archived conversations for business ideas (Stories 10A-10E).

    Pipeline:
        1. Detect idea phrases in human messages via regex (10A).
        2. Extract surrounding context / problem / tools / markets (10B).
        3. Dedup and merge repeat mentions into BusinessIdea records (10C/10D).
        4. Score pairwise synthesis (combination) candidates (10E).
        5. Persist JSONL outputs and log a summary report.
    """

    # NOTE(review): hard-coded WSL-style absolute paths — confirm on target host.
    ARCHIVE_PATH = Path("/mnt/e/genesis-system/KNOWLEDGE_GRAPH/creator_mind/conversations_archive.jsonl")
    OUTPUT_DIR = Path("/mnt/e/genesis-system/KNOWLEDGE_GRAPH/creator_mind/revenue")
    IDEAS_FILE = OUTPUT_DIR / "business_ideas.jsonl"
    CLUSTERS_FILE = OUTPUT_DIR / "idea_clusters.jsonl"
    SYNTHESIS_FILE = OUTPUT_DIR / "synthesis_candidates.jsonl"

    def __init__(self):
        self.ideas: Dict[str, BusinessIdea] = {}    # idea_id -> merged idea record
        self.idea_hashes: Dict[str, str] = {}       # normalized-text hash -> idea_id (exact dedup)
        self.clusters: Dict[str, IdeaCluster] = {}  # reserved for cluster output (Story 10C)

        # Compile once up front; every pattern runs against every message.
        self.idea_patterns = [re.compile(p, re.IGNORECASE | re.DOTALL) for p in IDEA_PATTERNS]
        self.tool_patterns = [re.compile(p, re.IGNORECASE) for p in TOOL_PATTERNS]
        self.market_patterns = [re.compile(p, re.IGNORECASE) for p in MARKET_PATTERNS]

    # =========================================================================
    # STORY 10A: Business Idea Detector
    # =========================================================================

    def detect_ideas(self, text: str) -> List[Tuple[str, str]]:
        """
        Detect business-idea phrases in free text.

        Returns:
            List of ("pattern", matched_text) tuples. Matched text is
            whitespace-normalized and truncated at the first sentence end;
            fragments of 20 chars or fewer are discarded as noise.
        """
        ideas: List[Tuple[str, str]] = []

        for pattern in self.idea_patterns:
            for match in pattern.findall(text):
                clean = re.sub(r'\s+', ' ', match.strip())
                # Truncate at the first sentence boundary.
                if '.' in clean:
                    clean = clean.split('.')[0] + '.'
                if len(clean) > 20:  # filter out noise
                    ideas.append(("pattern", clean))

        return ideas

    def extract_tools(self, text: str) -> List[str]:
        """Return the distinct tool/platform names mentioned in *text*."""
        tools: Set[str] = set()
        for pattern in self.tool_patterns:
            # Lowercase the input so the returned names are normalized.
            tools.update(pattern.findall(text.lower()))
        return list(tools)

    def extract_markets(self, text: str) -> List[str]:
        """Return the distinct target-market keywords mentioned in *text*."""
        markets: Set[str] = set()
        for pattern in self.market_patterns:
            markets.update(pattern.findall(text.lower()))
        return list(markets)

    # =========================================================================
    # STORY 10B: Idea Context Extractor
    # =========================================================================

    def extract_context(self, text: str, idea_text: str) -> Dict[str, Any]:
        """
        Extract a ~1000-char window around *idea_text* inside *text*, plus the
        problem statement, tools, and markets found in that window.
        """
        # Locate the idea (first 50 chars, case-insensitive) in the message.
        # BUG FIX: was `idx > 0`, which discarded the surrounding context
        # whenever the idea started at offset 0 (find() returns 0 there).
        idx = text.lower().find(idea_text.lower()[:50])
        if idx != -1:
            # Grab up to 500 chars on either side of the idea.
            start = max(0, idx - 500)
            end = min(len(text), idx + len(idea_text) + 500)
            context = text[start:end]
        else:
            context = idea_text

        # Try to pull out the problem the idea addresses.
        problem_patterns = [
            r"problem[:\s]+(.{20,200})",
            r"pain point[:\s]+(.{20,200})",
            r"challenge[:\s]+(.{20,200})",
            r"struggle with\s+(.{20,200})",
            r"need\s+(?:a|to)\s+(.{20,200})",
        ]

        problem = ""
        for pattern in problem_patterns:
            match = re.search(pattern, context, re.IGNORECASE)
            if match:
                problem = match.group(1).strip()
                break

        return {
            "context": context,
            "problem_solved": problem,
            "tools": self.extract_tools(context),
            "markets": self.extract_markets(context),
        }

    # =========================================================================
    # STORY 10C: Deduplication & Clustering
    # =========================================================================

    def hash_idea(self, text: str) -> str:
        """Return a short case/punctuation/whitespace-insensitive hash of *text*."""
        normalized = re.sub(r'[^\w\s]', '', text.lower())
        normalized = re.sub(r'\s+', ' ', normalized).strip()
        # Only the first 100 chars participate, so long ideas sharing an
        # opening collapse together. Non-cryptographic use; md5 is fine here.
        return hashlib.md5(normalized[:100].encode()).hexdigest()[:12]

    def is_similar(self, text1: str, text2: str, threshold: float = 0.6) -> bool:
        """Jaccard word-overlap similarity; True when the ratio >= *threshold*."""
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())

        if not words1 or not words2:
            return False

        intersection = len(words1 & words2)
        union = len(words1 | words2)
        return intersection / union >= threshold

    def find_or_create_idea(self, idea_text: str, context: Dict,
                           conv_uuid: str, timestamp: str) -> BusinessIdea:
        """
        Merge *idea_text* into an existing idea (exact-hash or fuzzy match),
        or create and register a new BusinessIdea record.
        """
        idea_hash = self.hash_idea(idea_text)

        # 1) Exact duplicate (same normalized opening text).
        if idea_hash in self.idea_hashes:
            existing = self.ideas[self.idea_hashes[idea_hash]]
            self._record_mention(existing, idea_text, conv_uuid, timestamp)
            return existing

        # 2) Fuzzy duplicate (word-overlap). O(n) scan per new idea text.
        for idea in self.ideas.values():
            if self.is_similar(idea_text, idea.description):
                self._record_mention(idea, idea_text, conv_uuid, timestamp)
                # Merge in any newly seen tools.
                idea.related_tools = list(set(idea.related_tools + context.get('tools', [])))
                # Register this variant's hash so future exact repeats
                # short-circuit instead of re-scanning every idea.
                self.idea_hashes[idea_hash] = idea.id
                return idea

        # 3) Brand-new idea.
        idea_id = f"idea_{len(self.ideas) + 1:04d}"

        # Title = first sentence of the description, capped at 60 chars.
        title = idea_text[:60]
        if '.' in title:
            title = title.split('.')[0]
        title = title.strip()

        idea = BusinessIdea(
            id=idea_id,
            title=title,
            description=idea_text,
            problem_solved=context.get('problem_solved', ''),
            target_market=', '.join(context.get('markets', [])),
            first_mentioned=timestamp,
            last_mentioned=timestamp,
            related_tools=context.get('tools', []),
            source_conversations=[conv_uuid],
            source_quotes=[idea_text[:200]],
            confidence=0.5  # base confidence for a pattern-only detection
        )

        self.ideas[idea_id] = idea
        self.idea_hashes[idea_hash] = idea_id
        return idea

    @staticmethod
    def _record_mention(idea: BusinessIdea, idea_text: str,
                        conv_uuid: str, timestamp: str) -> None:
        """Bump mention bookkeeping on *idea* for a repeat detection."""
        idea.mentions += 1
        idea.last_mentioned = timestamp
        if conv_uuid not in idea.source_conversations:
            idea.source_conversations.append(conv_uuid)
        idea.source_quotes.append(idea_text[:200])

    # =========================================================================
    # STORY 10E: Synthesis Detection
    # =========================================================================

    def find_synthesis_candidates(self) -> List[Dict[str, Any]]:
        """
        Score every idea pair for combination potential.

        Signals: shared tools (0.3 each), shared markets (0.4 each), and
        loosely similar problem statements (0.3). Pairs scoring >= 0.5 are
        returned sorted by descending score and cross-linked via each idea's
        synergy_candidates list.
        """
        candidates: List[Dict[str, Any]] = []
        idea_list = list(self.ideas.values())

        for i, idea1 in enumerate(idea_list):
            for idea2 in idea_list[i + 1:]:
                synergy_score = 0
                synergy_reasons = []

                # Shared tools.
                shared_tools = set(idea1.related_tools) & set(idea2.related_tools)
                if shared_tools:
                    synergy_score += len(shared_tools) * 0.3
                    synergy_reasons.append(f"Shared tools: {', '.join(shared_tools)}")

                # Shared markets.
                # BUG FIX: ''.split(', ') yields {''}, so two market-less ideas
                # used to score a bogus 0.4 "shared market" and could cross the
                # threshold spuriously. Drop empty tokens before intersecting.
                markets1 = {m for m in idea1.target_market.lower().split(', ') if m}
                markets2 = {m for m in idea2.target_market.lower().split(', ') if m}
                shared_markets = markets1 & markets2
                if shared_markets:
                    synergy_score += len(shared_markets) * 0.4
                    synergy_reasons.append(f"Shared markets: {', '.join(shared_markets)}")

                # Related problems (deliberately loose 0.3 threshold).
                if idea1.problem_solved and idea2.problem_solved:
                    if self.is_similar(idea1.problem_solved, idea2.problem_solved, 0.3):
                        synergy_score += 0.3
                        synergy_reasons.append("Related problems")

                if synergy_score >= 0.5:
                    candidates.append({
                        "idea1_id": idea1.id,
                        "idea1_title": idea1.title,
                        "idea2_id": idea2.id,
                        "idea2_title": idea2.title,
                        "synergy_score": synergy_score,
                        "synergy_reasons": synergy_reasons,
                        "combined_mentions": idea1.mentions + idea2.mentions,
                        "synthesis_suggestion": f"Combine {idea1.title} with {idea2.title}"
                    })

                    # Cross-link, guarding against duplicates on re-runs.
                    if idea2.id not in idea1.synergy_candidates:
                        idea1.synergy_candidates.append(idea2.id)
                    if idea1.id not in idea2.synergy_candidates:
                        idea2.synergy_candidates.append(idea1.id)

        # Best opportunities first.
        candidates.sort(key=lambda x: x['synergy_score'], reverse=True)
        return candidates

    # =========================================================================
    # MAIN PROCESSING PIPELINE
    # =========================================================================

    def process_conversation(self, conv: Dict[str, Any]):
        """Scan one conversation's human messages for business ideas."""
        # BUG FIX: conv['uuid'] raised KeyError on malformed records; degrade
        # gracefully like every other field access here does.
        conv_uuid = conv.get('uuid', '')
        timestamp = conv.get('created_at', '')

        # Only human messages carry the creator's own ideas.
        for msg in conv.get('messages', []):
            if msg.get('sender') != 'human':
                continue

            text = msg.get('text', '')
            if not text:
                continue

            for _pattern_type, idea_text in self.detect_ideas(text):
                context = self.extract_context(text, idea_text)
                self.find_or_create_idea(idea_text, context, conv_uuid, timestamp)

    def run(self):
        """Execute the full pipeline; returns (ideas dict, synthesis list)."""
        logger.info("=" * 60)
        logger.info("PHASE 1B: BUSINESS IDEA EXTRACTION")
        logger.info("Stories 10A-10E")
        logger.info("=" * 60)

        logger.info(f"Loading conversations from {self.ARCHIVE_PATH}")
        conv_count = 0

        with open(self.ARCHIVE_PATH, encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the JSONL archive
                self.process_conversation(json.loads(line))
                conv_count += 1

                if conv_count % 100 == 0:
                    logger.info(f"Processed {conv_count} conversations, found {len(self.ideas)} unique ideas")

        logger.info(f"\nProcessed {conv_count} conversations")
        logger.info(f"Extracted {len(self.ideas)} unique business ideas")

        logger.info("\nFinding synthesis candidates...")
        synthesis = self.find_synthesis_candidates()
        logger.info(f"Found {len(synthesis)} synthesis opportunities")

        self.save_results(synthesis)
        self.generate_report(synthesis)

        return self.ideas, synthesis

    def save_results(self, synthesis: List[Dict]):
        """Write ideas (mention-sorted) and synthesis candidates as JSONL."""
        # BUG FIX: the output directory may not exist on first run; create it
        # so the open(..., 'w') calls below don't raise FileNotFoundError.
        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

        logger.info(f"Saving ideas to {self.IDEAS_FILE}")
        with open(self.IDEAS_FILE, 'w', encoding='utf-8') as f:
            for idea in sorted(self.ideas.values(), key=lambda x: x.mentions, reverse=True):
                f.write(json.dumps(idea.to_dict()) + '\n')

        logger.info(f"Saving synthesis candidates to {self.SYNTHESIS_FILE}")
        with open(self.SYNTHESIS_FILE, 'w', encoding='utf-8') as f:
            for candidate in synthesis:
                f.write(json.dumps(candidate) + '\n')

    def generate_report(self, synthesis: List[Dict]):
        """Log a human-readable summary of the extraction results."""
        top_ideas = sorted(self.ideas.values(), key=lambda x: x.mentions, reverse=True)[:20]
        ideas_with_tools = [i for i in self.ideas.values() if i.related_tools]
        obsessions = [i for i in self.ideas.values() if i.mentions >= 3]  # recurring themes

        logger.info("\n" + "=" * 60)
        logger.info("BUSINESS IDEAS EXTRACTION REPORT")
        logger.info("=" * 60)

        logger.info(f"\nTotal unique ideas: {len(self.ideas)}")
        logger.info(f"Ideas mentioned 3+ times (obsessions): {len(obsessions)}")
        logger.info(f"Ideas with tool associations: {len(ideas_with_tools)}")
        logger.info(f"Synthesis candidates: {len(synthesis)}")

        logger.info("\n--- TOP 10 IDEAS BY MENTIONS ---")
        for i, idea in enumerate(top_ideas[:10], 1):
            logger.info(f"{i}. [{idea.mentions}x] {idea.title}")
            if idea.related_tools:
                logger.info(f"   Tools: {', '.join(idea.related_tools)}")

        if obsessions:
            logger.info("\n--- OBSESSION IDEAS (3+ mentions) ---")
            for idea in sorted(obsessions, key=lambda x: x.mentions, reverse=True):
                logger.info(f"- [{idea.mentions}x] {idea.title}")

        if synthesis:
            logger.info("\n--- TOP 5 SYNTHESIS OPPORTUNITIES ---")
            for s in synthesis[:5]:
                logger.info(f"- {s['idea1_title'][:30]} + {s['idea2_title'][:30]}")
                logger.info(f"  Score: {s['synergy_score']:.2f} | Reasons: {', '.join(s['synergy_reasons'])}")


# =============================================================================
# CLI ENTRY POINT
# =============================================================================

if __name__ == "__main__":
    # Run the full extraction pipeline and report where the outputs landed.
    extractor = BusinessIdeaExtractor()
    ideas, synthesis = extractor.run()

    print(f"\nExtraction complete!")
    print(f"Ideas saved to: {extractor.IDEAS_FILE}")
    print(f"Synthesis saved to: {extractor.SYNTHESIS_FILE}")
