"""
AIVA Consciousness Enrichment: Creator Mind Absorber
=====================================================
Project GENESIS-MIND - 63 Atomic Stories

Transforms 615 Claude conversations (315MB) into AIVA's foundational
understanding of Kinan's mind, philosophy, and strategic vision.

Stories Implemented:
- Story 1: Streaming JSON Parser
- Story 2: Pydantic Schema Validators
- Story 3: Message Content Extractor
- Story 4: Chronological Index Builder
- Story 5: Message Deduplicator
- Story 6: Human/Assistant Splitter
- Story 7: Conversation Metadata Enricher
- Story 8: Progress Checkpoint System
- Story 9: Raw Data Archive
- Story 10: Processing Stats Dashboard
"""

import ijson
import json
import hashlib
import sys
sys.path.append('/mnt/e/genesis-system/data/genesis-memory')
from elestio_config import PostgresConfig
import psycopg2
import logging
import time
from pathlib import Path
from datetime import datetime
from typing import Optional, List, Dict, Any, Generator
from dataclasses import dataclass, field, asdict
from pydantic import BaseModel, Field, validator
from collections import defaultdict
import re

# Configure module-wide logging at INFO so progress dashboards are visible
# when the script runs as a long-lived batch job.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("creator_mind_absorber")

# =============================================================================
# STORY 2: Pydantic Schema Validators
# =============================================================================

class ContentBlock(BaseModel):
    """One entry of a Claude-export message's ``content`` array.

    Only the ``text`` payload is consumed downstream (see
    CreatorMindAbsorber.extract_message_text); the other fields are carried
    through for fidelity.
    """
    start_timestamp: Optional[str] = None  # ISO timestamp when the block began, if the export provides it
    stop_timestamp: Optional[str] = None   # ISO timestamp when the block ended, if the export provides it
    flags: Optional[Any] = None            # opaque export metadata; not interpreted here
    type: str = "text"                     # block kind as reported by the export; defaults to plain text
    text: str = ""                         # textual payload of the block
    citations: List[Any] = Field(default_factory=list)  # citation objects; passed through untouched

class ChatMessage(BaseModel):
    """Individual message within a conversation export.

    ``sender`` is normalized to exactly 'human' or 'assistant' by the
    validator below, so downstream code can rely on those two values.
    """
    uuid: str                  # message identifier from the export
    text: str = ""             # primary flat-text body
    content: List[ContentBlock] = Field(default_factory=list)  # rich content blocks (may duplicate/extend `text`)
    sender: str  # "human" or "assistant"
    created_at: str            # ISO timestamp string, kept as-is
    updated_at: Optional[str] = None
    attachments: List[Any] = Field(default_factory=list)  # passed through untouched
    files: List[Any] = Field(default_factory=list)        # passed through untouched

    # NOTE(review): pydantic v1-style @validator; under pydantic v2 this
    # should be @field_validator — confirm the installed pydantic version.
    @validator('sender')
    def normalize_sender(cls, v):
        """Normalize sender to human/assistant"""
        # Aliases 'human'/'user'/'kinan' collapse to 'human'; anything else
        # (including unknown senders) is treated as 'assistant'.
        v = v.lower()
        if v in ['human', 'user', 'kinan']:
            return 'human'
        return 'assistant'

class Account(BaseModel):
    """Account stub from the export; only the identifier is retained."""
    uuid: str  # account identifier

class Conversation(BaseModel):
    """Full conversation record as exported by Claude.

    Timestamps stay as ISO strings here; parsing to ``datetime`` happens
    later in CreatorMindAbsorber.enrich_conversation.
    """
    uuid: str                       # conversation identifier
    name: str = "Untitled"          # conversation title; defaulted when missing
    summary: Optional[str] = None   # export-provided summary, if any
    created_at: str
    updated_at: str
    account: Optional[Account] = None
    chat_messages: List[ChatMessage] = Field(default_factory=list)

# =============================================================================
# STORY 7: Enriched Conversation Metadata
# =============================================================================

@dataclass
class EnrichedConversation:
    """A conversation augmented with derived metadata (Story 7).

    Field order is part of the positional-constructor contract; do not
    reorder fields.
    """
    uuid: str
    name: str
    created_at: datetime
    updated_at: datetime
    message_count: int
    human_message_count: int
    assistant_message_count: int
    human_word_count: int
    assistant_word_count: int
    duration_minutes: float
    title_keywords: List[str]
    messages: List[Dict[str, Any]]
    message_hashes: List[str]  # For deduplication

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict; datetimes become ISO-8601 strings."""
        payload = asdict(self)
        payload['created_at'] = self.created_at.isoformat()
        payload['updated_at'] = self.updated_at.isoformat()
        return payload

# =============================================================================
# STORY 8: Checkpoint System
# =============================================================================

@dataclass
class ProcessingCheckpoint:
    """Resume point for the extraction pipeline (Story 8)."""
    last_processed_index: int   # stream index of the last conversation handled
    last_processed_uuid: str    # uuid of that conversation (or "COMPLETE")
    timestamp: str              # ISO timestamp of when the checkpoint was written
    stats: Dict[str, Any]       # snapshot of selected ProcessingStats counters

    @classmethod
    def load(cls, path: Path) -> Optional['ProcessingCheckpoint']:
        """Read a checkpoint from *path*; return None when no file exists."""
        if not path.exists():
            return None
        return cls(**json.loads(path.read_text()))

    def save(self, path: Path):
        """Write this checkpoint to *path* as pretty-printed JSON."""
        path.write_text(json.dumps(asdict(self), indent=2))

# =============================================================================
# STORY 10: Processing Stats
# =============================================================================

@dataclass
class ProcessingStats:
    """Live counters for the absorption pipeline (Story 10)."""
    total_conversations: int = 0
    processed_conversations: int = 0
    total_messages: int = 0
    human_messages: int = 0
    assistant_messages: int = 0
    total_human_words: int = 0
    total_assistant_words: int = 0
    duplicate_messages: int = 0
    errors: int = 0
    start_time: float = field(default_factory=time.time)  # wall-clock start of this run

    @property
    def rate(self) -> float:
        """Conversations processed per second; 0 before any time has elapsed."""
        elapsed = time.time() - self.start_time
        if elapsed <= 0:
            return 0
        return self.processed_conversations / elapsed

    @property
    def eta_seconds(self) -> float:
        """Estimated seconds to finish at the current rate; 0 when rate is 0."""
        current_rate = self.rate
        if current_rate <= 0:
            return 0
        return (self.total_conversations - self.processed_conversations) / current_rate

    def display(self):
        """Log the progress dashboard (output text intentionally unchanged)."""
        logger.info(f"""
╔══════════════════════════════════════════════════════════╗
║  CREATOR MIND ABSORPTION - Processing Stats              ║
╠══════════════════════════════════════════════════════════╣
║  Processed: {self.processed_conversations:>6} / {self.total_conversations:<6} ({100*self.processed_conversations/max(1,self.total_conversations):.1f}%)
║  Messages:  {self.total_messages:>6} (Human: {self.human_messages}, Assistant: {self.assistant_messages})
║  Words:     Human: {self.total_human_words:,} | Assistant: {self.total_assistant_words:,}
║  Duplicates: {self.duplicate_messages:>5} | Errors: {self.errors}
║  Rate: {self.rate:.2f} conv/sec | ETA: {self.eta_seconds/60:.1f} min
╚══════════════════════════════════════════════════════════╝
""")

# =============================================================================
# MAIN ABSORBER CLASS
# =============================================================================

class CreatorMindAbsorber:
    """
    Main orchestrator for absorbing Kinan's 615 conversations into AIVA's
    knowledge graph and semantic memory.
    """

    CONVERSATIONS_PATH = Path("/mnt/e/genesis-system/data/Kinan Claude Conversations up until Jan 12th 2026/conversations.json")
    OUTPUT_BASE = Path("/mnt/e/genesis-system/KNOWLEDGE_GRAPH/creator_mind")
    CHECKPOINT_PATH = OUTPUT_BASE / "processing_checkpoint.json"
    ARCHIVE_PATH = OUTPUT_BASE / "conversations_archive.jsonl"

    def __init__(self):
        self.stats = ProcessingStats()
        self.seen_hashes: set = set()  # For deduplication
        self.checkpoint: Optional[ProcessingCheckpoint] = None
        self._init_database()

    # =========================================================================
    # STORY 4: Chronological Index (PostgreSQL via Elestio)
    # =========================================================================

    def _init_database(self):
        """Initialize PostgreSQL database for chronological indexing"""
        self.conn = psycopg2.connect(**PostgresConfig.get_connection_params())
        cur = self.conn.cursor()
        cur.execute("""
            CREATE TABLE IF NOT EXISTS cm_conversations (
                uuid TEXT PRIMARY KEY,
                name TEXT,
                created_at TEXT,
                updated_at TEXT,
                message_count INTEGER,
                human_messages INTEGER,
                assistant_messages INTEGER,
                human_words INTEGER,
                assistant_words INTEGER,
                duration_minutes REAL,
                title_keywords TEXT,
                processed_at TEXT
            )
        """)
        cur.execute("""
            CREATE TABLE IF NOT EXISTS cm_messages (
                hash TEXT PRIMARY KEY,
                conversation_uuid TEXT,
                sender TEXT,
                word_count INTEGER,
                created_at TEXT,
                text_preview TEXT,
                FOREIGN KEY (conversation_uuid) REFERENCES cm_conversations(uuid)
            )
        """)
        cur.execute("""
            CREATE INDEX IF NOT EXISTS idx_cm_conv_created ON cm_conversations(created_at)
        """)
        cur.execute("""
            CREATE INDEX IF NOT EXISTS idx_cm_msg_sender ON cm_messages(sender)
        """)
        self.conn.commit()
        cur.close()

    # =========================================================================
    # STORY 1: Streaming JSON Parser
    # =========================================================================

    def stream_conversations(self) -> Generator[Dict[str, Any], None, None]:
        """
        Memory-efficient streaming parser for 315MB conversations.json
        Uses ijson for incremental parsing without loading entire file.
        """
        logger.info(f"Starting stream parse of {self.CONVERSATIONS_PATH}")

        with open(self.CONVERSATIONS_PATH, 'rb') as f:
            # ijson parses incrementally, yielding one conversation at a time
            parser = ijson.items(f, 'item')
            for conv in parser:
                yield conv

    # =========================================================================
    # STORY 3: Message Content Extractor
    # =========================================================================

    def extract_message_text(self, message: Dict[str, Any]) -> str:
        """
        Extract text from both 'text' field and 'content[]' blocks.
        Handles rich content with timestamps.
        """
        # Primary text field
        text = message.get('text', '')

        # Also check content blocks for additional text
        content_blocks = message.get('content', [])
        if content_blocks:
            block_texts = []
            for block in content_blocks:
                if isinstance(block, dict) and block.get('text'):
                    block_texts.append(block['text'])
            if block_texts:
                # Use content blocks if they have more detail
                combined = ' '.join(block_texts)
                if len(combined) > len(text):
                    text = combined

        return text.strip()

    # =========================================================================
    # STORY 5: Message Deduplicator
    # =========================================================================

    def hash_message(self, text: str, sender: str, timestamp: str) -> str:
        """Generate hash for deduplication"""
        content = f"{sender}:{timestamp}:{text[:500]}"
        return hashlib.md5(content.encode()).hexdigest()

    def is_duplicate(self, msg_hash: str) -> bool:
        """Check if message is duplicate"""
        if msg_hash in self.seen_hashes:
            self.stats.duplicate_messages += 1
            return True
        self.seen_hashes.add(msg_hash)
        return False

    # =========================================================================
    # STORY 6: Human/Assistant Splitter
    # =========================================================================

    def split_by_sender(self, messages: List[Dict]) -> Dict[str, List[Dict]]:
        """Separate messages by sender for distinct analysis"""
        result = {'human': [], 'assistant': []}
        for msg in messages:
            sender = msg.get('sender', 'assistant').lower()
            if sender in ['human', 'user', 'kinan']:
                result['human'].append(msg)
            else:
                result['assistant'].append(msg)
        return result

    # =========================================================================
    # STORY 7: Conversation Metadata Enricher
    # =========================================================================

    def extract_title_keywords(self, title: str) -> List[str]:
        """Extract keywords from conversation title"""
        # Remove common stop words
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        words = re.findall(r'\b\w+\b', title.lower())
        return [w for w in words if w not in stop_words and len(w) > 2]

    def enrich_conversation(self, raw_conv: Dict[str, Any]) -> EnrichedConversation:
        """Add derived metadata to conversation"""
        messages = raw_conv.get('chat_messages', [])

        # Parse timestamps
        created_at = datetime.fromisoformat(raw_conv['created_at'].replace('Z', '+00:00'))
        updated_at = datetime.fromisoformat(raw_conv['updated_at'].replace('Z', '+00:00'))

        # Calculate duration
        duration_minutes = (updated_at - created_at).total_seconds() / 60

        # Process messages
        processed_messages = []
        message_hashes = []
        human_words = 0
        assistant_words = 0
        human_count = 0
        assistant_count = 0

        for msg in messages:
            text = self.extract_message_text(msg)
            sender = msg.get('sender', 'assistant').lower()
            if sender in ['human', 'user', 'kinan']:
                sender = 'human'
            else:
                sender = 'assistant'

            word_count = len(text.split())
            msg_hash = self.hash_message(text, sender, msg.get('created_at', ''))

            if sender == 'human':
                human_words += word_count
                human_count += 1
            else:
                assistant_words += word_count
                assistant_count += 1

            processed_messages.append({
                'uuid': msg.get('uuid', ''),
                'sender': sender,
                'text': text,
                'word_count': word_count,
                'created_at': msg.get('created_at', ''),
                'hash': msg_hash
            })
            message_hashes.append(msg_hash)

        return EnrichedConversation(
            uuid=raw_conv['uuid'],
            name=raw_conv.get('name', 'Untitled'),
            created_at=created_at,
            updated_at=updated_at,
            message_count=len(messages),
            human_message_count=human_count,
            assistant_message_count=assistant_count,
            human_word_count=human_words,
            assistant_word_count=assistant_words,
            duration_minutes=duration_minutes,
            title_keywords=self.extract_title_keywords(raw_conv.get('name', '')),
            messages=processed_messages,
            message_hashes=message_hashes
        )

    # =========================================================================
    # STORY 9: Raw Data Archive
    # =========================================================================

    def archive_conversation(self, enriched: EnrichedConversation):
        """Store processed conversation as JSONL for re-analysis"""
        with open(self.ARCHIVE_PATH, 'a') as f:
            f.write(json.dumps(enriched.to_dict()) + '\n')

    def index_conversation(self, enriched: EnrichedConversation):
        """Add to PostgreSQL index for chronological queries"""
        cur = self.conn.cursor()
        cur.execute("""
            INSERT INTO cm_conversations VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (uuid) DO UPDATE SET
                name = EXCLUDED.name,
                created_at = EXCLUDED.created_at,
                updated_at = EXCLUDED.updated_at,
                message_count = EXCLUDED.message_count,
                human_messages = EXCLUDED.human_messages,
                assistant_messages = EXCLUDED.assistant_messages,
                human_words = EXCLUDED.human_words,
                assistant_words = EXCLUDED.assistant_words,
                duration_minutes = EXCLUDED.duration_minutes,
                title_keywords = EXCLUDED.title_keywords,
                processed_at = EXCLUDED.processed_at
        """, (
            enriched.uuid,
            enriched.name,
            enriched.created_at.isoformat(),
            enriched.updated_at.isoformat(),
            enriched.message_count,
            enriched.human_message_count,
            enriched.assistant_message_count,
            enriched.human_word_count,
            enriched.assistant_word_count,
            enriched.duration_minutes,
            json.dumps(enriched.title_keywords),
            datetime.now().isoformat()
        ))

        # Index individual messages
        for msg in enriched.messages:
            if not self.is_duplicate(msg['hash']):
                cur.execute("""
                    INSERT INTO cm_messages VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (
                    msg['hash'],
                    enriched.uuid,
                    msg['sender'],
                    msg['word_count'],
                    msg['created_at'],
                    msg['text'][:500]  # Preview only
                ))

        self.conn.commit()
        cur.close()

    # =========================================================================
    # STORY 8: Checkpoint System
    # =========================================================================

    def save_checkpoint(self, index: int, uuid: str):
        """Save processing checkpoint for resume capability"""
        checkpoint = ProcessingCheckpoint(
            last_processed_index=index,
            last_processed_uuid=uuid,
            timestamp=datetime.now().isoformat(),
            stats={
                'processed': self.stats.processed_conversations,
                'messages': self.stats.total_messages,
                'duplicates': self.stats.duplicate_messages
            }
        )
        checkpoint.save(self.CHECKPOINT_PATH)

    def load_checkpoint(self) -> Optional[int]:
        """Load checkpoint and return resume index"""
        self.checkpoint = ProcessingCheckpoint.load(self.CHECKPOINT_PATH)
        if self.checkpoint:
            logger.info(f"Resuming from checkpoint: conversation {self.checkpoint.last_processed_index}")
            return self.checkpoint.last_processed_index
        return None

    # =========================================================================
    # MAIN PROCESSING PIPELINE
    # =========================================================================

    def count_total(self) -> int:
        """Quick count of total conversations without full parse"""
        count = 0
        with open(self.CONVERSATIONS_PATH, 'rb') as f:
            for _ in ijson.items(f, 'item'):
                count += 1
        return count

    def run_phase1_extraction(self):
        """
        Execute Phase 1: Data Extraction & Normalization
        Stories 1-10
        """
        logger.info("=" * 60)
        logger.info("PHASE 1: DATA EXTRACTION & NORMALIZATION")
        logger.info("=" * 60)

        # Get total count for progress tracking
        logger.info("Counting total conversations...")
        self.stats.total_conversations = self.count_total()
        logger.info(f"Total conversations: {self.stats.total_conversations}")

        # Check for resume point
        resume_index = self.load_checkpoint()

        # Clear archive if starting fresh
        if resume_index is None and self.ARCHIVE_PATH.exists():
            self.ARCHIVE_PATH.unlink()

        # Process each conversation
        for index, raw_conv in enumerate(self.stream_conversations()):
            # Skip already processed if resuming
            if resume_index and index <= resume_index:
                continue

            try:
                # Enrich conversation with metadata
                enriched = self.enrich_conversation(raw_conv)

                # Archive for re-analysis
                self.archive_conversation(enriched)

                # Index in PostgreSQL
                self.index_conversation(enriched)

                # Update stats
                self.stats.processed_conversations += 1
                self.stats.total_messages += enriched.message_count
                self.stats.human_messages += enriched.human_message_count
                self.stats.assistant_messages += enriched.assistant_message_count
                self.stats.total_human_words += enriched.human_word_count
                self.stats.total_assistant_words += enriched.assistant_word_count

                # Checkpoint every 50 conversations
                if index % 50 == 0:
                    self.save_checkpoint(index, enriched.uuid)
                    self.stats.display()

            except Exception as e:
                logger.error(f"Error processing conversation {index}: {e}")
                self.stats.errors += 1

        # Final checkpoint
        self.save_checkpoint(self.stats.processed_conversations, "COMPLETE")
        self.stats.display()

        logger.info("Phase 1 Complete!")
        return self.stats


# =============================================================================
# CLI ENTRY POINT
# =============================================================================

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="AIVA Creator Mind Absorber")
    parser.add_argument("--phase", type=int, default=1, help="Phase to run (1-5)")
    # NOTE(review): --resume is accepted but unused — resuming is automatic
    # via the checkpoint file; the flag is kept for CLI compatibility.
    parser.add_argument("--resume", action="store_true", help="Resume from checkpoint")
    args = parser.parse_args()

    absorber = CreatorMindAbsorber()

    if args.phase == 1:
        stats = absorber.run_phase1_extraction()
        print("\nPhase 1 Complete!")
        print(f"Processed: {stats.processed_conversations} conversations")
        print(f"Messages: {stats.total_messages}")
        print(f"Human words: {stats.total_human_words:,}")
        print(f"Assistant words: {stats.total_assistant_words:,}")
    else:
        # Fix: any phase other than 1 previously exited silently with status 0.
        print(f"Phase {args.phase} is not implemented yet (only phase 1 is available)", file=sys.stderr)
        sys.exit(1)
