#!/usr/bin/env python3
"""
Extract ReceptionistAI swarm artifacts from JSONL results.

Filters for R-* story IDs, extracts code/content, and organizes into
RECEPTIONISTAI/swarm_generated/ directory structure.
"""

import json
import re
import os
from pathlib import Path
from typing import Dict, List, Tuple

# Base directories
BASE_DIR = Path("/mnt/e/genesis-system")
# Raw JSONL swarm output (kimi_results.jsonl / minimax_results.jsonl) lives here.
SWARM_RESULTS = BASE_DIR / "hive/swarm_results"
# All extracted artifacts are written beneath this root, one subdir per category.
OUTPUT_BASE = BASE_DIR / "RECEPTIONISTAI/swarm_generated"

# Story categorization rules: regex (matched case-insensitively against the
# story title) -> output subdirectory name.
# NOTE: dict insertion order is significant — categorize_story() returns the
# FIRST pattern that matches, so broader patterns must stay lower in the dict.
CATEGORY_MAPPING = {
    # Server/API code
    r'(API|Server|Backend|Endpoint|Route|Database|Schema|Migration|Auth|Middleware)': 'server',
    # Frontend/Widget code
    r'(Widget|Frontend|UI|Component|Dashboard|Admin|Interface)': 'widget',
    # Tests
    r'(Test|E2E|Integration|Unit|Validation)': 'tests',
    # Marketing/Content
    r'(Marketing|Email|Landing|Copy|Sequence|Campaign|Content|FAQ|Pricing)': 'marketing',
    # Documentation
    r'(README|Documentation|Guide|Tutorial|Setup)': 'docs',
    # Config/Deploy
    r'(Deploy|Config|Docker|Nginx|Environment|CI/CD)': 'deploy',
}

def parse_jsonl(filepath: Path) -> List[Dict]:
    """Parse a JSONL file and return a list of story dicts.

    Blank lines are skipped silently (common at the end of JSONL dumps);
    malformed lines are reported and skipped so one bad record does not
    abort the whole run.
    """
    stories: List[Dict] = []
    # Explicit UTF-8: swarm output contains emoji/non-ASCII text and must
    # not depend on the platform's locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line, not a parse error — skip without warning.
                continue
            try:
                stories.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"⚠️  JSON parse error: {e}")
                continue
    return stories

def categorize_story(title: str) -> str:
    """Map a story title to an output category using CATEGORY_MAPPING.

    Patterns are tried in dict order, case-insensitively; the first match
    wins. Titles matching no pattern fall back to 'misc'.
    """
    return next(
        (
            category
            for pattern, category in CATEGORY_MAPPING.items()
            if re.search(pattern, title, re.IGNORECASE)
        ),
        'misc',
    )

def extract_code_blocks(response: str) -> List[Tuple[str, str, str]]:
    """
    Extract fenced code blocks from a markdown response.

    Returns a list of (language, filename_hint, code) tuples. filename_hint
    is '' when the fence line carries no hint; both hint and code are
    stripped of surrounding whitespace.
    """
    # Pattern: ```language [optional filename hint on the SAME line]\n code \n```
    # The hint separator must be spaces/tabs only ([ \t]) — using \s here
    # would also match '\n', which made the previous pattern swallow the
    # first line of the code body as a bogus "filename hint".
    pattern = r'```(\w+)(?:[ \t]+[:/]?[ \t]*([^\n]+))?\n(.*?)```'
    matches = re.findall(pattern, response, re.DOTALL)

    return [(lang, hint.strip(), code.strip()) for lang, hint, code in matches]

def extract_filename_from_content(code: str, language: str) -> str:
    """Infer a filename from the code body, else synthesize 'untitled<ext>'.

    First looks for a "File: ..." marker written as a hash, slash, or HTML
    comment; failing that, picks an extension from the fence language
    (unknown languages get '.txt').
    """
    # "File: xyz" marker comments in the three styles the swarm emits.
    marker_patterns = (
        r'#\s*File:\s*([^\n]+)',
        r'//\s*File:\s*([^\n]+)',
        r'<!--\s*File:\s*([^\n]+)\s*-->',
    )
    for marker in marker_patterns:
        hit = re.search(marker, code, re.IGNORECASE)
        if hit:
            return hit.group(1).strip()

    # No marker: fall back to a language-derived extension.
    # NOTE: 'dockerfile' deliberately maps to a full name, producing
    # 'untitledDockerfile' here; process_story later replaces the
    # 'untitled' prefix with the story id.
    extension_by_language = {
        'python': '.py',
        'javascript': '.js',
        'typescript': '.ts',
        'html': '.html',
        'css': '.css',
        'json': '.json',
        'yaml': '.yml',
        'dockerfile': 'Dockerfile',
        'nginx': '.conf',
        'bash': '.sh',
        'sql': '.sql',
    }
    suffix = extension_by_language.get(language.lower(), '.txt')
    return 'untitled' + suffix

def determine_output_path(story_id: str, title: str, filename: str, category: str) -> Path:
    """Build the output path: OUTPUT_BASE / category / sanitized filename.

    story_id and title are currently unused; they are kept in the signature
    for interface stability with existing callers.
    """
    # Sanitize ':' and '/' so a hint like "server: app.py" or a path-shaped
    # filename cannot escape the category directory.
    sanitized = filename.replace(':', '_').replace('/', '_')
    return OUTPUT_BASE / category / sanitized

def process_story(story: Dict, index_entries: List[str]) -> int:
    """Extract all artifacts from a single story dict.

    Writes files under OUTPUT_BASE/<category>/, appends human-readable
    index lines to index_entries (mutated in place), and returns the number
    of files created. Raises KeyError if the story dict lacks the expected
    'story_id' / 'story_title' / 'response' keys (caller catches broadly).
    """
    story_id = story['story_id']
    title = story['story_title']
    response = story['response']

    # Some swarm runs emit stories with no response body at all.
    if not response or response.strip() == '':
        print(f"⚠️  {story_id}: Empty response, skipping")
        index_entries.append(f"- **{story_id}**: {title} - ⚠️ EMPTY RESPONSE")
        return 0

    category = categorize_story(title)
    code_blocks = extract_code_blocks(response)

    if not code_blocks:
        # No fenced code: preserve the whole response as a markdown document.
        category_dir = OUTPUT_BASE / category
        category_dir.mkdir(parents=True, exist_ok=True)

        output_path = category_dir / f"{story_id}.md"
        # Explicit UTF-8: responses contain emoji/non-ASCII and must not
        # depend on the platform's locale encoding.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(f"# {title}\n\n")
            f.write(f"**Story ID**: {story_id}\n\n")
            f.write(response)

        rel_path = output_path.relative_to(OUTPUT_BASE)
        index_entries.append(f"- **{story_id}**: {title} → `{rel_path}` (markdown)")
        print(f"✅ {story_id}: Saved markdown to {rel_path}")
        return 1

    # One output file per fenced code block.
    files_created = 0
    for i, (lang, filename_hint, code) in enumerate(code_blocks):
        # Prefer the fence-line hint; otherwise infer from the code body.
        if filename_hint:
            filename = filename_hint
        else:
            filename = extract_filename_from_content(code, lang)
            if filename.startswith('untitled'):
                # len('untitled') == 8: keep the extension, swap in the
                # story id plus a 1-based block index for uniqueness.
                filename = f"{story_id}_{i+1}{filename[8:]}"

        output_path = determine_output_path(story_id, title, filename, category)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(code)

        rel_path = output_path.relative_to(OUTPUT_BASE)
        index_entries.append(f"  - `{rel_path}` ({lang})")
        files_created += 1

    # Insert the story header BEFORE the per-file entries appended above
    # (a negative index counts back past the files_created entries).
    index_entries.insert(-files_created, f"- **{story_id}**: {title}")
    print(f"✅ {story_id}: Extracted {files_created} file(s) → {category}/")

    return files_created

def main():
    """Run the full extraction: parse JSONL, filter R-* stories, write files + INDEX.md."""
    print("🔍 ReceptionistAI Swarm Artifact Extractor")
    print("=" * 60)

    # Create output directory
    OUTPUT_BASE.mkdir(parents=True, exist_ok=True)

    # Parse both JSONL files (Kimi produced code stories, MiniMax content).
    kimi_stories = parse_jsonl(SWARM_RESULTS / "kimi_results.jsonl")
    minimax_stories = parse_jsonl(SWARM_RESULTS / "minimax_results.jsonl")

    all_stories = kimi_stories + minimax_stories

    # Filter for ReceptionistAI stories (IDs prefixed "R-"). Use .get() so a
    # malformed record missing 'story_id' is skipped instead of crashing.
    r_stories = [s for s in all_stories if s.get('story_id', '').startswith('R-')]

    kimi_count = sum(1 for s in kimi_stories if s.get('story_id', '').startswith('R-'))
    minimax_count = sum(1 for s in minimax_stories if s.get('story_id', '').startswith('R-'))
    print(f"\n📊 Found {len(r_stories)} ReceptionistAI stories")
    print(f"   - {kimi_count} from Kimi (code)")
    print(f"   - {minimax_count} from MiniMax (content)")
    print()

    # INDEX.md preamble; process_story() appends per-story entries below it.
    index_entries = [
        "# ReceptionistAI Swarm Generated Artifacts",
        "",
        f"**Generated**: {len(r_stories)} stories extracted from overnight swarm",
        "**Source**: `/mnt/e/genesis-system/hive/swarm_results/`",
        "",
        "## Artifacts by Story",
        "",
    ]

    total_files = 0
    success_count = 0

    for story in sorted(r_stories, key=lambda x: x['story_id']):
        try:
            files_created = process_story(story, index_entries)
            total_files += files_created
            if files_created > 0:
                success_count += 1
        except Exception as e:
            # Keep going: one broken story must not abort the whole batch.
            print(f"❌ {story['story_id']}: ERROR - {e}")
            index_entries.append(f"- **{story['story_id']}**: {story['story_title']} - ❌ ERROR: {e}")

    # Write index (explicit UTF-8 — entries contain emoji).
    index_path = OUTPUT_BASE / "INDEX.md"
    with open(index_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(index_entries))

    print()
    print("=" * 60)
    print("✅ EXTRACTION COMPLETE")
    print(f"   - {success_count}/{len(r_stories)} stories processed successfully")
    print(f"   - {total_files} files created")
    print(f"   - Index written to: {index_path.relative_to(BASE_DIR)}")
    print()
    print(f"📁 All artifacts in: {OUTPUT_BASE.relative_to(BASE_DIR)}/")

# Script entry point — importing this module performs no extraction.
if __name__ == '__main__':
    main()
