#!/usr/bin/env python3
"""
Extract Sunaiva Memory Vault swarm artifacts from JSONL results.
Filters S-* stories and places artifacts in appropriate directories.
"""

import json
import os
import re
from datetime import date
from pathlib import Path
from typing import Dict, List, Tuple

# Base paths
BASE_DIR = Path("/mnt/e/genesis-system")
SUNAIVA_DIR = BASE_DIR / "Sunaiva" / "ai-memory"
SWARM_GEN_DIR = SUNAIVA_DIR / "swarm_generated"
KIMI_RESULTS = BASE_DIR / "hive" / "swarm_results" / "kimi_results.jsonl"
MINIMAX_RESULTS = BASE_DIR / "hive" / "swarm_results" / "minimax_results.jsonl"


def extract_code_blocks(text: str) -> List[Tuple[str, str]]:
    """Pull fenced code blocks out of markdown text.

    Returns a list of (language, code) pairs in document order; fences
    without a language tag are labelled 'txt', and each code body is
    stripped of surrounding whitespace.
    """
    fence = re.compile(r'```(\w+)?\n(.*?)```', re.DOTALL)
    blocks: List[Tuple[str, str]] = []
    for lang, body in fence.findall(text):
        blocks.append((lang if lang else 'txt', body.strip()))
    return blocks


def determine_file_path(story_id: str, title: str, content: str) -> Tuple[Path, str]:
    """Choose a target directory for a story based on keywords in its title.

    Returns (directory, base_name) where base_name is the story id.  The
    ``content`` argument is unused today but kept for interface stability.
    """
    lowered = title.lower()

    # Test artifacts are routed first so e.g. "API integration tests"
    # lands under tests/ rather than server/.
    if 'test' in lowered:
        tests_dir = SWARM_GEN_DIR / "tests"
        if 'e2e' in lowered or 'integration' in lowered:
            tests_dir = tests_dir / "e2e"
        return tests_dir, story_id

    # Keyword -> subdirectory routing table, checked in priority order
    # (first match wins, mirroring the original if-chain ordering).
    routing = [
        (('email', 'marketing', 'landing', 'copy', 'onboarding'),
         SWARM_GEN_DIR / "marketing"),
        (('api', 'endpoint', 'route', 'fastapi'),
         SWARM_GEN_DIR / "server"),
        (('auth', 'jwt', 'session', 'security'),
         SWARM_GEN_DIR / "server" / "auth"),
        (('billing', 'stripe', 'payment', 'subscription'),
         SWARM_GEN_DIR / "server" / "billing"),
        (('database', 'model', 'schema', 'migration'),
         SWARM_GEN_DIR / "server" / "db"),
        (('mcp',),
         SWARM_GEN_DIR / "mcp"),
        (('frontend', 'ui', 'html', 'css', 'dashboard'),
         SWARM_GEN_DIR / "frontend"),
        (('config', 'env', 'docker', 'deploy'),
         SWARM_GEN_DIR / "config"),
    ]
    for keywords, directory in routing:
        if any(kw in lowered for kw in keywords):
            return directory, story_id

    # Nothing matched: place the artifact at the root of swarm_generated.
    return SWARM_GEN_DIR, story_id


def extract_filename_from_content(content: str, story_id: str) -> str:
    """Derive a filename for an artifact, falling back to the story id.

    Preference order: an explicit "file:/path:/save as:" reference in the
    content, then the first Python class or function name found, and
    finally a slugged story id with a .txt extension.
    """
    # Explicit file reference, e.g. "file: src/auth/jwt_utils.py" --
    # only the basename is kept.
    explicit = re.search(
        r'(?:file|path|save as):\s*([a-zA-Z0-9_\-./]+\.[a-z]{2,4})',
        content,
        re.IGNORECASE,
    )
    if explicit:
        return explicit.group(1).split('/')[-1]

    # Python source: name the file after the first class (preferred) or
    # the first function definition.
    if 'class ' in content or 'def ' in content:
        cls = re.search(r'class\s+([A-Z][a-zA-Z0-9]*)', content)
        if cls:
            return f"{cls.group(1).lower()}.py"

        fn = re.search(r'def\s+([a-z_][a-z0-9_]*)', content)
        if fn:
            return f"{fn.group(1)}.py"

    # Fallback: slug the story id, e.g. "S-012" -> "s_012.txt".
    return f"{story_id.lower().replace('-', '_')}.txt"


def process_jsonl_file(filepath: Path, source_name: str) -> List[Dict]:
    """Read a JSONL results file and collect Sunaiva (S-*) stories.

    Each returned dict has keys: story_id, title, response, source.
    Malformed lines are reported and skipped, blank lines are ignored,
    and a missing file yields an empty list instead of crashing.
    """
    artifacts: List[Dict] = []

    print(f"\nProcessing {source_name}...")

    # A missing results file is a soft failure: report it and return so
    # the other source can still be processed.
    if not filepath.exists():
        print(f"  ⚠️  File not found: {filepath}")
        return artifacts

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline at EOF) instead of
            # reporting them as spurious JSON decode errors.
            if not line:
                continue

            try:
                data = json.loads(line)
                story_id = data.get('story_id', '')

                # Only Sunaiva stories (IDs prefixed "S-") are extracted.
                if not story_id.startswith('S-'):
                    continue

                title = data.get('title', 'Untitled')
                response = data.get('response', '')

                if not response:
                    print(f"  ⚠️  {story_id}: No response content, skipping")
                    continue

                artifacts.append({
                    'story_id': story_id,
                    'title': title,
                    'response': response,
                    'source': source_name
                })

                print(f"  ✓ {story_id}: {title}")

            except json.JSONDecodeError as e:
                print(f"  ⚠️  Line {line_num}: JSON decode error: {e}")
            except Exception as e:
                # Best-effort extraction: log and keep going so one bad
                # record does not abort the whole run.
                print(f"  ⚠️  Line {line_num}: Error: {e}")

    return artifacts


def write_artifacts(artifacts: List[Dict]) -> List[Dict]:
    """Write each artifact to disk and return index metadata entries.

    Responses containing fenced code blocks are split into one source file
    per block (extension derived from the fence language tag); responses
    without code are written as a single markdown document with a small
    metadata header.
    """
    index_entries = []

    # Map fence language tags to file extensions.  Markdown fences usually
    # carry full language names ("python", "javascript", "bash"), so both
    # short and long forms are recognized; unknown tags fall back to .txt.
    # (The previous allowlist only matched short forms, so real-world
    # fences like ```python were all written out as .txt.)
    lang_to_ext = {
        'py': 'py', 'python': 'py',
        'js': 'js', 'javascript': 'js',
        'ts': 'ts', 'typescript': 'ts',
        'html': 'html',
        'css': 'css',
        'json': 'json',
        'yaml': 'yaml', 'yml': 'yaml',
        'sh': 'sh', 'bash': 'sh', 'shell': 'sh',
    }

    print(f"\nWriting {len(artifacts)} artifacts...")

    for artifact in artifacts:
        story_id = artifact['story_id']
        title = artifact['title']
        response = artifact['response']
        source = artifact['source']

        # Route the artifact to a subdirectory based on its title keywords.
        target_dir, base_name = determine_file_path(story_id, title, response)
        target_dir.mkdir(parents=True, exist_ok=True)

        code_blocks = extract_code_blocks(response)

        if code_blocks:
            # One file per code block; suffix with a 1-based index when
            # there are several so they don't overwrite each other.
            for idx, (lang, code) in enumerate(code_blocks):
                ext = lang_to_ext.get(lang.lower(), 'txt')
                if len(code_blocks) == 1:
                    filename = f"{base_name}.{ext}"
                else:
                    filename = f"{base_name}_{idx+1}.{ext}"

                filepath = target_dir / filename

                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(code)

                index_entries.append({
                    'story_id': story_id,
                    'title': title,
                    'source': source,
                    'file': str(filepath.relative_to(SUNAIVA_DIR)),
                    'type': 'code',
                    'language': lang
                })

                print(f"  ✓ {story_id} → {filepath.relative_to(SUNAIVA_DIR)}")
        else:
            # No code blocks: save the full response as markdown with a
            # metadata header identifying the story and source model.
            filename = extract_filename_from_content(response, story_id)
            if not filename.endswith('.md'):
                filename = f"{base_name}.md"

            filepath = target_dir / filename

            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(f"# {title}\n\n")
                f.write(f"**Story ID**: {story_id}\n")
                f.write(f"**Source**: {source}\n\n")
                f.write("---\n\n")
                f.write(response)

            index_entries.append({
                'story_id': story_id,
                'title': title,
                'source': source,
                'file': str(filepath.relative_to(SUNAIVA_DIR)),
                'type': 'content',
                'language': 'markdown'
            })

            print(f"  ✓ {story_id} → {filepath.relative_to(SUNAIVA_DIR)}")

    return index_entries


def write_index(entries: List[Dict]):
    """Write INDEX.md summarizing all extracted artifacts.

    The index contains per-source and per-type counts, a per-directory
    grouping, and a complete table of every extracted file.
    """
    SWARM_GEN_DIR.mkdir(parents=True, exist_ok=True)
    index_path = SWARM_GEN_DIR / "INDEX.md"

    # Group entries by their parent directory for the per-directory section.
    by_dir: Dict[str, List[Dict]] = {}
    for entry in entries:
        dir_name = str(Path(entry['file']).parent)
        by_dir.setdefault(dir_name, []).append(entry)

    with open(index_path, 'w', encoding='utf-8') as f:
        f.write("# Sunaiva Memory Vault - Swarm Generated Artifacts\n\n")
        f.write(f"**Total Artifacts**: {len(entries)}\n")
        # Stamp with the actual run date (was previously hard-coded).
        f.write(f"**Generated**: {date.today().isoformat()}\n\n")
        f.write("---\n\n")

        # Summary by source model
        kimi_count = sum(1 for e in entries if e['source'] == 'kimi')
        minimax_count = sum(1 for e in entries if e['source'] == 'minimax')
        f.write("## Source Breakdown\n\n")
        f.write(f"- **Kimi (code stories)**: {kimi_count} artifacts\n")
        f.write(f"- **MiniMax (content stories)**: {minimax_count} artifacts\n\n")

        # Summary by artifact type
        code_count = sum(1 for e in entries if e['type'] == 'code')
        content_count = sum(1 for e in entries if e['type'] == 'content')
        f.write("## Type Breakdown\n\n")
        f.write(f"- **Code**: {code_count} files\n")
        f.write(f"- **Content**: {content_count} files\n\n")

        # Artifacts grouped by target directory, sorted for stable output
        f.write("## Artifacts by Directory\n\n")
        for dir_name in sorted(by_dir.keys()):
            f.write(f"### {dir_name}\n\n")
            for entry in sorted(by_dir[dir_name], key=lambda x: x['story_id']):
                f.write(f"- **{entry['story_id']}**: {entry['title']}\n")
                f.write(f"  - File: `{entry['file']}`\n")
                f.write(f"  - Type: {entry['type']} ({entry['language']})\n")
                f.write(f"  - Source: {entry['source']}\n")
                f.write("\n")

        # Flat table of every artifact
        f.write("## Complete Listing\n\n")
        f.write("| Story ID | Title | File | Type | Language | Source |\n")
        f.write("|----------|-------|------|------|----------|--------|\n")
        for entry in sorted(entries, key=lambda x: x['story_id']):
            f.write(f"| {entry['story_id']} | {entry['title']} | `{entry['file']}` | {entry['type']} | {entry['language']} | {entry['source']} |\n")

    print(f"\n✓ Index written to {index_path.relative_to(SUNAIVA_DIR)}")


def main():
    """Run the full extraction: read both JSONL sources, write artifacts, build the index."""
    banner = "=" * 80
    print(banner)
    print("SUNAIVA MEMORY VAULT - SWARM ARTIFACT EXTRACTION")
    print(banner)

    # Collect S-* stories from each swarm result file.
    kimi_artifacts = process_jsonl_file(KIMI_RESULTS, 'kimi')
    minimax_artifacts = process_jsonl_file(MINIMAX_RESULTS, 'minimax')

    all_artifacts = kimi_artifacts + minimax_artifacts
    if not all_artifacts:
        print("\n⚠️  No Sunaiva (S-*) artifacts found in swarm results.")
        return

    # Materialize artifacts on disk, then index what was written.
    index_entries = write_artifacts(all_artifacts)
    write_index(index_entries)

    # Final summary report.
    print("\n" + banner)
    print("EXTRACTION COMPLETE")
    print(banner)
    print(f"Total artifacts extracted: {len(index_entries)}")
    print(f"  - From Kimi (code): {len(kimi_artifacts)}")
    print(f"  - From MiniMax (content): {len(minimax_artifacts)}")
    print(f"\nAll artifacts placed in: {SWARM_GEN_DIR.relative_to(BASE_DIR)}")
    print(f"Index file: {(SWARM_GEN_DIR / 'INDEX.md').relative_to(BASE_DIR)}")
    print(banner)


if __name__ == "__main__":
    main()
