#!/usr/bin/env python3
"""
Genesis Session Transcript Extractor
=====================================
Extracts human/assistant dialogue from Claude Code JSONL session transcripts.
Strips tool calls, system messages, progress indicators — keeps only the gold.

Usage:
    python3 session_transcript_extractor.py                    # Process all sprint sessions
    python3 session_transcript_extractor.py --session <uuid>   # Process specific session
    python3 session_transcript_extractor.py --since 2026-01-30 # Process sessions since date
    python3 session_transcript_extractor.py --list             # List all sessions with stats
"""

import json
import os
import sys
import glob
import argparse
from datetime import datetime
from pathlib import Path

SESSIONS_DIR = "/home/authentic88/.claude/projects/-mnt-e-genesis-system"
OUTPUT_DIR = "/mnt/e/genesis-system/data/session_extracts"
CHUNK_SIZE = 45000  # ~45K chars ≈ ~12K tokens, leaves room for prompt


def extract_dialogue(jsonl_path: str) -> list[dict]:
    """Extract human/assistant dialogue from a JSONL session transcript.

    Malformed lines, blank lines, tool traffic, and system reminders are
    dropped; only substantive user/assistant text is kept.

    Args:
        jsonl_path: Path to a Claude Code session `.jsonl` file.

    Returns:
        List of dicts with 'role' ('human'/'assistant'), 'content'
        (stripped text), and 'line' (1-based source line number).
    """
    # Per-role minimum stripped length and output role label.
    min_length = {'user': 5, 'assistant': 10}
    role_name = {'user': 'human', 'assistant': 'assistant'}

    dialogue: list[dict] = []
    with open(jsonl_path, 'r', encoding='utf-8', errors='replace') as handle:
        for line_no, raw in enumerate(handle, 1):
            raw = raw.strip()
            if not raw:
                continue
            try:
                record = json.loads(raw)
            except json.JSONDecodeError:
                # Tolerate truncated/corrupt lines rather than aborting.
                continue

            kind = record.get('type', '')
            if kind not in role_name:
                continue

            text = _extract_content(record.get('message', {})).strip()
            if len(text) <= min_length[kind]:
                continue
            # System reminders sometimes masquerade as user messages.
            if kind == 'user' and text.startswith('<system-reminder>'):
                continue

            dialogue.append({
                'role': role_name[kind],
                'content': text,
                'line': line_no,
            })

    return dialogue


def _extract_content(message: dict) -> str:
    """Pull plain text out of a message object, ignoring tool-use blocks.

    Args:
        message: Message object from a transcript record; non-dict input
            yields ''.

    Returns:
        The text content: the string itself for plain-string payloads, or
        the newline-joined 'text' blocks for structured (list) payloads.
    """
    if not isinstance(message, dict):
        return ''

    body = message.get('content', '')

    # Plain-string payloads are returned verbatim.
    if isinstance(body, str):
        return body

    if not isinstance(body, list):
        return ''

    # Structured payloads: keep only 'text' blocks without an embedded
    # system reminder; tool_use / tool_result / thinking blocks are skipped.
    kept = [
        part.get('text', '')
        for part in body
        if isinstance(part, dict)
        and part.get('type') == 'text'
        and '<system-reminder>' not in part.get('text', '')
    ]
    return '\n'.join(kept)


def chunk_dialogue(messages: list[dict], chunk_size: int = CHUNK_SIZE) -> list[str]:
    """Split dialogue into chunks suitable for LLM processing.

    Messages are rendered as "[KINAN]: ..." / "[GENESIS]: ..." lines and
    packed greedily up to *chunk_size* characters. When a chunk is flushed,
    its last message is repeated at the start of the next chunk so each
    chunk keeps conversational context.

    Args:
        messages: dicts with 'role' ('human'/'assistant') and 'content'.
        chunk_size: soft per-chunk character budget.

    Returns:
        List of chunk strings ([] for empty input).
    """
    chunks = []
    current_chunk = []
    current_size = 0

    for msg in messages:
        role_label = "KINAN" if msg['role'] == 'human' else "GENESIS"
        formatted = f"[{role_label}]: {msg['content']}\n\n"
        msg_size = len(formatted)

        if current_size + msg_size > chunk_size and current_chunk:
            chunks.append(''.join(current_chunk))
            # Keep the last message for context overlap — but only when it
            # fits within a chunk by itself. Carrying an oversized message
            # forward would re-emit it as a duplicate near-full chunk and
            # bloat every following chunk.
            overlap = current_chunk[-1]
            if len(overlap) < chunk_size:
                current_chunk = [overlap]
                current_size = len(overlap)
            else:
                current_chunk = []
                current_size = 0

        current_chunk.append(formatted)
        current_size += msg_size

    if current_chunk:
        chunks.append(''.join(current_chunk))

    return chunks


def get_session_info(jsonl_path: str) -> dict:
    """Return metadata about a session file.

    Args:
        jsonl_path: Path to the session `.jsonl` file.

    Returns:
        Dict with 'uuid' (filename stem), 'path', 'size_mb' (rounded to
        one decimal), 'modified' (datetime), and 'date_str'.
    """
    session_file = Path(jsonl_path)
    file_stat = session_file.stat()
    mtime = datetime.fromtimestamp(file_stat.st_mtime)

    return {
        'uuid': session_file.stem,
        'path': jsonl_path,
        'size_mb': round(file_stat.st_size / (1024 * 1024), 1),
        'modified': mtime,
        'date_str': mtime.strftime('%Y-%m-%d %H:%M'),
    }


def list_sessions(since_date=None):
    """List all session files with stats, sorted oldest-first.

    Args:
        since_date: optional datetime; sessions modified before it are
            excluded.

    Returns:
        List of session-info dicts (see get_session_info), sorted by
        modification time ascending.
    """
    candidates = glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))

    # Agent transcripts are side-channel files, not real sessions.
    infos = (
        get_session_info(path)
        for path in candidates
        if 'agent-' not in os.path.basename(path)
    )
    selected = [
        info for info in infos
        if not since_date or info['modified'] >= since_date
    ]
    selected.sort(key=lambda info: info['modified'])
    return selected


def process_session(jsonl_path: str, output_dir: str) -> dict:
    """Process a single session: extract dialogue, chunk, save.

    Writes dialogue.json, chunk_NNN.txt files, and manifest.json into
    <output_dir>/<first 8 chars of session uuid>/.

    Args:
        jsonl_path: Path to the session `.jsonl` transcript.
        output_dir: Root directory for per-session output folders.

    Returns:
        Summary dict; 'status' is 'empty' when no dialogue was extracted,
        otherwise 'processed' with message/chunk/token stats.
    """
    info = get_session_info(jsonl_path)
    uuid = info['uuid']

    # Extract dialogue
    messages = extract_dialogue(jsonl_path)
    if not messages:
        return {'uuid': uuid, 'status': 'empty', 'messages': 0, 'chunks': 0}

    # Calculate stats
    human_msgs = sum(1 for m in messages if m['role'] == 'human')
    assistant_msgs = sum(1 for m in messages if m['role'] == 'assistant')
    total_chars = sum(len(m['content']) for m in messages)

    # Chunk for LLM processing
    chunks = chunk_dialogue(messages)

    # Save extracted dialogue
    session_dir = os.path.join(output_dir, uuid[:8])
    os.makedirs(session_dir, exist_ok=True)

    # Metadata shared by dialogue.json and manifest.json — built once so
    # the two files can never disagree.
    meta = {
        'session_uuid': uuid,
        'date': info['date_str'],
        'size_mb': info['size_mb'],
        'human_messages': human_msgs,
        'assistant_messages': assistant_msgs,
        'total_chars': total_chars,
    }

    # Save full dialogue. The transcript is read as UTF-8, so outputs are
    # written as UTF-8 explicitly — the platform default encoding (e.g.
    # cp1252 on Windows) can raise UnicodeEncodeError on dialogue text.
    dialogue_path = os.path.join(session_dir, 'dialogue.json')
    with open(dialogue_path, 'w', encoding='utf-8') as f:
        json.dump({**meta, 'messages': messages}, f, indent=2, default=str)

    # Save chunks for swarm processing
    for i, chunk in enumerate(chunks):
        chunk_path = os.path.join(session_dir, f'chunk_{i:03d}.txt')
        with open(chunk_path, 'w', encoding='utf-8') as f:
            f.write(chunk)

    # Save manifest
    manifest_path = os.path.join(session_dir, 'manifest.json')
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump({
            **meta,
            'estimated_tokens': total_chars // 4,  # rough ~4 chars/token
            'num_chunks': len(chunks),
            'chunk_size': CHUNK_SIZE,
            'chunks': [f'chunk_{i:03d}.txt' for i in range(len(chunks))],
        }, f, indent=2)

    return {
        'uuid': uuid,
        'status': 'processed',
        'messages': len(messages),
        'human': human_msgs,
        'assistant': assistant_msgs,
        'chars': total_chars,
        'tokens_est': total_chars // 4,
        'chunks': len(chunks),
        'output_dir': session_dir,
    }


def main():
    """CLI entry point: list sessions or extract/chunk their dialogue."""
    parser = argparse.ArgumentParser(description='Extract dialogue from Claude Code sessions')
    parser.add_argument('--session', help='Process specific session UUID')
    parser.add_argument('--since', help='Process sessions since date (YYYY-MM-DD)', default='2026-01-30')
    parser.add_argument('--list', action='store_true', help='List sessions only')
    parser.add_argument('--output', default=OUTPUT_DIR, help='Output directory')
    args = parser.parse_args()

    # Fail with a usage message instead of a raw traceback on a bad date.
    since_date = None
    if args.since:
        try:
            since_date = datetime.strptime(args.since, '%Y-%m-%d')
        except ValueError:
            parser.error(f'invalid --since date (expected YYYY-MM-DD): {args.since}')

    if args.list:
        sessions = list_sessions(since_date)
        print(f"\n{'UUID':>10} | {'Date':>16} | {'Size':>7} | Path")
        print('-' * 80)
        for s in sessions:
            print(f"{s['uuid'][:8]:>10} | {s['date_str']:>16} | {s['size_mb']:>5.1f}MB | {s['path']}")
        print(f"\nTotal: {len(sessions)} sessions, {sum(s['size_mb'] for s in sessions):.1f} MB")
        return

    os.makedirs(args.output, exist_ok=True)

    if args.session:
        # Process single session
        path = os.path.join(SESSIONS_DIR, f"{args.session}.jsonl")
        if not os.path.exists(path):
            print(f"Session not found: {path}")
            sys.exit(1)
        result = process_session(path, args.output)
        print(json.dumps(result, indent=2))
        return

    # Process all sprint sessions
    sessions = list_sessions(since_date)
    print(f"Processing {len(sessions)} sessions since {args.since}...")

    results = []
    total_chunks = 0
    total_tokens = 0

    for i, session in enumerate(sessions, 1):
        result = process_session(session['path'], args.output)
        results.append(result)
        total_chunks += result.get('chunks', 0)
        total_tokens += result.get('tokens_est', 0)

        status = '✓' if result['status'] == 'processed' else '○'
        print(f"  [{status}] {i}/{len(sessions)} | {session['uuid'][:8]} | "
              f"{session['date_str']} | {session['size_mb']:>5.1f}MB | "
              f"{result.get('messages', 0)} msgs → {result.get('chunks', 0)} chunks")

    # MiniMax pricing: $0.30 per 1M tokens. Format once so the manifest and
    # the console summary can never diverge.
    est_cost = f"${total_tokens * 0.3 / 1_000_000:.2f}"

    # Save global manifest
    global_manifest = {
        'timestamp': datetime.now().isoformat(),
        'sessions_processed': len(results),
        'total_chunks': total_chunks,
        'total_tokens_est': total_tokens,
        'estimated_cost_minimax': est_cost,
        'sessions': results,
    }

    # Write UTF-8 explicitly — the platform default encoding is not
    # guaranteed to handle all transcript-derived content.
    manifest_path = os.path.join(args.output, 'MANIFEST.json')
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(global_manifest, f, indent=2)

    print(f"\n{'='*60}")
    print("EXTRACTION COMPLETE")
    print(f"Sessions: {len(results)}")
    print(f"Total chunks: {total_chunks}")
    print(f"Estimated tokens: {total_tokens:,}")
    print(f"Estimated MiniMax cost: {est_cost}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*60}")


if __name__ == '__main__':
    main()
