#!/usr/bin/env python3
"""
Parse the raw extracted Gemini chat 8e2a6293b063b7fe into clean structured format.
Takes the HTML file and produces a clean labeled conversation.
"""

import re
import sys
from pathlib import Path
from html.parser import HTMLParser

# Identifier of the Gemini conversation being parsed (from its share URL).
CHAT_ID = "8e2a6293b063b7fe"
# Directory holding both the raw extraction artifacts and the cleaned output.
OUTPUT_DIR = "/mnt/e/genesis-system/Conversations"
# Raw HTML capture of the chat page (primary parse source).
HTML_PATH = f"{OUTPUT_DIR}/gemini_chat_{CHAT_ID}.html"
# Raw selector-based markdown extraction (fallback source; the clean
# rendering is also appended to this file at the end of main()).
OUTPUT_PATH = f"{OUTPUT_DIR}/gemini_chat_{CHAT_ID}_RAW.md"


class GeminiParser(HTMLParser):
    """Stream-parse Gemini chat HTML, collecting the visible text of each
    user-query / model-response block.

    A block is opened either by the custom element itself
    (``<user-query>`` / ``<model-response>``) or by any tag carrying the
    corresponding CSS class. Completed turns land, in document order, in
    ``self.user_turns`` and ``self.model_turns``.
    """

    def __init__(self):
        super().__init__()
        self.in_user_query = False      # currently inside a user block
        self.in_model_response = False  # currently inside a model block
        self.current_text = []          # text fragments of the open block
        self.user_turns = []            # finished user-turn strings
        self.model_turns = []           # finished model-turn strings
        # Tag name that opened the current block, and how deeply that same
        # tag is nested inside it. This lets a class-matched block such as
        # <div class="user-query">...<div>...</div>...</div> close on its
        # own </div> instead of the first nested one (the original version
        # only matched the literal custom-element names on close, so
        # class-matched blocks were never flushed at all).
        self._open_tag = None
        self._depth = 0

    def handle_starttag(self, tag, attrs):
        if self._open_tag is not None:
            # Already capturing: only count nesting of the opening tag so
            # an inner duplicate does not end the block prematurely.
            if tag == self._open_tag:
                self._depth += 1
            return

        # Match whole class names; the original substring test
        # ('user-query' in classes) also matched unrelated classes such
        # as "user-query-footer".
        classes = (dict(attrs).get('class') or '').split()

        if tag == 'user-query' or 'user-query' in classes:
            self.in_user_query = True
            self.in_model_response = False
            self.current_text = []
            self._open_tag = tag
            self._depth = 1
        elif tag == 'model-response' or 'model-response' in classes:
            self.in_model_response = True
            self.in_user_query = False
            self.current_text = []
            self._open_tag = tag
            self._depth = 1

    def handle_endtag(self, tag):
        if self._open_tag is None or tag != self._open_tag:
            return
        self._depth -= 1
        if self._depth:
            return
        # Block fully closed: flush the accumulated text to the right list.
        text = ' '.join(self.current_text).strip()
        if text:
            if self.in_user_query:
                self.user_turns.append(text)
            else:
                self.model_turns.append(text)
        self.in_user_query = False
        self.in_model_response = False
        self.current_text = []
        self._open_tag = None

    def handle_data(self, data):
        # Only keep text that appears inside a captured block; whitespace-only
        # runs between tags are dropped.
        if self.in_user_query or self.in_model_response:
            stripped = data.strip()
            if stripped:
                self.current_text.append(stripped)


def parse_from_html():
    """Parse conversation turns from the raw HTML capture.

    Returns:
        A ``(user_turns, model_turns)`` pair of lists of plain-text
        strings, or ``None`` when the HTML file does not exist.
    """
    # Stdlib entity decoder; imported locally because the module-level
    # name 'html' is not imported by this file.
    from html import unescape

    html_path = Path(HTML_PATH)
    if not html_path.exists():
        print(f"HTML file not found: {HTML_PATH}")
        return None

    # 'markup' rather than 'html' so the variable cannot shadow the module.
    with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
        markup = f.read()

    print(f"HTML size: {len(markup)} chars")

    # Each turn lives inside a custom element; DOTALL lets '.' span newlines.
    user_pattern = re.compile(
        r'<user-query[^>]*>(.*?)</user-query>',
        re.DOTALL | re.IGNORECASE
    )
    model_pattern = re.compile(
        r'<model-response[^>]*>(.*?)</model-response>',
        re.DOTALL | re.IGNORECASE
    )

    def strip_tags(html_str):
        """Remove HTML tags, decode entities/escapes, normalize whitespace."""
        # Drop script/style blocks entirely -- their text is not chat content.
        clean = re.sub(r'<script[^>]*>.*?</script>', '', html_str, flags=re.DOTALL)
        clean = re.sub(r'<style[^>]*>.*?</style>', '', clean, flags=re.DOTALL)
        # Replace remaining tags with a space so adjacent words don't fuse.
        clean = re.sub(r'<[^>]+>', ' ', clean)
        # html.unescape decodes every named/numeric entity in one pass. The
        # previous hand-rolled chain replaced '&amp;' FIRST and therefore
        # double-decoded sequences like '&amp;lt;' into '<'.
        clean = unescape(clean)
        # Preserve the old behavior of turning &nbsp; into a plain space
        # (unescape yields U+00A0).
        clean = clean.replace('\xa0', ' ')
        # Decode literal \uXXXX escapes left over from JS-embedded text.
        clean = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), clean)
        # Collapse runs of blank lines and of spaces.
        clean = re.sub(r'\n{3,}', '\n\n', clean)
        clean = re.sub(r' {2,}', ' ', clean)
        return clean.strip()

    user_turns = [strip_tags(m.group(1)) for m in user_pattern.finditer(markup)]
    model_turns = [strip_tags(m.group(1)) for m in model_pattern.finditer(markup)]

    print(f"Found {len(user_turns)} user turns, {len(model_turns)} model turns")

    return user_turns, model_turns


def parse_from_raw_md():
    """
    Parse from the raw extracted .md file which has selector-based content.
    Returns user turns and model turns as lists.
    """
    raw_path = Path(OUTPUT_PATH)
    if not raw_path.exists():
        print(f"Raw MD not found: {OUTPUT_PATH}")
        return None, None

    with open(raw_path, 'r', encoding='utf-8', errors='replace') as fh:
        raw_text = fh.read()

    def section_for(label):
        # Everything under "### [label]" up to the next section header
        # (or end of file); None when the section is absent.
        found = re.search(
            r'### \[' + label + r'\](.*?)(?=### \[|\Z)',
            raw_text,
            re.DOTALL,
        )
        return found.group(1) if found else None

    def split_turns(section):
        # Turns are delimited by "--- Turn N ---" markers; blanks dropped.
        if not section:
            return []
        pieces = re.findall(
            r'--- Turn \d+ ---\n(.*?)(?=--- Turn |\Z)',
            section,
            re.DOTALL,
        )
        return [piece.strip() for piece in pieces if piece.strip()]

    user_turns = split_turns(section_for('user-query'))
    model_turns = split_turns(section_for('model-response'))

    print(f"From raw MD: {len(user_turns)} user turns, {len(model_turns)} model turns")
    return user_turns, model_turns


def build_structured_conversation(user_turns, model_turns):
    """
    Build a clean labeled conversation from user and model turn lists.

    Interleaves the lists pairwise (user turn i, then model turn i);
    whichever list is longer contributes its trailing turns alone, and a
    '---' separator follows every pair.

    Args:
        user_turns: ordered list of user message strings.
        model_turns: ordered list of model response strings.

    Returns:
        The whole conversation as one markdown string.
    """
    # Accumulate pieces and join once at the end -- repeated `+=` on a
    # string is quadratic in the number of turns.
    parts = [
        f"# Gemini Chat Extraction: {CHAT_ID}\n",
        f"# URL: https://gemini.google.com/app/{CHAT_ID}\n",
        "# Extracted: 2026-02-20\n",
        f"# Turns: {len(user_turns)} user + {len(model_turns)} model\n\n",
        "---\n\n",
    ]

    for i in range(max(len(user_turns), len(model_turns))):
        if i < len(user_turns):
            parts.append(f"## USER TURN {i+1}\n\n")
            parts.append(user_turns[i].strip() + "\n\n")

        if i < len(model_turns):
            parts.append(f"## GEMINI TURN {i+1}\n\n")
            parts.append(model_turns[i].strip() + "\n\n")

        parts.append("---\n\n")

    return "".join(parts)


def main():
    """Drive the extraction: HTML parse first, raw-MD fallback, then save."""
    banner = "=" * 60
    print(banner)
    print(f"PARSING GEMINI CHAT: {CHAT_ID}")
    print(banner)

    # The HTML capture is the most faithful source, so try it first.
    parsed = parse_from_html()
    if parsed and parsed[0] and parsed[1]:
        user_turns, model_turns = parsed
        print(f"Using HTML parse: {len(user_turns)} user, {len(model_turns)} model turns")
    else:
        print("HTML parse incomplete, trying raw MD...")
        user_turns, model_turns = parse_from_raw_md()

    if not user_turns and not model_turns:
        print("[FAILURE] Could not extract turns from either source")
        sys.exit(1)

    clean_conversation = build_structured_conversation(user_turns, model_turns)

    # Write the standalone clean file.
    clean_path = f"{OUTPUT_DIR}/gemini_chat_{CHAT_ID}_CLEAN.md"
    with open(clean_path, 'w', encoding='utf-8') as out:
        out.write(clean_conversation)
    print(f"\n[SAVED] {clean_path}")

    # Keep a copy of the clean rendering at the end of the raw file too.
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as raw:
        raw.write(f"\n\n---\n\n# CLEAN STRUCTURED CONVERSATION (APPENDED)\n\n")
        raw.write(clean_conversation)
    print(f"[APPENDED] Clean version to {OUTPUT_PATH}")

    word_count = len(clean_conversation.split())
    print(f"\n[SUCCESS] Clean conversation: {word_count} words, {len(user_turns)} user turns, {len(model_turns)} model turns")
    print(f"Output: {clean_path}")

    print("\n--- PREVIEW (first 2000 chars) ---")
    print(clean_conversation[:2000])


if __name__ == "__main__":
    main()
