#!/usr/bin/env python3
"""
Parse the raw extracted Gemini chat 8e2a6293b063b7fe into clean structured format.
Takes the HTML file and produces a clean labeled conversation.
"""

import re
import sys
from pathlib import Path
from html.parser import HTMLParser

# Identifier of the Gemini conversation being parsed (from its share URL).
CHAT_ID = "8e2a6293b063b7fe"
# Directory holding both the raw extraction artifacts and the cleaned output.
OUTPUT_DIR = "/mnt/e/genesis-system/Conversations"
# Raw HTML capture of the chat page (primary parse source).
HTML_PATH = f"{OUTPUT_DIR}/gemini_chat_{CHAT_ID}.html"
# Raw selector-based markdown extraction (fallback source; the clean
# rendering is also appended to this file at the end of main()).
OUTPUT_PATH = f"{OUTPUT_DIR}/gemini_chat_{CHAT_ID}_RAW.md"


class GeminiParser(HTMLParser):
    """Stream-parse Gemini chat HTML, collecting the visible text of each
    user-query / model-response block.

    A block is opened either by the custom element itself
    (``<user-query>`` / ``<model-response>``) or by any tag carrying the
    corresponding CSS class. Completed turns land, in document order, in
    ``self.user_turns`` and ``self.model_turns``.
    """

    def __init__(self):
        super().__init__()
        self.in_user_query = False      # currently inside a user block
        self.in_model_response = False  # currently inside a model block
        self.current_text = []          # text fragments of the open block
        self.user_turns = []            # finished user-turn strings
        self.model_turns = []           # finished model-turn strings
        # Tag name that opened the current block, and how deeply that same
        # tag is nested inside it. This lets a class-matched block such as
        # <div class="user-query">...<div>...</div>...</div> close on its
        # own </div> instead of the first nested one (the original version
        # only matched the literal custom-element names on close, so
        # class-matched blocks were never flushed at all).
        self._open_tag = None
        self._depth = 0

    def handle_starttag(self, tag, attrs):
        if self._open_tag is not None:
            # Already capturing: only count nesting of the opening tag so
            # an inner duplicate does not end the block prematurely.
            if tag == self._open_tag:
                self._depth += 1
            return

        # Match whole class names; the original substring test
        # ('user-query' in classes) also matched unrelated classes such
        # as "user-query-footer".
        classes = (dict(attrs).get('class') or '').split()

        if tag == 'user-query' or 'user-query' in classes:
            self.in_user_query = True
            self.in_model_response = False
            self.current_text = []
            self._open_tag = tag
            self._depth = 1
        elif tag == 'model-response' or 'model-response' in classes:
            self.in_model_response = True
            self.in_user_query = False
            self.current_text = []
            self._open_tag = tag
            self._depth = 1

    def handle_endtag(self, tag):
        if self._open_tag is None or tag != self._open_tag:
            return
        self._depth -= 1
        if self._depth:
            return
        # Block fully closed: flush the accumulated text to the right list.
        text = ' '.join(self.current_text).strip()
        if text:
            if self.in_user_query:
                self.user_turns.append(text)
            else:
                self.model_turns.append(text)
        self.in_user_query = False
        self.in_model_response = False
        self.current_text = []
        self._open_tag = None

    def handle_data(self, data):
        # Only keep text that appears inside a captured block; whitespace-only
        # runs between tags are dropped.
        if self.in_user_query or self.in_model_response:
            stripped = data.strip()
            if stripped:
                self.current_text.append(stripped)


def parse_from_html():
    """Parse conversation turns from the raw HTML capture.

    Returns:
        A ``(user_turns, model_turns)`` pair of lists of plain-text
        strings, or ``None`` when the HTML file does not exist.
    """
    # Stdlib entity decoder; imported locally because the module-level
    # name 'html' is not imported by this file.
    from html import unescape

    html_path = Path(HTML_PATH)
    if not html_path.exists():
        print(f"HTML file not found: {HTML_PATH}")
        return None

    # 'markup' rather than 'html' so the variable cannot shadow the module.
    with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
        markup = f.read()

    print(f"HTML size: {len(markup)} chars")

    # Each turn lives inside a custom element; DOTALL lets '.' span newlines.
    user_pattern = re.compile(
        r'<user-query[^>]*>(.*?)</user-query>',
        re.DOTALL | re.IGNORECASE
    )
    model_pattern = re.compile(
        r'<model-response[^>]*>(.*?)</model-response>',
        re.DOTALL | re.IGNORECASE
    )

    def strip_tags(html_str):
        """Remove HTML tags, decode entities/escapes, normalize whitespace."""
        # Drop script/style blocks entirely -- their text is not chat content.
        clean = re.sub(r'<script[^>]*>.*?</script>', '', html_str, flags=re.DOTALL)
        clean = re.sub(r'<style[^>]*>.*?</style>', '', clean, flags=re.DOTALL)
        # Replace remaining tags with a space so adjacent words don't fuse.
        clean = re.sub(r'<[^>]+>', ' ', clean)
        # html.unescape decodes every named/numeric entity in one pass. The
        # previous hand-rolled chain replaced '&amp;' FIRST and therefore
        # double-decoded sequences like '&amp;lt;' into '<'.
        clean = unescape(clean)
        # Preserve the old behavior of turning &nbsp; into a plain space
        # (unescape yields U+00A0).
        clean = clean.replace('\xa0', ' ')
        # Decode literal \uXXXX escapes left over from JS-embedded text.
        clean = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), clean)
        # Collapse runs of blank lines and of spaces.
        clean = re.sub(r'\n{3,}', '\n\n', clean)
        clean = re.sub(r' {2,}', ' ', clean)
        return clean.strip()

    user_turns = [strip_tags(m.group(1)) for m in user_pattern.finditer(markup)]
    model_turns = [strip_tags(m.group(1)) for m in model_pattern.finditer(markup)]

    print(f"Found {len(user_turns)} user turns, {len(model_turns)} model turns")

    return user_turns, model_turns


def parse_from_raw_md():
    """
    Parse from the raw extracted .md file which has selector-based content.
    Returns user turns and model turns as lists.
    """
    raw_path = Path(OUTPUT_PATH)
    if not raw_path.exists():
        print(f"Raw MD not found: {OUTPUT_PATH}")
        return None, None

    with open(raw_path, 'r', encoding='utf-8', errors='replace') as fh:
        raw_text = fh.read()

    def section_for(label):
        # Everything under "### [label]" up to the next section header
        # (or end of file); None when the section is absent.
        found = re.search(
            r'### \[' + label + r'\](.*?)(?=### \[|\Z)',
            raw_text,
            re.DOTALL,
        )
        return found.group(1) if found else None

    def split_turns(section):
        # Turns are delimited by "--- Turn N ---" markers; blanks dropped.
        if not section:
            return []
        pieces = re.findall(
            r'--- Turn \d+ ---\n(.*?)(?=--- Turn |\Z)',
            section,
            re.DOTALL,
        )
        return [piece.strip() for piece in pieces if piece.strip()]

    user_turns = split_turns(section_for('user-query'))
    model_turns = split_turns(section_for('model-response'))

    print(f"From raw MD: {len(user_turns)} user turns, {len(model_turns)} model turns")
    return user_turns, model_turns


def build_structured_conversation(user_turns, model_turns):
    """
    Build a clean labeled conversation from user and model turn lists.

    Interleaves the lists pairwise (user turn i, then model turn i);
    whichever list is longer contributes its trailing turns alone, and a
    '---' separator follows every pair.

    Args:
        user_turns: ordered list of user message strings.
        model_turns: ordered list of model response strings.

    Returns:
        The whole conversation as one markdown string.
    """
    # Accumulate pieces and join once at the end -- repeated `+=` on a
    # string is quadratic in the number of turns.
    parts = [
        f"# Gemini Chat Extraction: {CHAT_ID}\n",
        f"# URL: https://gemini.google.com/app/{CHAT_ID}\n",
        "# Extracted: 2026-02-20\n",
        f"# Turns: {len(user_turns)} user + {len(model_turns)} model\n\n",
        "---\n\n",
    ]

    for i in range(max(len(user_turns), len(model_turns))):
        if i < len(user_turns):
            parts.append(f"## USER TURN {i+1}\n\n")
            parts.append(user_turns[i].strip() + "\n\n")

        if i < len(model_turns):
            parts.append(f"## GEMINI TURN {i+1}\n\n")
            parts.append(model_turns[i].strip() + "\n\n")

        parts.append("---\n\n")

    return "".join(parts)


def main():
    """Drive the extraction: HTML parse first, raw-MD fallback, then save."""
    banner = "=" * 60
    print(banner)
    print(f"PARSING GEMINI CHAT: {CHAT_ID}")
    print(banner)

    # The HTML capture is the most faithful source, so try it first.
    parsed = parse_from_html()
    if parsed and parsed[0] and parsed[1]:
        user_turns, model_turns = parsed
        print(f"Using HTML parse: {len(user_turns)} user, {len(model_turns)} model turns")
    else:
        print("HTML parse incomplete, trying raw MD...")
        user_turns, model_turns = parse_from_raw_md()

    if not user_turns and not model_turns:
        print("[FAILURE] Could not extract turns from either source")
        sys.exit(1)

    clean_conversation = build_structured_conversation(user_turns, model_turns)

    # Write the standalone clean file.
    clean_path = f"{OUTPUT_DIR}/gemini_chat_{CHAT_ID}_CLEAN.md"
    with open(clean_path, 'w', encoding='utf-8') as out:
        out.write(clean_conversation)
    print(f"\n[SAVED] {clean_path}")

    # Keep a copy of the clean rendering at the end of the raw file too.
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as raw:
        raw.write(f"\n\n---\n\n# CLEAN STRUCTURED CONVERSATION (APPENDED)\n\n")
        raw.write(clean_conversation)
    print(f"[APPENDED] Clean version to {OUTPUT_PATH}")

    word_count = len(clean_conversation.split())
    print(f"\n[SUCCESS] Clean conversation: {word_count} words, {len(user_turns)} user turns, {len(model_turns)} model turns")
    print(f"Output: {clean_path}")

    print("\n--- PREVIEW (first 2000 chars) ---")
    print(clean_conversation[:2000])


if __name__ == "__main__":
    main()
