
"""
Populate Test Data
Reads local GHL markdown files, embeds them using Config (FastEmbed), and uploads to Qdrant.
"""

import os
import glob
from typing import List
from qdrant_client.models import Distance, VectorParams, PointStruct
from config import config

# CONFIG
COLLECTION_NAME = config.qdrant.collection_name
DATA_DIR = "E:/genesis-system/knowledge-bases/ghl"

def setup_collection(client, vector_size: int):
    """(Re)create the Qdrant collection with a cosine-distance vector index.

    Args:
        client: Connected Qdrant client instance.
        vector_size: Dimensionality of the embedding vectors to store.

    Errors are printed rather than raised (best-effort setup).
    """
    # NOTE(review): recreate_collection drops any existing data and is
    # deprecated in newer qdrant-client releases — confirm the pinned version.
    vectors = VectorParams(size=vector_size, distance=Distance.COSINE)
    try:
        client.recreate_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=vectors,
        )
    except Exception as exc:
        print(f"Collection setup error: {exc}")
    else:
        print(f"Created collection '{COLLECTION_NAME}'")

def load_documents(data_dir=None, min_chunk_chars: int = 20) -> List[dict]:
    """Recursively load ``*.md`` files and split them into paragraph chunks.

    Args:
        data_dir: Root directory to scan recursively for markdown files.
            Defaults to the module-level ``DATA_DIR``.
        min_chunk_chars: Chunks shorter than this after stripping are
            dropped (defaults to the previous hard-coded 20).

    Returns:
        List of dicts with keys ``source`` (file path), ``chunk_id``
        (paragraph index within its file) and ``content`` (stripped text).
    """
    if data_dir is None:
        data_dir = DATA_DIR

    docs: List[dict] = []
    files = glob.glob(f"{data_dir}/**/*.md", recursive=True)
    print(f"Found {len(files)} documents.")

    for fpath in files:
        # Only the read can reasonably fail; the original try also swallowed
        # errors from the chunking loop below, which would hide real bugs.
        try:
            with open(fpath, "r", encoding="utf-8") as f:
                content = f.read()
        except Exception as e:
            print(f"Error reading {fpath}: {e}")
            continue

        # Naive paragraph chunking; chunk_id keeps the original paragraph
        # index even when short paragraphs are skipped.
        for i, chunk in enumerate(content.split("\n\n")):
            text = chunk.strip()
            if len(text) < min_chunk_chars:
                continue
            docs.append({
                "source": fpath,
                "chunk_id": i,
                "content": text,
            })

    return docs

def main():
    """End-to-end pipeline: embed local markdown chunks and upsert to Qdrant.

    Steps: connect, probe the embedding model for its vector size,
    (re)create the collection, then embed and upload chunks in batches.
    """
    batch_size = 20  # upsert in small batches to bound request size

    # 1. Initialize client.
    client = config.get_qdrant_client()

    # 2. Embed a sample string to discover the model's vector dimensionality.
    print("Testing embedding model (may download on first run)...")
    try:
        sample_vec = config.get_embedding("Hello Genesis")
        vec_size = len(sample_vec)
        print(f"Embedding size: {vec_size}")
    except Exception as e:
        print(f"Failed to get embedding: {e}")
        return

    # 3. Setup collection sized to the model.
    setup_collection(client, vec_size)

    # 4. Embed each chunk and upload in batches.
    docs = load_documents()
    print(f"Processing {len(docs)} chunks...")

    points = []
    total_uploaded = 0

    for i, doc in enumerate(docs):
        # Only the embedding call is expected to fail per-chunk; keeping the
        # upsert outside the try fixes the original behavior where a failed
        # upsert left `points` uncleared and silently retried growing batches.
        try:
            vec = config.get_embedding(doc["content"])
        except Exception as e:
            print(f"Error processing chunk {i}: {e}")
            continue

        points.append(PointStruct(id=i, vector=vec, payload=doc))

        if len(points) >= batch_size:
            client.upsert(collection_name=COLLECTION_NAME, points=points)
            total_uploaded += len(points)
            points = []
            print(f"Uploaded {total_uploaded}/{len(docs)}")

    # Flush the final partial batch. Bug fix: previously "Done." was printed
    # only when a partial batch remained (silent exit when len(docs) was a
    # multiple of the batch size), and the final batch was never counted.
    if points:
        client.upsert(collection_name=COLLECTION_NAME, points=points)
        total_uploaded += len(points)

    print(f"Done. Uploaded {total_uploaded}/{len(docs)} chunks.")

if __name__ == "__main__":
    main()
