#!/usr/bin/env python3
"""
Genesis Bloodstream Loader
============================
Loads ALL extracted knowledge (Deep Think knowledge + session axioms +
Research reports) into the Genesis Bloodstream infrastructure:

  1. PostgreSQL (Elestio) — bloodstream_knowledge table
  2. Qdrant (Elestio) — semantic vector embeddings (genesis_memories collection)
  3. Redis — top 50 highest-confidence items cached as hot knowledge

Sources:
  dt       — Deep Think JSONL knowledge from data/dt_knowledge/
  sessions — Session axiom JSONL files from data/session_axioms/
  reports  — Research reports MD/TXT/JSONL from "Research reports/" directory

Usage:
    python3 bloodstream_loader.py                        # Load everything (dt + sessions + reports)
    python3 bloodstream_loader.py --dry-run              # Show stats without loading
    python3 bloodstream_loader.py --source dt             # Only DT knowledge
    python3 bloodstream_loader.py --source sessions       # Only session axioms
    python3 bloodstream_loader.py --source reports        # Only Research reports → Qdrant
    python3 bloodstream_loader.py --reset                 # Drop and recreate table
    python3 bloodstream_loader.py --skip-redis            # Skip Redis hot cache
    python3 bloodstream_loader.py --skip-qdrant           # Skip Qdrant upsert for reports
"""

import hashlib
import json
import os
import sys
import argparse
import time
from pathlib import Path
from datetime import datetime

# ── Paths ────────────────────────────────────────────────────────────────────
# Input directories produced by the upstream extraction scripts
# (dt_knowledge_extractor.py and session_ingestion_swarm.py).
DT_KNOWLEDGE_DIR = "/mnt/e/genesis-system/data/dt_knowledge"
SESSION_AXIOMS_DIR = "/mnt/e/genesis-system/data/session_axioms"
# Local JSONL written by save_fallback() when PostgreSQL is unreachable.
FALLBACK_OUTPUT = "/mnt/e/genesis-system/data/bloodstream_fallback.jsonl"

# Research reports — Windows path, also accept /mnt/e/ variant.
# The script may run under native Windows or WSL; the first existing path wins.
_REPORTS_CANDIDATES = [
    r"E:\genesis-system\Research reports",
    "/mnt/e/genesis-system/Research reports",
    r"E:/genesis-system/Research reports",
]
RESEARCH_REPORTS_DIR = next(
    (p for p in _REPORTS_CANDIDATES if Path(p).exists()),
    _REPORTS_CANDIDATES[0],  # fallback; collect_research_reports() warns if missing
)

# Qdrant collection for semantic research report vectors
RESEARCH_REPORTS_QDRANT_COLLECTION = "genesis_memories"
RESEARCH_REPORTS_VECTOR_DIM = 384   # all-MiniLM-L6-v2 output dimensionality

# Chunk size for large markdown reports (chars)
REPORT_CHUNK_SIZE = 1500
REPORT_CHUNK_OVERLAP = 200

# Add elestio config dir to path so `from elestio_config import ...` resolves.
# NOTE: import-time side effect; elestio_config.py lives in genesis-memory.
sys.path.insert(0, "/mnt/e/genesis-system/data/genesis-memory")

# ── Table schema ─────────────────────────────────────────────────────────────
# Idempotent DDL (IF NOT EXISTS) so re-running the loader is always safe.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS bloodstream_knowledge (
    id SERIAL PRIMARY KEY,
    source TEXT NOT NULL,
    type TEXT NOT NULL,
    title TEXT NOT NULL,
    content TEXT NOT NULL,
    tags TEXT[] DEFAULT '{}',
    confidence FLOAT DEFAULT 0.8,
    embedding_id TEXT,
    embedding_text TEXT,
    metadata JSONB DEFAULT '{}',
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for fast querying
CREATE INDEX IF NOT EXISTS idx_bk_source ON bloodstream_knowledge(source);
CREATE INDEX IF NOT EXISTS idx_bk_type ON bloodstream_knowledge(type);
CREATE INDEX IF NOT EXISTS idx_bk_confidence ON bloodstream_knowledge(confidence DESC);
CREATE INDEX IF NOT EXISTS idx_bk_tags ON bloodstream_knowledge USING GIN(tags);
CREATE INDEX IF NOT EXISTS idx_bk_content_search ON bloodstream_knowledge USING GIN(to_tsvector('english', content));
CREATE INDEX IF NOT EXISTS idx_bk_title_search ON bloodstream_knowledge USING GIN(to_tsvector('english', title));
"""

# Used only with --reset; CASCADE also drops dependent objects.
DROP_TABLE_SQL = """
DROP TABLE IF EXISTS bloodstream_knowledge CASCADE;
"""


def collect_dt_knowledge() -> list[dict]:
    """Gather all Deep Think knowledge records from *_knowledge.jsonl files.

    Reads every JSONL file under DT_KNOWLEDGE_DIR, normalizes each record
    to the bloodstream schema, and returns the combined list. Malformed
    lines are reported and skipped.
    """
    collected: list[dict] = []
    root = Path(DT_KNOWLEDGE_DIR)

    if not root.exists():
        print(f"  WARNING: DT knowledge dir not found: {DT_KNOWLEDGE_DIR}")
        print("  Run dt_knowledge_extractor.py first.")
        return collected

    # Keys mapped to top-level schema fields; anything else lands in metadata.
    schema_keys = ("source", "type", "title", "content",
                   "insight", "tags", "confidence")

    for path in sorted(root.glob("*_knowledge.jsonl")):
        with open(path, 'r', encoding='utf-8') as fh:
            for idx, raw in enumerate(fh, 1):
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    record = json.loads(raw)
                    extras = {k: v for k, v in record.items()
                              if k not in schema_keys}
                    collected.append({
                        "source": record.get("source", path.stem),
                        "type": record.get("type", "UNKNOWN"),
                        "title": record.get("title", "Untitled"),
                        "content": record.get("content", record.get("insight", "")),
                        "tags": record.get("tags", []),
                        "confidence": float(record.get("confidence", 0.8)),
                        "metadata": extras,
                    })
                except (json.JSONDecodeError, ValueError) as e:
                    print(f"    WARN: Bad JSON in {path.name}:{idx} — {e}")

    return collected


def collect_session_axioms() -> list[dict]:
    """Collect all session axiom JSONL files.

    Walks each session directory under SESSION_AXIOMS_DIR and normalizes
    every JSONL record to the bloodstream schema. The session directory
    name becomes part of the item's source (``session_<id>``).

    Returns:
        List of normalized knowledge dicts (possibly empty).
    """
    items = []
    axioms_path = Path(SESSION_AXIOMS_DIR)

    if not axioms_path.exists():
        print(f"  WARNING: Session axioms dir not found: {SESSION_AXIOMS_DIR}")
        return items

    for session_dir in sorted(axioms_path.iterdir()):
        if not session_dir.is_dir():
            continue

        session_id = session_dir.name

        for jsonl_file in sorted(session_dir.glob("*.jsonl")):
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        item = json.loads(line)
                        # Normalize to bloodstream schema; session axioms use
                        # "insight" as the primary content field.
                        items.append({
                            "source": f"session_{session_id}",
                            "type": item.get("type", "STRATEGIC_INSIGHT"),
                            "title": item.get("title", "Untitled"),
                            "content": item.get("insight", item.get("content", "")),
                            "tags": item.get("tags", []),
                            "confidence": float(item.get("confidence", 0.8)),
                            "metadata": {
                                k: v for k, v in item.items()
                                if k not in ("type", "title", "insight", "content",
                                             "tags", "confidence")
                            },
                        })
                    except (json.JSONDecodeError, ValueError) as e:
                        # Warn instead of silently dropping the line —
                        # consistent with collect_dt_knowledge() and far
                        # easier to debug when a session file is corrupt.
                        print(f"    WARN: Bad JSON in {session_id}/{jsonl_file.name}:{line_num} — {e}")

    return items


def _chunk_text(text: str, chunk_size: int = REPORT_CHUNK_SIZE, overlap: int = REPORT_CHUNK_OVERLAP) -> list[str]:
    """Split text into overlapping chunks for embedding."""
    if len(text) <= chunk_size:
        return [text]
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        # Prefer to break at a newline
        if end < len(text):
            newline_pos = text.rfind('\n', start + overlap, end)
            if newline_pos > start + overlap:
                end = newline_pos
        chunks.append(text[start:end].strip())
        start = end - overlap
        if start >= len(text):
            break
    return [c for c in chunks if len(c) > 50]


def collect_research_reports() -> list[dict]:
    """Collect Research report files (.md, .txt, .jsonl) for vector ingestion.

    JSONL files yield one item per non-empty line (content truncated to
    REPORT_CHUNK_SIZE chars). MD/TXT files are split into overlapping chunks
    via _chunk_text(); the title comes from the first H1 in the first five
    lines (else the filename) and tags are inferred from filename keywords.

    Each MD/TXT chunk carries metadata["chunk_id"] = MD5("filename:index")
    truncated to 12 hex chars — upsert_reports_to_qdrant() uses it to skip
    chunks that were already embedded. Note the hash covers the file name and
    chunk index, not the content, so edited files keep their old chunk_ids.

    Returns:
        List of dicts with keys: source, type, title, content, tags,
        confidence, metadata.
    """
    items = []
    reports_path = Path(RESEARCH_REPORTS_DIR)

    if not reports_path.exists():
        print(f"  WARNING: Research reports dir not found: {RESEARCH_REPORTS_DIR}")
        return items

    # Supported extensions
    SUPPORTED = {".md", ".txt", ".jsonl"}

    # Skip Word temp files (start with ~$)
    report_files = sorted(
        f for f in reports_path.iterdir()
        if f.is_file()
        and f.suffix.lower() in SUPPORTED
        and not f.name.startswith("~$")
    )

    print(f"  Found {len(report_files)} research report files in {RESEARCH_REPORTS_DIR}")

    for report_file in report_files:
        try:
            # errors="replace" so a stray non-UTF-8 byte can't kill the run
            text = report_file.read_text(encoding="utf-8", errors="replace").strip()
        except Exception as e:
            print(f"    WARN: Could not read {report_file.name} — {e}")
            continue

        if not text:
            continue

        # JSONL files: each line is a separate knowledge item
        if report_file.suffix.lower() == ".jsonl":
            for line_num, line in enumerate(text.splitlines(), 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                    # First non-empty of content/insight/description wins
                    content = obj.get("content", obj.get("insight", obj.get("description", "")))
                    if not content:
                        continue
                    items.append({
                        "source": f"report_{report_file.stem}",
                        "type": obj.get("type", "RESEARCH_REPORT"),
                        "title": obj.get("title", obj.get("id", report_file.stem)),
                        "content": content[:REPORT_CHUNK_SIZE],
                        "tags": obj.get("tags", ["research_report", report_file.stem]),
                        "confidence": float(obj.get("confidence", 0.85)),
                        "metadata": {
                            "file": report_file.name,
                            "line": line_num,
                            "source_type": "jsonl",
                        },
                    })
                except (json.JSONDecodeError, ValueError):
                    continue  # malformed line — skip silently
            continue

        # MD/TXT files: extract title from first H1 or filename
        title = report_file.stem.replace("_", " ").replace("-", " ")
        lines = text.splitlines()
        for line in lines[:5]:
            stripped = line.strip()
            if stripped.startswith("# "):
                title = stripped[2:].strip()
                break

        # Tags from filename keywords (domain vocabulary of this project)
        tags = ["research_report"]
        stem_lower = report_file.stem.lower()
        for keyword in ["voice", "aiva", "ghl", "receptionist", "tradie", "pricing",
                        "strategy", "marketing", "telnyx", "qdrant", "memory", "agent",
                        "shopify", "carttalk", "plumber", "electrician", "australian"]:
            if keyword in stem_lower:
                tags.append(keyword)

        # Chunk the document
        chunks = _chunk_text(text)
        for chunk_idx, chunk in enumerate(chunks):
            # Stable per-chunk id; used for Qdrant dedup (see docstring)
            chunk_id = hashlib.md5(f"{report_file.name}:{chunk_idx}".encode()).hexdigest()[:12]
            items.append({
                "source": f"report_{report_file.stem}",
                "type": "RESEARCH_REPORT",
                "title": title if chunk_idx == 0 else f"{title} (chunk {chunk_idx + 1})",
                "content": chunk,
                "tags": tags,
                "confidence": 0.85,
                "metadata": {
                    "file": report_file.name,
                    "chunk_index": chunk_idx,
                    "total_chunks": len(chunks),
                    "chunk_id": chunk_id,
                    "source_type": "markdown" if report_file.suffix.lower() == ".md" else "text",
                },
            })

    print(f"  Produced {len(items)} report chunks/items from {len(report_files)} files")
    return items


def upsert_reports_to_qdrant(items: list[dict]) -> bool:
    """Upsert research report vectors into the Qdrant genesis_memories collection.

    Uses sentence-transformers all-MiniLM-L6-v2 for 384-dim embeddings.
    Deduplicates by chunk_id (stored in the point payload): chunks already
    present in the collection are not re-embedded.

    Args:
        items: Normalized report items from collect_research_reports().

    Returns:
        True on success (including "nothing new to do"), False on failure.
    """
    try:
        from qdrant_client import QdrantClient, models as qdrant_models
        from elestio_config import QdrantConfig
    except ImportError as e:
        print(f"  ERROR: Missing qdrant_client or elestio_config: {e}")
        print("  Install with: pip install qdrant-client")
        return False

    try:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer("all-MiniLM-L6-v2")
        print("  Embedding model loaded: all-MiniLM-L6-v2 (384-dim)")
    except ImportError:
        print("  ERROR: sentence-transformers not installed.")
        print("  Install with: pip install sentence-transformers")
        return False

    import uuid as uuid_mod

    try:
        cfg = QdrantConfig.get_connection_params()
        client = QdrantClient(
            url=cfg.get("url", cfg.get("host", "localhost")),
            api_key=cfg.get("api_key"),
            timeout=30,
        )

        # Ensure collection exists with correct dimensions
        existing_collections = [c.name for c in client.get_collections().collections]
        if RESEARCH_REPORTS_QDRANT_COLLECTION not in existing_collections:
            print(f"  Creating Qdrant collection: {RESEARCH_REPORTS_QDRANT_COLLECTION}")
            client.create_collection(
                collection_name=RESEARCH_REPORTS_QDRANT_COLLECTION,
                vectors_config=qdrant_models.VectorParams(
                    size=RESEARCH_REPORTS_VECTOR_DIM,
                    distance=qdrant_models.Distance.COSINE,
                ),
            )
        else:
            print(f"  Qdrant collection exists: {RESEARCH_REPORTS_QDRANT_COLLECTION}")

        # Fetch existing chunk_ids to avoid re-embedding.
        # Fix over the original: it fetched a single page of at most 10,000
        # points, so collections beyond that silently re-embedded duplicates.
        # We now follow Qdrant's scroll pagination cursor until exhausted.
        existing_chunk_ids: set[str] = set()
        report_filter = qdrant_models.Filter(
            must=[qdrant_models.FieldCondition(
                key="source_type",
                match=qdrant_models.MatchAny(any=["markdown", "text", "jsonl"]),
            )]
        )
        try:
            next_page = None
            while True:
                points, next_page = client.scroll(
                    collection_name=RESEARCH_REPORTS_QDRANT_COLLECTION,
                    scroll_filter=report_filter,
                    limit=1000,
                    offset=next_page,
                    with_payload=True,
                    with_vectors=False,
                )
                for point in points:
                    # chunk_id may be flattened into the payload or nested
                    # under "metadata", depending on the writer version.
                    cid = point.payload.get("chunk_id") or point.payload.get("metadata", {}).get("chunk_id")
                    if cid:
                        existing_chunk_ids.add(cid)
                if next_page is None:
                    break
            print(f"  Existing Qdrant report vectors: {len(existing_chunk_ids)}")
        except Exception:
            pass  # No existing points — proceed with a full upsert

        # Keep only chunks whose chunk_id is not yet in the collection
        new_items = []
        for item in items:
            chunk_id = item.get("metadata", {}).get("chunk_id", "")
            if chunk_id and chunk_id in existing_chunk_ids:
                continue
            new_items.append(item)

        if not new_items:
            print("  No new report vectors to upsert — all already in Qdrant.")
            return True

        print(f"  Embedding {len(new_items)} new chunks...")

        # Batch embed to bound memory: 64 texts per encode() call
        EMBED_BATCH = 64
        all_points = []

        for batch_start in range(0, len(new_items), EMBED_BATCH):
            batch = new_items[batch_start: batch_start + EMBED_BATCH]
            texts = [build_embedding_text(item) for item in batch]
            embeddings = model.encode(texts, show_progress_bar=False).tolist()

            for item, vector in zip(batch, embeddings):
                # Random point id; dedup relies on the payload chunk_id
                point_id = str(uuid_mod.uuid4())
                payload = {
                    "source": item["source"],
                    "type": item["type"],
                    "title": item["title"],
                    "content": item["content"],
                    "tags": item.get("tags", []),
                    "confidence": item.get("confidence", 0.85),
                    "created_at": datetime.now().isoformat(),
                    **item.get("metadata", {}),  # flattens chunk_id, file, ...
                }
                all_points.append(qdrant_models.PointStruct(
                    id=point_id,
                    vector=vector,
                    payload=payload,
                ))

            done = min(batch_start + EMBED_BATCH, len(new_items))
            if done % 256 == 0 or done == len(new_items):
                print(f"    Embedded {done}/{len(new_items)}...")

        # Upsert in batches of 200 to keep request sizes reasonable
        UPSERT_BATCH = 200
        total_upserted = 0
        for batch_start in range(0, len(all_points), UPSERT_BATCH):
            chunk = all_points[batch_start: batch_start + UPSERT_BATCH]
            client.upsert(
                collection_name=RESEARCH_REPORTS_QDRANT_COLLECTION,
                points=chunk,
            )
            total_upserted += len(chunk)

        print(f"  Upserted {total_upserted} vectors to Qdrant [{RESEARCH_REPORTS_QDRANT_COLLECTION}]")
        return True

    except Exception as e:
        print(f"  ERROR upserting to Qdrant: {e}")
        return False


def build_embedding_text(item: dict) -> str:
    """Compose the text that gets embedded for *item*.

    Format: "[TYPE] | title | content", plus a trailing " | Tags: a, b"
    segment when the item carries any tags.
    """
    text = f"[{item['type']}] | {item['title']} | {item['content']}"
    tags = item.get('tags')
    if tags:
        text = f"{text} | Tags: {', '.join(tags)}"
    return text


def load_to_postgres(items: list[dict], reset: bool = False) -> bool:
    """Load items into the PostgreSQL ``bloodstream_knowledge`` table.

    Creates the table and indexes if needed (optionally dropping them first
    when *reset* is True), skips items whose (source, title, type) key
    already exists, and batch-inserts the remainder.

    Args:
        items: Normalized knowledge dicts.
        reset: Drop and recreate the table before loading.

    Returns:
        True on success, False if dependencies are missing or any DB error
        occurs.

    Fix over the original: the connection is closed in a ``finally`` block,
    so a failure mid-load no longer leaks the connection.
    """
    try:
        import psycopg2
        import psycopg2.extras
        from elestio_config import PostgresConfig
    except ImportError as e:
        print(f"  ERROR: Missing dependency: {e}")
        print("  Install with: pip install psycopg2-binary")
        return False

    conn = None
    try:
        conn = psycopg2.connect(**PostgresConfig.get_connection_params())
        conn.autocommit = False  # explicit commits after DDL and each batch
        cur = conn.cursor()

        if reset:
            print("  Dropping existing table...")
            cur.execute(DROP_TABLE_SQL)
            conn.commit()

        # Create table and indexes (idempotent DDL)
        print("  Creating table and indexes...")
        cur.execute(CREATE_TABLE_SQL)
        conn.commit()

        # Check existing count
        cur.execute("SELECT COUNT(*) FROM bloodstream_knowledge")
        existing_count = cur.fetchone()[0]
        print(f"  Existing rows: {existing_count}")

        # Deduplicate: skip items whose (source, title, type) already exist
        if existing_count > 0 and not reset:
            cur.execute("SELECT source, title, type FROM bloodstream_knowledge")
            existing_keys = {(row[0], row[1], row[2]) for row in cur.fetchall()}
            before = len(items)
            items = [
                i for i in items
                if (i["source"], i["title"], i["type"]) not in existing_keys
            ]
            print(f"  Deduplication: {before} -> {len(items)} new items")

        if not items:
            print("  No new items to insert.")
            return True

        # Batch insert
        print(f"  Inserting {len(items)} items...")
        insert_sql = """
            INSERT INTO bloodstream_knowledge
                (source, type, title, content, tags, confidence, embedding_text, metadata)
            VALUES
                (%s, %s, %s, %s, %s, %s, %s, %s)
        """

        rows = [
            (
                item["source"],
                item["type"],
                item["title"],
                item["content"],
                item["tags"],  # Python list adapts to PG TEXT[] via psycopg2
                item["confidence"],
                build_embedding_text(item),
                json.dumps(item.get("metadata", {})),
            )
            for item in items
        ]

        # Insert in batches of 500, committing after each batch
        batch_size = 500
        inserted = 0
        for i in range(0, len(rows), batch_size):
            chunk = rows[i:i + batch_size]
            psycopg2.extras.execute_batch(cur, insert_sql, chunk)
            conn.commit()
            inserted += len(chunk)
            if inserted % 1000 == 0 or inserted == len(rows):
                print(f"    Inserted {inserted}/{len(rows)}...")

        # Final count
        cur.execute("SELECT COUNT(*) FROM bloodstream_knowledge")
        final_count = cur.fetchone()[0]
        print(f"  Final row count: {final_count}")
        return True

    except Exception as e:
        print(f"  ERROR connecting to PostgreSQL: {e}")
        return False
    finally:
        if conn is not None:
            conn.close()  # also releases the cursor


def load_to_redis(items: list[dict], skip: bool = False) -> bool:
    """Cache the top 50 highest-confidence items in Redis for instant retrieval.

    Stores each hot item as a hash at ``bloodstream:hot:{i}``, an index list
    at ``bloodstream:hot_index`` and aggregate stats at ``bloodstream:stats``
    — all with a 7-day TTL.

    Args:
        items: Normalized knowledge dicts.
        skip: When True (--skip-redis), do nothing and report success.

    Returns:
        True on success or skip, False if dependencies are missing or Redis
        is unreachable.

    Fix over the original: stale keys are found with SCAN instead of KEYS —
    KEYS is O(N) over the whole keyspace and blocks the Redis server.
    """
    if skip:
        print("  Skipping Redis (--skip-redis).")
        return True

    try:
        import redis as redis_lib
        from elestio_config import RedisConfig
    except ImportError as e:
        print(f"  ERROR: Missing dependency: {e}")
        return False

    try:
        r = redis_lib.Redis(**RedisConfig.get_connection_params())
        r.ping()

        # Sort by confidence, take top 50
        top_items = sorted(items, key=lambda x: -x.get("confidence", 0))[:50]

        pipe = r.pipeline()

        # Clear old hot cache — SCAN is incremental and non-blocking
        old_keys = list(r.scan_iter(match="bloodstream:hot:*"))
        if old_keys:
            pipe.delete(*old_keys)
        pipe.delete("bloodstream:hot_index")
        pipe.delete("bloodstream:stats")

        for i, item in enumerate(top_items):
            key = f"bloodstream:hot:{i}"
            pipe.hset(key, mapping={
                "source": item["source"],
                "type": item["type"],
                "title": item["title"],
                "content": item["content"][:2000],  # Truncate for cache
                "tags": json.dumps(item.get("tags", [])),
                "confidence": str(item.get("confidence", 0.8)),
            })
            pipe.expire(key, 86400 * 7)  # 7 day TTL
            pipe.rpush("bloodstream:hot_index", key)

        # Store aggregate stats for quick inspection
        type_counts = {}
        for item in items:
            t = item.get("type", "UNKNOWN")
            type_counts[t] = type_counts.get(t, 0) + 1

        pipe.hset("bloodstream:stats", mapping={
            "total_items": str(len(items)),
            "hot_cached": str(len(top_items)),
            "types": json.dumps(type_counts),
            "loaded_at": datetime.now().isoformat(),
        })
        pipe.expire("bloodstream:stats", 86400 * 7)

        pipe.execute()

        print(f"  Cached {len(top_items)} hot items in Redis (7-day TTL)")
        return True

    except Exception as e:
        print(f"  ERROR connecting to Redis: {e}")
        return False


def save_fallback(items: list[dict]) -> None:
    """Save all items to a local JSONL file as fallback when PG is unreachable.

    Each record is written with ``_pending_pg_load`` / ``_fallback_saved_at``
    markers so a later run can identify and replay them.

    Fix over the original: records are copied before the markers are added,
    so the caller's dicts are no longer mutated as a side effect.
    """
    os.makedirs(os.path.dirname(FALLBACK_OUTPUT), exist_ok=True)
    saved_at = datetime.now().isoformat()  # one timestamp for the whole batch
    with open(FALLBACK_OUTPUT, 'w', encoding='utf-8') as f:
        for item in items:
            record = {
                **item,
                "_pending_pg_load": True,
                "_fallback_saved_at": saved_at,
            }
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
    print(f"  Saved {len(items)} items to fallback: {FALLBACK_OUTPUT}")


def print_stats(items: list[dict]) -> None:
    """Print a breakdown of collected items by type, source, confidence and tag."""

    def _tally(table: dict, key) -> None:
        # Hand-rolled counter; keeps the module free of extra imports.
        table[key] = table.get(key, 0) + 1

    def _ranked(table: dict) -> list:
        # Highest count first; ties keep first-seen order (stable sort).
        return sorted(table.items(), key=lambda kv: kv[1], reverse=True)

    by_type: dict = {}
    by_source: dict = {}
    by_tag: dict = {}
    buckets = {"0.9+": 0, "0.7-0.9": 0, "0.5-0.7": 0, "<0.5": 0}

    for entry in items:
        _tally(by_type, entry.get("type", "UNKNOWN"))

        origin = entry.get("source", "unknown")
        if origin.startswith("session_"):
            # Collapse the many per-session sources into a single row
            origin = "sessions (combined)"
        _tally(by_source, origin)

        for label in entry.get("tags", []):
            _tally(by_tag, label)

        score = entry.get("confidence", 0.8)
        if score >= 0.9:
            buckets["0.9+"] += 1
        elif score >= 0.7:
            buckets["0.7-0.9"] += 1
        elif score >= 0.5:
            buckets["0.5-0.7"] += 1
        else:
            buckets["<0.5"] += 1

    print("\n  --- Knowledge Statistics ---")
    print(f"  Total items: {len(items)}")
    print("\n  By Type:")
    for name, count in _ranked(by_type):
        print(f"    {name:35s} : {count:5d}")
    print("\n  By Source:")
    for name, count in _ranked(by_source)[:15]:
        print(f"    {name:35s} : {count:5d}")
    if len(by_source) > 15:
        print(f"    ... and {len(by_source) - 15} more sources")
    print("\n  By Confidence:")
    for name, count in buckets.items():
        print(f"    {name:15s} : {count:5d}")
    print("\n  Top 15 Tags:")
    for name, count in _ranked(by_tag)[:15]:
        print(f"    {name:20s} : {count:5d}")


def main() -> None:
    """CLI entry point: collect knowledge from the selected sources, then
    load it into PostgreSQL, Qdrant (reports only) and Redis.

    Flow: parse args → collect (dt / sessions / reports) → print stats →
    (unless --dry-run) load PG, upsert report vectors to Qdrant, warm the
    Redis hot cache → print a summary. PG failure falls back to a local
    JSONL file instead of aborting the run.
    """
    parser = argparse.ArgumentParser(
        description="Genesis Bloodstream Loader — load knowledge into PG/Qdrant/Redis"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Collect and show stats, but don't load into databases",
    )
    parser.add_argument(
        "--source",
        choices=["dt", "sessions", "reports", "all"],
        default="all",
        help="Which source to load (default: all — includes dt, sessions, reports)",
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Drop and recreate the bloodstream_knowledge table",
    )
    parser.add_argument(
        "--skip-redis",
        action="store_true",
        help="Skip loading hot cache to Redis",
    )
    parser.add_argument(
        "--skip-qdrant",
        action="store_true",
        help="Skip Qdrant vector upsert for Research reports",
    )

    args = parser.parse_args()

    print(f"\n{'=' * 70}")
    print(f"  GENESIS BLOODSTREAM LOADER")
    print(f"{'=' * 70}")
    print(f"  Timestamp   : {datetime.now().isoformat()}")
    print(f"  Source      : {args.source}")
    print(f"  Dry run     : {args.dry_run}")
    print(f"  Reset       : {args.reset}")
    print(f"  Skip Qdrant : {args.skip_qdrant}")
    print(f"{'=' * 70}\n")

    # ── Collect all knowledge ─────────────────────────────────────────────
    all_items = []
    report_items = []  # kept separate for Qdrant upsert path

    if args.source in ("dt", "all"):
        print("Collecting DT knowledge...")
        dt_items = collect_dt_knowledge()
        print(f"  Found {len(dt_items)} DT knowledge items")
        all_items.extend(dt_items)

    if args.source in ("sessions", "all"):
        print("Collecting session axioms...")
        session_items = collect_session_axioms()
        print(f"  Found {len(session_items)} session axioms")
        all_items.extend(session_items)

    if args.source in ("reports", "all"):
        print(f"\nCollecting Research reports from: {RESEARCH_REPORTS_DIR}")
        report_items = collect_research_reports()
        # Also add to all_items so they appear in PG + stats
        all_items.extend(report_items)

    if not all_items:
        print("\nNo items to load. Run dt_knowledge_extractor.py and/or session_ingestion_swarm.py first.")
        sys.exit(0)

    print_stats(all_items)

    if args.dry_run:
        print(f"\nDRY RUN — no data loaded. Use without --dry-run to load.")
        if report_items:
            print(f"\nResearch reports ready: {len(report_items)} chunks from Research reports/")
            print(f"Re-run without --dry-run to upsert to Qdrant [{RESEARCH_REPORTS_QDRANT_COLLECTION}]")
        return

    # ── Load to PostgreSQL ────────────────────────────────────────────────
    print(f"\n{'─' * 50}")
    print("Loading to PostgreSQL (Elestio)...")
    pg_ok = load_to_postgres(all_items, reset=args.reset)

    # PG failure is non-fatal: persist locally so nothing is lost
    if not pg_ok:
        print("\n  PostgreSQL load FAILED — saving to local fallback JSONL.")
        save_fallback(all_items)
        print(f"  Re-run with working PG connection to load from fallback.")

    # ── Load Research Reports to Qdrant ───────────────────────────────────
    qdrant_ok = True
    if report_items and not args.skip_qdrant:
        print(f"\n{'─' * 50}")
        print(f"Upserting {len(report_items)} report chunks to Qdrant [{RESEARCH_REPORTS_QDRANT_COLLECTION}]...")
        qdrant_ok = upsert_reports_to_qdrant(report_items)
        if not qdrant_ok:
            print("  Qdrant upsert FAILED. Embedding_text is stored in PG as fallback.")
    elif args.skip_qdrant:
        print(f"\n  Qdrant skipped (--skip-qdrant). Reports loaded to PG only.")

    # ── Load to Redis ─────────────────────────────────────────────────────
    print(f"\n{'─' * 50}")
    print("Loading hot cache to Redis (Elestio)...")
    redis_ok = load_to_redis(all_items, skip=args.skip_redis)

    # ── Summary ───────────────────────────────────────────────────────────
    print(f"\n{'=' * 70}")
    print(f"  BLOODSTREAM LOAD COMPLETE")
    print(f"{'=' * 70}")
    print(f"  Total items      : {len(all_items)}")
    print(f"  Report chunks    : {len(report_items)} (Research reports/)")
    print(f"  PostgreSQL       : {'LOADED' if pg_ok else 'FAILED (fallback saved)'}")
    print(f"  Qdrant (reports) : {'LOADED' if (qdrant_ok and report_items) else ('SKIPPED' if args.skip_qdrant or not report_items else 'FAILED')}")
    print(f"  Redis hot cache  : {'LOADED' if redis_ok else 'FAILED'}")
    print(f"{'=' * 70}")


if __name__ == "__main__":
    main()
