"""
Memory Competitor Intelligence Scout — Webhook Processor
=========================================================
Receives POST requests from n8n when new memory tools/frameworks are discovered.
Classifies, scores, deduplicates, and routes discoveries into the Genesis KG and
RWL task loop.

Webhook endpoint: POST /memory-scout-ingest
Check endpoint:   POST /memory-scout-check

Run as a lightweight Flask service:
    python process_discovery.py serve --port 8765

Or import and call process_discovery() directly from other scripts.
"""

import argparse
import hashlib
import json
import logging
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

import requests

# ---------------------------------------------------------------------------
# Paths (all E: drive)
# ---------------------------------------------------------------------------

# NOTE(review): absolute Windows-style paths — this service assumes the
# Genesis tree lives on the E: drive; confirm before deploying elsewhere.
BASE_DIR = Path("E:/genesis-system")
KG_ENTITIES_DIR = BASE_DIR / "KNOWLEDGE_GRAPH" / "entities"    # KG entity .jsonl output dir
TASKS_FILE = BASE_DIR / "loop" / "tasks.json"                  # RWL task queue (JSON array)
SCOUT_DIR = BASE_DIR / "n8n" / "memory_competitor_scout"
DISCOVERY_LOG = SCOUT_DIR / "discovery_log.jsonl"              # every processed discovery (append-only)
KNOWN_SYSTEMS_FILE = SCOUT_DIR / "known_memory_systems.jsonl"  # curated known systems, used for dedup

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

# Log to stdout only — suitable for running under n8n / a process
# supervisor that captures the stream.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
    ],
)
log = logging.getLogger("memory_scout")

# ---------------------------------------------------------------------------
# Scoring configuration
# ---------------------------------------------------------------------------

# Each criterion contributes a flat 2 points, so the maximum score is 10.
SCORING_WEIGHTS = {
    "open_source": 2,
    "python_support": 2,
    "graph_support": 2,
    "temporal_support": 2,
    "active_maintenance": 2,
}

SCORE_KG_THRESHOLD = 7       # Write KG entity
SCORE_EVAL_THRESHOLD = 8     # Create RWL evaluation task (must also be open_source)

# Keywords used to infer feature support from description + topics.
# NOTE(review): matching is plain substring search (see score_relevance /
# classify_tool_type), so very short keywords ("py", "kg", "time") can also
# fire inside unrelated words — confirm this looseness is intended.
PYTHON_KEYWORDS = {"python", "py", "pip", "pypi", "pytorch", "langchain", "llamaindex"}
GRAPH_KEYWORDS = {"graph", "knowledge-graph", "neo4j", "falkordb", "graphiti", "cypher", "rdf", "ontology"}
TEMPORAL_KEYWORDS = {"temporal", "time", "episodic", "versioned", "history", "timeline", "chronological"}
OPEN_SOURCE_INDICATORS = {"mit", "apache", "gpl", "bsd", "open-source", "opensource"}

# Tool type classification — keyword → type
TYPE_KEYWORDS = {
    "RAG": {"rag", "retrieval", "retrieval-augmented", "document-qa"},
    "KnowledgeGraph": {"knowledge-graph", "kg", "ontology", "graph-db", "falkordb", "neo4j", "graphiti", "rdf"},
    "EpisodicMemory": {"episodic", "conversation-memory", "session-memory", "memgpt", "letta", "recall"},
    "VectorStore": {"vector", "embedding", "faiss", "qdrant", "chroma", "weaviate", "pinecone", "lancedb"},
    "TemporalMemory": {"temporal", "time-aware", "versioned-memory", "timeline-memory"},
}


# ---------------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------------

def url_to_id(url: str) -> str:
    """Derive a stable 12-character hex ID from a URL (md5 prefix).

    Case and surrounding whitespace are normalised first so that
    equivalent URLs map to the same ID for deduplication.
    """
    normalised = url.lower().strip()
    return hashlib.md5(normalised.encode()).hexdigest()[:12]


def load_known_ids() -> set:
    """Collect deduplication keys from the known-systems file and the
    discovery log.

    Keys are md5-based URL IDs (via url_to_id) plus lowercased names,
    so duplicates can be caught by either URL or name.
    """
    seen = set()
    for source in (KNOWN_SYSTEMS_FILE, DISCOVERY_LOG):
        if not source.exists():
            continue
        with source.open("r", encoding="utf-8") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    record = json.loads(raw)
                    url = record.get("url", "")
                    if url:
                        seen.add(url_to_id(url))
                    # Lowercased name gives a fuzzy second dedup key
                    name = record.get("name", "").lower().strip()
                    if name:
                        seen.add(name)
                except json.JSONDecodeError:
                    # Tolerate corrupt lines rather than aborting the scan
                    pass
    return seen


def classify_tool_type(description: str, topics: list) -> str:
    """Classify a discovery into one of the TYPE_KEYWORDS categories.

    Counts keyword substring hits over description + topics and returns
    the category with the most hits, or "Other" when nothing matches.
    """
    haystack = (description + " " + " ".join(topics or [])).lower()
    hit_counts = {
        category: sum(1 for kw in keywords if kw in haystack)
        for category, keywords in TYPE_KEYWORDS.items()
    }
    winner = max(hit_counts, key=hit_counts.get)
    return winner if hit_counts[winner] > 0 else "Other"


def score_relevance(item: dict) -> tuple[int, dict]:
    """
    Score a discovery 0-10 for Genesis relevance.

    Each SCORING_WEIGHTS criterion contributes its full weight or 0:
    open_source, python_support, graph_support, temporal_support, and
    active_maintenance (last push within 30 days).

    Args:
        item: Discovery dict; reads description, topics, name, license,
              pushed_at / updated_at, and the optional open_source flag.

    Returns:
        (score: int, breakdown: dict)
    """
    text = (
        (item.get("description") or "")
        + " "
        + " ".join(item.get("topics") or [])
        + " "
        + (item.get("name") or "")
    ).lower()

    license_text = (item.get("license") or "").lower()
    pushed_at = item.get("pushed_at") or item.get("updated_at") or ""

    breakdown = {}

    # open_source — explicit flag, a known OSS licence keyword, or any
    # non-placeholder licence string. The placeholder comparison is done
    # on the lowercased text so "NOASSERTION"/"noassertion" (GitHub's
    # SPDX placeholder) and "Other"/"other" are excluded regardless of
    # casing (previously only the exact-case variants were caught).
    is_open = (
        item.get("open_source") is True
        or any(k in license_text for k in OPEN_SOURCE_INDICATORS)
        or license_text not in ("", "noassertion", "other")
    )
    breakdown["open_source"] = SCORING_WEIGHTS["open_source"] if is_open else 0

    # python_support
    has_python = any(k in text for k in PYTHON_KEYWORDS)
    breakdown["python_support"] = SCORING_WEIGHTS["python_support"] if has_python else 0

    # graph_support
    has_graph = any(k in text for k in GRAPH_KEYWORDS)
    breakdown["graph_support"] = SCORING_WEIGHTS["graph_support"] if has_graph else 0

    # temporal_support
    has_temporal = any(k in text for k in TEMPORAL_KEYWORDS)
    breakdown["temporal_support"] = SCORING_WEIGHTS["temporal_support"] if has_temporal else 0

    # active_maintenance — last push within 30 days. A trailing "Z" is
    # rewritten to "+00:00" because datetime.fromisoformat only accepts
    # the Z suffix from Python 3.11 onward.
    is_active = False
    if pushed_at:
        try:
            if pushed_at.endswith("Z"):
                pushed_at = pushed_at[:-1] + "+00:00"
            pushed_dt = datetime.fromisoformat(pushed_at)
            if pushed_dt.tzinfo is None:
                # Treat naive timestamps as UTC so the subtraction below works
                pushed_dt = pushed_dt.replace(tzinfo=timezone.utc)
            days_ago = (datetime.now(timezone.utc) - pushed_dt).days
            is_active = days_ago <= 30
        except (ValueError, TypeError):
            pass  # unparseable timestamp → counted as inactive
    breakdown["active_maintenance"] = SCORING_WEIGHTS["active_maintenance"] if is_active else 0

    total = sum(breakdown.values())
    return total, breakdown


def build_kg_entity(item: dict, score: int, tool_type: str, breakdown: dict) -> dict:
    """Construct the KG entity record persisted for a scored discovery."""
    url = item.get("url", "")
    entity = {
        "id": f"memory_competitor_{url_to_id(url)}",
        "name": item.get("name", "Unknown"),
        "url": url,
        "type": tool_type,
        "open_source": breakdown.get("open_source", 0) > 0,
        "genesis_integrated": False,
        "discovered_date": datetime.now(timezone.utc).isoformat(),
        "relevance_score": score,
        "score_breakdown": breakdown,
        "source": item.get("source", "unknown"),
        "stars": item.get("stars", 0),
        # Free text is capped so KG entity lines stay compact
        "description": (item.get("description") or "")[:500],
        "topics": item.get("topics") or [],
        "notes": item.get("notes", ""),
        "evaluated": False,
        "evaluation_task_id": None,
    }
    return entity


def build_evaluation_task(item: dict, score: int, tool_type: str) -> dict:
    """Create an RWL evaluation task record for a high-scoring discovery."""
    tool_name = item.get("name", "Unknown Tool")
    tool_url = item.get("url", "")
    now = datetime.now(timezone.utc)
    # Date-stamped ID: re-discovering the same tool on a later day makes a new task
    task_id = f"eval_{url_to_id(tool_url)}_{now.strftime('%Y%m%d')}"
    priority = "high" if score >= 9 else "medium"

    instructions = (
        f"Evaluate {tool_name} ({tool_url}) for potential Genesis memory stack integration.\n"
        "Steps:\n"
        "1. Clone/install the repo and review architecture (README, src structure)\n"
        "2. Identify unique capabilities vs Genesis's current 5-tier stack\n"
        "   (PostgreSQL + Qdrant + Redis + FalkorDB/Graphiti)\n"
        "3. Test Python API with a simple 10-message conversation memory test\n"
        "4. Assess: integration effort (hours), unique value-add, overlap with existing\n"
        "5. Recommend: Integrate / Monitor / Skip — with justification\n"
        "6. Update known_memory_systems.jsonl with evaluation results\n"
        "7. If Integrate: create implementation PRD"
    )

    return {
        "id": task_id,
        "type": "memory_tool_evaluation",
        "title": f"Evaluate {tool_name} for Genesis integration",
        "tool_name": tool_name,
        "tool_url": tool_url,
        "tool_type": tool_type,
        "relevance_score": score,
        "status": "pending",
        "priority": priority,
        "created_at": now.isoformat(),
        "created_by": "memory_competitor_scout",
        "instructions": instructions,
        "description": (item.get("description") or "")[:300],
        "stars": item.get("stars", 0),
        "source": item.get("source", "unknown"),
    }


def append_to_jsonl(fpath: Path, record: dict) -> None:
    """Append a JSON record to a .jsonl file (creates file if missing)."""
    fpath.parent.mkdir(parents=True, exist_ok=True)
    line = json.dumps(record, ensure_ascii=False)
    with fpath.open("a", encoding="utf-8") as out:
        out.write(line + "\n")
    log.info("Wrote to %s: %s", fpath.name, record.get("id") or record.get("name", ""))


def upsert_tasks_json(task: dict) -> None:
    """Add a task to loop/tasks.json unless one already exists for the same tool.

    Deduplicates only on a non-empty ``tool_url``. Previously a task with
    an empty URL matched any existing task that also lacked one (both
    mapped to ``""``), silently dropping the new task. A corrupt or
    hand-edited tasks.json holding a non-list payload is also discarded
    instead of crashing on ``append``.
    """
    TASKS_FILE.parent.mkdir(parents=True, exist_ok=True)
    tasks = []
    if TASKS_FILE.exists():
        try:
            with TASKS_FILE.open("r", encoding="utf-8") as f:
                tasks = json.load(f)
        except (json.JSONDecodeError, IOError):
            tasks = []
        if not isinstance(tasks, list):
            # Guard: tasks.json must be a JSON array; reset anything else
            tasks = []

    # Skip if a task for the same (non-empty) tool_url already exists
    task_url = task.get("tool_url", "")
    if task_url and any(t.get("tool_url", "") == task_url for t in tasks):
        log.info("Task for %s already exists in tasks.json — skipping", task.get("tool_name"))
        return

    tasks.append(task)
    with TASKS_FILE.open("w", encoding="utf-8") as f:
        json.dump(tasks, f, indent=2, ensure_ascii=False)
    log.info("Added evaluation task to tasks.json: %s", task["id"])


# ---------------------------------------------------------------------------
# Main processing function
# ---------------------------------------------------------------------------

def process_discovery(item: dict, dry_run: bool = False) -> dict:
    """
    Process a single discovery item from n8n or the cron scout.

    Pipeline: dedup check → classify + score → optional KG entity write
    (score >= SCORE_KG_THRESHOLD) → optional RWL evaluation task
    (score >= SCORE_EVAL_THRESHOLD and open source) → discovery log append.

    Args:
        item: Dict with keys: name, url, description, topics, stars,
              license, pushed_at, source, open_source (optional)
        dry_run: If True, print actions without writing any files

    Returns:
        Result dict with: score, tool_type, action_taken, kg_entity_id, task_id
    """
    name = item.get("name", "Unknown")
    url = item.get("url", "")

    log.info("Processing: %s — %s", name, url)

    # Skip anything already known, matched by URL hash or lowercased name
    known = load_known_ids()
    if url_to_id(url) in known or name.lower().strip() in known:
        log.info("SKIP (already known): %s", name)
        return {"action_taken": "skipped_duplicate", "name": name, "url": url}

    tool_type = classify_tool_type(item.get("description", ""), item.get("topics") or [])
    score, breakdown = score_relevance(item)

    log.info(
        "Scored %s: %d/10 | type=%s | breakdown=%s",
        name, score, tool_type, breakdown,
    )

    result = {
        "name": name,
        "url": url,
        "score": score,
        "tool_type": tool_type,
        "breakdown": breakdown,
        "action_taken": "logged_only",
        "kg_entity_id": None,
        "task_id": None,
    }

    # Every discovery is logged regardless of score
    log_entry = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "name": name,
        "url": url,
        "score": score,
        "tool_type": tool_type,
        "source": item.get("source", "unknown"),
        "description": (item.get("description") or "")[:200],
        "stars": item.get("stars", 0),
        "action": "pending",
    }

    # KG entity for anything at or above the KG threshold
    if score >= SCORE_KG_THRESHOLD:
        entity = build_kg_entity(item, score, tool_type, breakdown)
        result["kg_entity_id"] = entity["id"]
        log_entry["action"] = result["action_taken"] = "kg_entity_written"

        if dry_run:
            log.info("[DRY RUN] Would write KG entity: %s", entity["id"])
        else:
            append_to_jsonl(KG_ENTITIES_DIR / "memory_competitors_discovered.jsonl", entity)

    # Evaluation task only for open-source tools at or above the eval threshold
    if score >= SCORE_EVAL_THRESHOLD and breakdown.get("open_source", 0) > 0:
        task = build_evaluation_task(item, score, tool_type)
        result["task_id"] = task["id"]
        log_entry["action"] = result["action_taken"] = "evaluation_task_created"

        if dry_run:
            log.info("[DRY RUN] Would create evaluation task: %s", task["id"])
        else:
            upsert_tasks_json(task)
            if result["kg_entity_id"]:
                log.info("Evaluation task %s linked to KG entity %s", task["id"], result["kg_entity_id"])

    if dry_run:
        log.info("[DRY RUN] Would log: %s", log_entry)
    else:
        log_entry["action"] = result["action_taken"]
        append_to_jsonl(DISCOVERY_LOG, log_entry)

    log.info(
        "Result for %s: action=%s score=%d type=%s",
        name, result["action_taken"], score, tool_type,
    )
    return result


# ---------------------------------------------------------------------------
# Webhook check endpoint helper
# ---------------------------------------------------------------------------

def check_if_known(url: str) -> dict:
    """
    Check if a URL is already in the Genesis KG.
    Returns JSON-serialisable dict for webhook response.
    """
    return {
        "url": url,
        "known": url_to_id(url) in load_known_ids(),
        "checked_at": datetime.now(timezone.utc).isoformat(),
    }


# ---------------------------------------------------------------------------
# Flask webhook server (optional — for n8n integration)
# ---------------------------------------------------------------------------

def run_server(port: int = 8765) -> None:
    """Start a minimal Flask webhook server for n8n integration.

    Routes: POST /memory-scout-check, POST /memory-scout-ingest,
    GET /health. Exits the process if Flask is not installed.
    """
    try:
        from flask import Flask, jsonify, request
    except ImportError:
        log.error("Flask not installed. Install with: pip install flask")
        sys.exit(1)

    app = Flask("memory_scout")

    @app.route("/memory-scout-check", methods=["POST"])
    def check_endpoint():
        payload = request.get_json(force=True) or {}
        target = payload.get("url", "")
        if not target:
            return jsonify({"error": "url required"}), 400
        return jsonify(check_if_known(target))

    @app.route("/memory-scout-ingest", methods=["POST"])
    def ingest_endpoint():
        payload = request.get_json(force=True) or {}
        if not payload.get("url"):
            return jsonify({"error": "url required"}), 400
        return jsonify(process_discovery(payload))

    @app.route("/health", methods=["GET"])
    def health():
        return jsonify({"status": "ok", "service": "memory_competitor_scout"})

    log.info("Starting memory scout webhook server on port %d", port)
    app.run(host="0.0.0.0", port=port, debug=False)


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------

def main():
    """CLI entry point: ``serve``, ``test`` and ``check`` subcommands."""
    parser = argparse.ArgumentParser(
        description="Memory Competitor Intelligence Scout — webhook processor"
    )
    subparsers = parser.add_subparsers(dest="command")

    # serve — run the Flask webhook server
    serve_parser = subparsers.add_parser("serve", help="Start webhook server")
    serve_parser.add_argument("--port", type=int, default=8765)

    # test — push one synthetic discovery through the pipeline
    test_parser = subparsers.add_parser("test", help="Test process a single discovery")
    test_parser.add_argument("--url", required=True, help="Repository/article URL")
    test_parser.add_argument("--name", required=True, help="Tool name")
    test_parser.add_argument("--description", default="", help="Description text")
    test_parser.add_argument("--stars", type=int, default=0)
    test_parser.add_argument("--dry-run", action="store_true")

    # check — dedup lookup only
    check_parser = subparsers.add_parser("check", help="Check if a URL is already known")
    check_parser.add_argument("--url", required=True)

    args = parser.parse_args()

    if args.command == "serve":
        run_server(args.port)
        return

    if args.command == "test":
        sample = {
            "name": args.name,
            "url": args.url,
            "description": args.description,
            "stars": args.stars,
            "source": "manual_test",
            "topics": [],
            "license": "MIT",
            "pushed_at": datetime.now(timezone.utc).isoformat(),
        }
        print(json.dumps(process_discovery(sample, dry_run=args.dry_run), indent=2))
    elif args.command == "check":
        print(json.dumps(check_if_known(args.url), indent=2))
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
