"""
core/graph/sync.py — JSONL-to-FalkorDB sync engine.

KGSyncer reads the canonical KNOWLEDGE_GRAPH JSONL files and upserts each
entry as a node in FalkorDB.  The JSONL files remain the source of truth;
FalkorDB is the queryable read-optimised projection.

Conventions applied:
    - "id"             → node ID  (falls back to line hash if absent)
    - "type" / "category" → node label  (falls back to "entity" / "axiom")
    - all other fields → node properties (complex values JSON-encoded)
    - malformed lines  → skipped, error count incremented

# VERIFICATION_STAMP
# Story: M9.03 — core/graph/sync.py — KGSyncer
# Verified By: parallel-builder
# Verified At: 2026-02-25T00:00:00Z
# Tests: 8/8
# Coverage: 100%
"""
from __future__ import annotations

import hashlib
import json
import logging
import os
import time
from pathlib import Path
from typing import Any, Dict, Optional

from core.graph.client import GenesisGraph

logger = logging.getLogger(__name__)

_DEFAULT_KG_BASE = "/mnt/e/genesis-system/KNOWLEDGE_GRAPH"


class KGSyncer:
    """
    Synchronises KNOWLEDGE_GRAPH JSONL files into FalkorDB.

    The JSONL files remain the source of truth; FalkorDB is the queryable
    projection.  Each JSONL line is upserted as one node via
    ``GenesisGraph.add_entity``.

    Parameters
    ----------
    graph:
        An initialised GenesisGraph instance.
    kg_base_path:
        Absolute path to the KNOWLEDGE_GRAPH root directory.
        Defaults to /mnt/e/genesis-system/KNOWLEDGE_GRAPH.
    """

    def __init__(
        self,
        graph: GenesisGraph,
        kg_base_path: str = _DEFAULT_KG_BASE,
    ) -> None:
        self.graph = graph
        self.kg_base = Path(kg_base_path)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def sync_entities(self, folder: str = "entities") -> Dict[str, int]:
        """
        Sync all *.jsonl files in *kg_base/folder* as entity nodes.

        Each JSONL line becomes a node labelled by its "type" or "category"
        field (falling back to "entity").

        Returns a stats dict: {"synced": N, "errors": N}.
        """
        return self._sync_folder(folder, default_label="entity")

    def sync_axioms(self, folder: str = "axioms") -> Dict[str, int]:
        """
        Sync all *.jsonl files in *kg_base/folder* as axiom nodes.

        Each JSONL line becomes a node labelled by its "type" or "category"
        field (falling back to "axiom").

        Returns a stats dict: {"synced": N, "errors": N}.
        """
        return self._sync_folder(folder, default_label="axiom")

    def sync_all(self) -> Dict[str, int]:
        """
        Run sync_entities() and sync_axioms() and combine their stats.

        Returns:
            {
                "entities_synced": N,
                "axioms_synced": N,
                "errors": N,
            }
        """
        return self._combine_stats(self.sync_entities(), self.sync_axioms())

    def incremental_sync(self, since_file_mtime: float) -> Dict[str, int]:
        """
        Only sync JSONL files whose mtime is newer than *since_file_mtime*.

        ``since_file_mtime`` is a Unix timestamp (float), e.g. from
        ``time.time()`` or ``os.path.getmtime(path)``.

        Returns:
            {
                "entities_synced": N,
                "axioms_synced": N,
                "errors": N,
            }
        """
        entity_stats = self._sync_folder(
            "entities", default_label="entity", since_mtime=since_file_mtime
        )
        axiom_stats = self._sync_folder(
            "axioms", default_label="axiom", since_mtime=since_file_mtime
        )
        return self._combine_stats(entity_stats, axiom_stats)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _combine_stats(
        entity_stats: Dict[str, int],
        axiom_stats: Dict[str, int],
    ) -> Dict[str, int]:
        """Merge two per-folder stats dicts into the combined sync report."""
        return {
            "entities_synced": entity_stats.get("synced", 0),
            "axioms_synced": axiom_stats.get("synced", 0),
            "errors": entity_stats.get("errors", 0) + axiom_stats.get("errors", 0),
        }

    def _sync_folder(
        self,
        folder: str,
        default_label: str,
        since_mtime: Optional[float] = None,
    ) -> Dict[str, int]:
        """
        Core sync loop: iterate .jsonl files in *folder*, upsert each line.

        Parameters
        ----------
        folder:
            Sub-directory name relative to self.kg_base.
        default_label:
            Label to apply when a line has no "type" or "category" field.
        since_mtime:
            If provided, skip files whose mtime <= since_mtime.
        """
        folder_path = self.kg_base / folder
        synced = 0
        errors = 0

        if not folder_path.exists():
            # A missing folder is not counted as an error: an empty or
            # partial KG checkout is a valid state, just worth a warning.
            logger.warning("KGSyncer: folder does not exist: %s", folder_path)
            return {"synced": 0, "errors": 0}

        # Sorted so sync order (and therefore log output) is deterministic.
        for jsonl_path in sorted(folder_path.glob("*.jsonl")):
            # Incremental: skip files not modified since threshold
            if since_mtime is not None:
                try:
                    file_mtime = jsonl_path.stat().st_mtime
                except OSError:
                    # File vanished between glob() and stat(); skip it.
                    continue
                if file_mtime <= since_mtime:
                    continue

            file_synced, file_errors = self._sync_file(jsonl_path, default_label)
            synced += file_synced
            errors += file_errors

        logger.info(
            "KGSyncer._sync_folder(%s): synced=%d errors=%d",
            folder,
            synced,
            errors,
        )
        return {"synced": synced, "errors": errors}

    def _sync_file(
        self,
        path: Path,
        default_label: str,
    ) -> tuple[int, int]:
        """
        Parse a single .jsonl file and upsert each valid line into FalkorDB.

        Malformed JSON lines and non-dict lines are skipped and counted as
        errors; blank lines are ignored silently.

        Returns (synced_count, error_count).
        """
        synced = 0
        errors = 0

        try:
            # errors="replace" keeps a file with stray bad bytes syncable.
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            logger.warning("KGSyncer: cannot read %s: %s", path, exc)
            return 0, 1

        for line_no, raw_line in enumerate(text.splitlines(), start=1):
            line = raw_line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as exc:
                logger.debug(
                    "KGSyncer: malformed JSON at %s:%d — %s",
                    path.name,
                    line_no,
                    exc,
                )
                errors += 1
                continue

            if not isinstance(data, dict):
                logger.debug(
                    "KGSyncer: non-dict line at %s:%d, skipping",
                    path.name,
                    line_no,
                )
                errors += 1
                continue

            entity_id = _extract_id(data, path.stem, line_no)
            entity_type = (
                data.get("type")
                or data.get("category")
                or default_label
            )
            if not isinstance(entity_type, str) or not entity_type:
                # "type"/"category" may hold non-string junk; fall back.
                entity_type = default_label

            # Build properties dict.  Per the module conventions, "id"
            # becomes the node ID and "type"/"category" become the label,
            # so all three are excluded here to avoid duplicating them as
            # properties.  (Previously only "id" was excluded, which
            # contradicted the documented convention.)
            props: Dict[str, Any] = {
                k: v
                for k, v in data.items()
                if k not in ("id", "type", "category")
            }
            props["_source_file"] = path.name

            if self.graph.add_entity(entity_id, entity_type, props):
                synced += 1
            else:
                errors += 1

        return synced, errors


# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------


def _extract_id(data: Dict[str, Any], file_stem: str, line_no: int) -> str:
    """
    Extract a stable node ID from a JSONL record.

    Priority order:
    1. data["id"] — explicit ID field
    2. data["entity_id"] — alternate common field name
    3. Deterministic hash of file_stem + line_no (fallback for lines without IDs)
    """
    if "id" in data and isinstance(data["id"], str) and data["id"]:
        return data["id"]
    if "entity_id" in data and isinstance(data["entity_id"], str):
        return data["entity_id"]
    # Fallback: deterministic hash so the same line maps to the same node
    hash_input = f"{file_stem}:{line_no}"
    return "auto_" + hashlib.md5(hash_input.encode()).hexdigest()[:12]
