"""
Memory Competitor Intelligence Scout — Standalone Cron Fallback
===============================================================
Runs the full GitHub + Hacker News + arXiv intelligence sweep without requiring
n8n. Designed as a fallback when n8n is unavailable, or as a one-shot
manual trigger.

Usage:
    python scout_cron.py --dry-run          # Print discoveries, write nothing
    python scout_cron.py --ingest           # Write to KG + tasks.json
    python scout_cron.py --ingest --verbose # Verbose output

The script outputs a structured summary at the end showing:
  - Total items fetched per source
  - Items passing age filter (< 7 days)
  - Items that were new (not already known)
  - Items written to KG / tasks created

Dependencies: requests (pip install requests), everything else is stdlib.
"""

import argparse
import json
import logging
import sys
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone
from pathlib import Path

import requests

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Root of the genesis-system install; all scout artefacts live beneath it.
# NOTE(review): hard-coded Windows drive path — confirm before running on
# non-Windows hosts (overridable at runtime via --output-dir).
BASE_DIR = Path("E:/genesis-system")
SCOUT_DIR = BASE_DIR / "n8n" / "memory_competitor_scout"
# Append-only JSONL audit trail of every discovery reported or ingested.
DISCOVERY_LOG = SCOUT_DIR / "discovery_log.jsonl"
# Seed list of already-known memory systems, merged with DISCOVERY_LOG for dedup.
KNOWN_SYSTEMS_FILE = SCOUT_DIR / "known_memory_systems.jsonl"

# GitHub search queries — two passes: stars (mature) + updated (bleeding edge)
# Each dict maps directly onto GitHub search API parameters; "label" tags the
# source in discovery records and the summary.
GITHUB_QUERIES = [
    {
        "q": "ai memory knowledge-graph episodic",
        "sort": "stars",
        "order": "desc",
        "per_page": 20,
        "label": "github_stars",
    },
    {
        "q": "temporal memory LLM RAG",
        "sort": "updated",
        "order": "desc",
        "per_page": 20,
        "label": "github_updated",
    },
    {
        "q": "agent memory persistent session python",
        "sort": "updated",
        "order": "desc",
        "per_page": 15,
        "label": "github_agent_memory",
    },
]

# Endpoints: HN uses the Algolia search API restricted to Show HN posts;
# arXiv returns Atom XML sorted by most recently updated.
GITHUB_API_BASE = "https://api.github.com/search/repositories"
HN_API_URL = "https://hn.algolia.com/api/v1/search?query=AI+memory+knowledge+graph&tags=show_hn&hitsPerPage=10"
ARXIV_API_URL = (
    "http://export.arxiv.org/api/query"
    "?search_query=all:episodic+memory+LLM"
    "&sortBy=lastUpdatedDate&sortOrder=descending&max_results=10"
)

MAX_AGE_DAYS = 7  # Items older than this are dropped by is_recent()
REQUEST_TIMEOUT = 20  # Seconds per HTTP request
RATE_LIMIT_SLEEP = 1.5  # Seconds between GitHub API calls

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

log = logging.getLogger("scout_cron")


def setup_logging(verbose: bool = False) -> None:
    """Configure root logging to stdout: DEBUG when verbose, otherwise INFO."""
    logging.basicConfig(
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s [%(levelname)s] %(message)s",
        level=logging.DEBUG if verbose else logging.INFO,
    )


# ---------------------------------------------------------------------------
# Age filtering
# ---------------------------------------------------------------------------

def is_recent(date_str: str, max_days: int = MAX_AGE_DAYS) -> bool:
    """Return True if *date_str* (ISO 8601) falls within *max_days* of now.

    Empty or unparseable strings count as not recent. A trailing 'Z' is
    accepted as a UTC marker, and naive timestamps are assumed to be UTC.
    """
    if not date_str:
        return False
    normalised = date_str[:-1] + "+00:00" if date_str.endswith("Z") else date_str
    try:
        parsed = datetime.fromisoformat(normalised)
    except (ValueError, TypeError):
        return False
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return datetime.now(timezone.utc) - parsed <= timedelta(days=max_days)


# ---------------------------------------------------------------------------
# Deduplication
# ---------------------------------------------------------------------------

import hashlib


def url_to_id(url: str) -> str:
    """Derive a short, stable identifier from a URL (case/whitespace insensitive)."""
    canonical = url.lower().strip()
    return hashlib.md5(canonical.encode()).hexdigest()[:12]


def load_known_ids() -> set:
    """Collect dedup identifiers (URL hashes + lowercase names) from the
    seed file and the discovery log; missing files are simply skipped."""
    identifiers = set()
    for source_file in (KNOWN_SYSTEMS_FILE, DISCOVERY_LOG):
        if not source_file.exists():
            continue
        with source_file.open("r", encoding="utf-8") as handle:
            for raw_line in handle:
                raw_line = raw_line.strip()
                if not raw_line:
                    continue
                try:
                    record = json.loads(raw_line)
                except json.JSONDecodeError:
                    # Tolerate corrupt lines in the JSONL files
                    continue
                url = record.get("url", "")
                if url:
                    identifiers.add(url_to_id(url))
                name = record.get("name", "").lower().strip()
                if name:
                    identifiers.add(name)
    return identifiers


def is_known(item: dict, known_ids: set) -> bool:
    """True when the item's URL hash or lowercase name is already recorded."""
    name_key = item.get("name", "").lower().strip()
    if name_key in known_ids:
        return True
    return url_to_id(item.get("url", "")) in known_ids


# ---------------------------------------------------------------------------
# GitHub source
# ---------------------------------------------------------------------------

def fetch_github(query_config: dict, github_token: str = None) -> list:
    """
    Query the GitHub repository search API for one configured query.

    Repos whose last push falls outside the recency window are dropped;
    the rest are normalised into discovery dicts. Network or API failures
    are logged as warnings and yield an empty list rather than raising.
    """
    request_headers = {"Accept": "application/vnd.github+json"}
    if github_token:
        request_headers["Authorization"] = f"Bearer {github_token}"

    try:
        response = requests.get(
            GITHUB_API_BASE,
            params={
                "q": query_config["q"],
                "sort": query_config["sort"],
                "order": query_config["order"],
                "per_page": query_config["per_page"],
            },
            headers=request_headers,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        payload = response.json()
    except requests.RequestException as exc:
        log.warning("GitHub fetch failed for query '%s': %s", query_config["q"], exc)
        return []

    repos = payload.get("items") or []
    discoveries = []
    for repo in repos:
        last_push = repo.get("pushed_at") or repo.get("updated_at") or ""
        if not is_recent(last_push):
            continue

        # The "license" field may be null or a nested object depending on the repo
        spdx = ""
        if repo.get("license") and isinstance(repo["license"], dict):
            spdx = repo["license"].get("spdx_id") or repo["license"].get("name") or ""

        discoveries.append({
            "name": repo.get("name") or repo.get("full_name", ""),
            "url": repo.get("html_url", ""),
            "description": repo.get("description") or "",
            "stars": repo.get("stargazers_count", 0),
            "topics": repo.get("topics") or [],
            "license": spdx,
            "pushed_at": last_push,
            "source": query_config["label"],
            "raw_date": last_push,
        })

    log.info(
        "GitHub [%s]: fetched %d repos, %d recent",
        query_config["label"], len(repos), len(discoveries),
    )
    return discoveries


# ---------------------------------------------------------------------------
# Hacker News source
# ---------------------------------------------------------------------------

def fetch_hacker_news() -> list:
    """
    Pull recent 'Show HN' posts about AI memory via the Algolia HN API.

    Returns normalised discovery dicts for posts inside the recency window;
    network failures are logged as warnings and yield an empty list.
    """
    try:
        response = requests.get(HN_API_URL, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        payload = response.json()
    except requests.RequestException as exc:
        log.warning("Hacker News fetch failed: %s", exc)
        return []

    posts = payload.get("hits") or []
    discoveries = []
    for post in posts:
        # created_at is normally ISO text; created_at_i is a unix timestamp fallback
        created = post.get("created_at") or post.get("created_at_i")
        if isinstance(created, (int, float)):
            created = datetime.fromtimestamp(created, tz=timezone.utc).isoformat()
        if not is_recent(created or ""):
            continue

        headline = post.get("title") or ""
        body_text = post.get("story_text") or ""
        # Text-only posts have no external URL; link to the HN item instead
        link = post.get("url") or f"https://news.ycombinator.com/item?id={post.get('objectID', '')}"

        discoveries.append({
            "name": headline,
            "url": link,
            "description": body_text[:400] if body_text else headline,
            "stars": post.get("points", 0),
            "topics": [],
            "license": "",
            "pushed_at": created or "",
            "source": "hackernews_show_hn",
            "raw_date": created or "",
            "hn_id": post.get("objectID", ""),
        })

    log.info("Hacker News: fetched %d hits, %d recent", len(posts), len(discoveries))
    return discoveries


# ---------------------------------------------------------------------------
# arXiv source
# ---------------------------------------------------------------------------

# Atom namespace used by the arXiv API feed
ARXIV_NS = "http://www.w3.org/2005/Atom"


def fetch_arxiv() -> list:
    """
    Fetch recent arXiv papers on episodic memory + LLMs.

    Parses the Atom XML feed returned by the arXiv API and normalises each
    recent entry into a discovery dict. Fetch or parse failures are logged
    as warnings and yield an empty list.
    """
    def _child_text(parent, tag):
        # Text of a namespaced child element, or "" when absent/empty.
        el = parent.find(f"{{{ARXIV_NS}}}{tag}")
        if el is None or el.text is None:
            return ""
        return el.text

    try:
        response = requests.get(ARXIV_API_URL, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        feed_xml = response.text
    except requests.RequestException as exc:
        log.warning("arXiv fetch failed: %s", exc)
        return []

    try:
        feed_root = ET.fromstring(feed_xml)
    except ET.ParseError as exc:
        log.warning("arXiv XML parse error: %s", exc)
        return []

    papers = feed_root.findall(f"{{{ARXIV_NS}}}entry")
    discoveries = []
    for paper in papers:
        updated = _child_text(paper, "updated")
        if not is_recent(updated):
            continue

        title = _child_text(paper, "title").strip().replace("\n", " ")
        abstract = _child_text(paper, "summary").strip().replace("\n", " ")[:500]
        # The Atom <id> element doubles as the canonical abstract-page URL
        arxiv_url = _child_text(paper, "id").strip()

        # Extract author names from namespaced <author><name> children
        author_names = []
        for author in paper.findall(f"{{{ARXIV_NS}}}author"):
            raw_name = _child_text(author, "name")
            if raw_name:
                author_names.append(raw_name.strip())

        # Derive a GitHub link if one is mentioned in the abstract
        repo_link = ""
        if "github.com" in abstract.lower():
            for word in abstract.split():
                if "github.com" in word:
                    repo_link = word.strip(".,)([]")
                    break

        discoveries.append({
            "name": title,
            "url": repo_link or arxiv_url,
            "description": abstract,
            "stars": 0,
            "topics": ["arxiv", "research-paper", "llm", "memory"],
            "license": "",
            "pushed_at": updated,
            "source": "arxiv",
            "raw_date": updated,
            "authors": author_names[:3],
            "arxiv_url": arxiv_url,
        })

    log.info("arXiv: fetched %d entries, %d recent", len(papers), len(discoveries))
    return discoveries


# ---------------------------------------------------------------------------
# Output helpers
# ---------------------------------------------------------------------------

def write_log_entry(item: dict, action: str, score: int = 0) -> None:
    """Append one JSONL record describing a discovery to DISCOVERY_LOG,
    creating the scout directory on first use."""
    SCOUT_DIR.mkdir(parents=True, exist_ok=True)
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "name": item.get("name", ""),
        "url": item.get("url", ""),
        "score": score,
        "source": item.get("source", ""),
        "description": (item.get("description") or "")[:200],
        "stars": item.get("stars", 0),
        "action": action,
    }
    serialised = json.dumps(record, ensure_ascii=False)
    with DISCOVERY_LOG.open("a", encoding="utf-8") as logfile:
        logfile.write(serialised + "\n")


def process_via_webhook(item: dict) -> dict:
    """
    Hand a discovery to the process_discovery.py processor.

    Prefers an in-process import (no network round-trip); falls back to an
    HTTP POST against the local webhook server when the module cannot be
    imported.

    Args:
        item: Normalised discovery dict (name/url/description/...).

    Returns:
        Result dict from the processor, or {"action_taken": "error", ...}
        when both paths fail.
    """
    # Try to import locally (preferred — no network round-trip).
    # Fix: guard the sys.path insertion — the original inserted SCOUT_DIR
    # unconditionally on every call, growing sys.path by one duplicate
    # entry per processed item.
    scout_path = str(SCOUT_DIR)
    if scout_path not in sys.path:
        sys.path.insert(0, scout_path)
    try:
        from process_discovery import process_discovery
        return process_discovery(item, dry_run=False)
    except ImportError:
        log.warning("Could not import process_discovery — falling back to HTTP")

    # Fallback: HTTP POST to local webhook
    try:
        resp = requests.post(
            "http://localhost:8765/memory-scout-ingest",
            json=item,
            timeout=10,
        )
        resp.raise_for_status()
        return resp.json()
    except requests.RequestException as exc:
        log.error("Webhook POST failed: %s", exc)
        return {"action_taken": "error", "error": str(exc)}


# ---------------------------------------------------------------------------
# Main scout logic
# ---------------------------------------------------------------------------

def run_scout(
    dry_run: bool = False,
    ingest: bool = False,
    github_token: str = None,
    verbose: bool = False,
) -> dict:
    """
    Execute the full intelligence sweep.

    Pipeline: fetch from GitHub + Hacker News + arXiv, dedupe within the
    batch by URL, drop items already known (seed file + discovery log),
    act on each remaining item according to the mode, then print a
    human-readable summary to stdout.

    Args:
        dry_run: Print findings without writing anything
        ingest:  Write discoveries to KG + tasks.json
        github_token: GitHub personal access token (optional, increases rate limit)
        verbose: Extra logging

    Returns:
        Summary dict with counts per source and actions taken
    """
    setup_logging(verbose)
    log.info("Memory Competitor Intelligence Scout starting — mode=%s",
             "dry_run" if dry_run else ("ingest" if ingest else "report_only"))

    known_ids = load_known_ids()
    log.info("Loaded %d known system IDs for deduplication", len(known_ids))

    # Collect from all sources
    all_items = []

    # GitHub — multiple queries, rate-limit friendly
    for query in GITHUB_QUERIES:
        items = fetch_github(query, github_token=github_token)
        all_items.extend(items)
        time.sleep(RATE_LIMIT_SLEEP)

    # Hacker News
    hn_items = fetch_hacker_news()
    all_items.extend(hn_items)
    time.sleep(RATE_LIMIT_SLEEP)

    # arXiv
    arxiv_items = fetch_arxiv()
    all_items.extend(arxiv_items)

    log.info("Total items fetched across all sources: %d", len(all_items))

    # Deduplicate within this batch by URL — first occurrence wins, so
    # GitHub results take precedence over later sources for the same URL.
    # Items with an empty URL are dropped entirely here.
    seen_urls = set()
    unique_items = []
    for item in all_items:
        url = item.get("url", "")
        uid = url_to_id(url)
        if uid not in seen_urls and url:
            seen_urls.add(uid)
            unique_items.append(item)

    log.info("Unique items after batch deduplication: %d", len(unique_items))

    # Filter against known systems
    new_items = [item for item in unique_items if not is_known(item, known_ids)]
    log.info("New items (not in KG yet): %d", len(new_items))

    # Process or report
    summary = {
        "run_at": datetime.now(timezone.utc).isoformat(),
        "mode": "dry_run" if dry_run else ("ingest" if ingest else "report_only"),
        "total_fetched": len(all_items),
        "unique_fetched": len(unique_items),
        "new_items": len(new_items),
        "actions": [],
        "by_source": {},
    }

    # Per-source total/new counts over ALL unique items (not just new ones)
    for item in unique_items:
        src = item.get("source", "unknown")
        summary["by_source"].setdefault(src, {"total": 0, "new": 0})
        summary["by_source"][src]["total"] += 1
        if not is_known(item, known_ids):
            summary["by_source"][src]["new"] += 1

    for item in new_items:
        name = item.get("name", "")
        url = item.get("url", "")
        source = item.get("source", "")
        stars = item.get("stars", 0)

        if dry_run:
            log.info("[DRY RUN] Would process: %s (%s) — source=%s stars=%d", name, url, source, stars)
            summary["actions"].append({
                "name": name, "url": url, "action": "would_process", "source": source,
            })
            # NOTE(review): this appends to discovery_log.jsonl even in dry-run
            # mode, which contradicts the "--dry-run ... write nothing" usage
            # text AND makes these items count as "known" on subsequent runs
            # (load_known_ids reads the discovery log). Confirm intent.
            write_log_entry(item, "dry_run_candidate")
        elif ingest:
            result = process_via_webhook(item)
            action = result.get("action_taken", "unknown")
            log.info("Ingested %s: action=%s score=%s", name, action, result.get("score", "?"))
            summary["actions"].append({
                "name": name,
                "url": url,
                "action": action,
                "score": result.get("score"),
                "tool_type": result.get("tool_type"),
                "source": source,
                "kg_entity_id": result.get("kg_entity_id"),
                "task_id": result.get("task_id"),
            })
        else:
            # Report only — log but don't write to KG
            log.info("FOUND: %s (%s) — source=%s stars=%d", name, url, source, stars)
            desc = (item.get("description") or "")[:120]
            if desc:
                log.info("  Description: %s", desc)
            write_log_entry(item, "report_only")
            summary["actions"].append({
                "name": name, "url": url, "action": "reported", "source": source,
            })

    # Print summary to stdout (plain print, not logging, so cron mail/CI
    # capture gets a readable report even at WARNING log level)
    print("\n" + "=" * 60)
    print("MEMORY SCOUT SWEEP SUMMARY")
    print("=" * 60)
    print(f"Run at:         {summary['run_at']}")
    print(f"Mode:           {summary['mode']}")
    print(f"Total fetched:  {summary['total_fetched']}")
    print(f"Unique items:   {summary['unique_fetched']}")
    print(f"New (unknown):  {summary['new_items']}")
    print()
    print("By source:")
    for src, counts in summary["by_source"].items():
        print(f"  {src:<30} total={counts['total']}  new={counts['new']}")

    if summary["actions"]:
        print()
        print("Actions taken:")
        for action_entry in summary["actions"]:
            score_str = f"  score={action_entry['score']}" if action_entry.get("score") is not None else ""
            type_str = f"  type={action_entry['tool_type']}" if action_entry.get("tool_type") else ""
            print(f"  [{action_entry['action']}]{score_str}{type_str}  {action_entry['name']}")
            print(f"    {action_entry['url']}")
    else:
        print("\nNo new discoveries this sweep.")

    print("=" * 60 + "\n")

    return summary


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main():
    """CLI entry point: parse arguments, run the sweep, set the exit code."""
    parser = argparse.ArgumentParser(
        description="Memory Competitor Intelligence Scout — standalone cron runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python scout_cron.py --dry-run            Print discoveries, write nothing
  python scout_cron.py --ingest             Write new items to KG + tasks.json
  python scout_cron.py --ingest --verbose   Verbose output during ingest
  python scout_cron.py --dry-run --token ghp_xxx  Use GitHub token for higher rate limits
        """,
    )

    mode_group = parser.add_mutually_exclusive_group(required=True)
    mode_group.add_argument(
        "--dry-run",
        action="store_true",
        help="Print discoveries without writing any files",
    )
    mode_group.add_argument(
        "--ingest",
        action="store_true",
        help="Process discoveries: write KG entities and evaluation tasks",
    )
    mode_group.add_argument(
        "--report",
        action="store_true",
        help="Log findings to discovery_log.jsonl but do not write to KG",
    )

    parser.add_argument(
        "--token",
        default=None,
        help="GitHub personal access token (increases rate limit from 60 to 5000/hr)",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    parser.add_argument(
        "--output-dir",
        default=None,
        help="Override output directory (default: E:\\genesis-system\\n8n\\memory_competitor_scout)",
    )

    args = parser.parse_args()

    # Allow output dir override — retarget every path derived from SCOUT_DIR.
    # Fix: the known-systems seed file previously kept pointing at the default
    # directory, so deduplication silently ignored the override.
    if args.output_dir:
        global SCOUT_DIR, DISCOVERY_LOG, KNOWN_SYSTEMS_FILE
        SCOUT_DIR = Path(args.output_dir)
        DISCOVERY_LOG = SCOUT_DIR / "discovery_log.jsonl"
        KNOWN_SYSTEMS_FILE = SCOUT_DIR / "known_memory_systems.jsonl"

    # Read GitHub token from env if not passed as arg
    github_token = args.token or os.environ.get("GITHUB_TOKEN")

    summary = run_scout(
        dry_run=args.dry_run,
        ingest=args.ingest,
        github_token=github_token,
        verbose=args.verbose,
    )

    # Exit non-zero when ingest mode found new items, so cron/CI can alert.
    # Fix: both branches previously called sys.exit(0), making the check
    # dead code and the comment a lie.
    if args.ingest and summary.get("new_items", 0) > 0:
        sys.exit(1)
    sys.exit(0)


# NOTE(review): this bottom-of-file import works only because main() runs
# after the whole module has executed; it belongs in the stdlib import
# block at the top of the file.
import os  # noqa: E402 (needed for GITHUB_TOKEN env var)

# Entry guard: run the sweep only when executed as a script, not on import.
if __name__ == "__main__":
    main()
