#!/usr/bin/env python3
"""
GENESIS AI RESEARCH SCOUT
==========================
Autonomous AI research intelligence gathering skill.

Continuously harvests breakthrough AI developments from forums,
blogs, and communities to fuel Genesis evolution.

Sources:
- Reddit (r/MachineLearning, r/LocalLLaMA, r/artificial, r/ClaudeAI)
- GitHub Trending (AI/ML repositories)
- Hacker News (AI-tagged posts)
- ArXiv (cs.AI, cs.LG, cs.CL papers)
- AI Blogs (Simon Willison, Lilian Weng, etc.)

Usage:
    scout = AIResearchScout()
    insights = scout.scan_all_sources()
    tasks = scout.generate_rwl_tasks(insights)
"""

import json
import time
import hashlib
import re
from datetime import datetime, timedelta
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
from enum import Enum
import urllib.request
import urllib.parse
import urllib.error


class SourceType(Enum):
    """Kinds of research sources the scout can harvest from.

    The string values act as stable identifiers when intel items are
    serialized (e.g. into the insights JSONL log).
    """
    REDDIT = "reddit"
    GITHUB = "github"
    HACKERNEWS = "hackernews"
    ARXIV = "arxiv"
    # NOTE(review): BLOG and YOUTUBE have no scan_* implementation in this
    # file yet — they are declared for forward compatibility. Confirm before
    # relying on them.
    BLOG = "blog"
    YOUTUBE = "youtube"


class RelevanceLevel(Enum):
    """Relevance scoring levels; a higher value means more relevant.

    The numeric values are deliberately ordered so that thresholds can be
    compared with ``level.value >= min_level.value`` (see
    ``AIResearchScout.filter_by_relevance``).
    """
    CRITICAL = 5  # Immediate action required
    HIGH = 4      # Should review today
    MEDIUM = 3    # Worth tracking
    LOW = 2       # Background awareness
    NOISE = 1     # Skip


@dataclass
class RawIntel:
    """A single unprocessed item harvested from one research source."""
    source: SourceType
    source_url: str
    title: str
    content: str
    author: str
    timestamp: str
    score: int = 0  # engagement metric: upvotes, stars, HN points, ...
    comments: int = 0
    tags: List[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def id(self) -> str:
        """Stable 12-character identifier derived from the source URL."""
        digest = hashlib.md5(self.source_url.encode())
        return digest.hexdigest()[:12]


@dataclass
class InsightReport:
    """Actionable insight distilled from a single RawIntel item."""
    raw_intel_id: str              # RawIntel.id of the source item
    title: str                     # truncated to 200 chars by extract_insight
    summary: str                   # truncated to 500 chars by extract_insight
    key_findings: List[str]        # up to 5 keyword-bearing sentences
    relevance: RelevanceLevel
    relevance_score: float         # 0.0-1.0, see calculate_relevance
    genesis_applicability: str     # comma-joined applicability areas
    action_items: List[str]        # suggested follow-up actions
    related_topics: List[str]      # source tags carried over
    extracted_at: str = field(default_factory=lambda: datetime.now().isoformat())


@dataclass
class RWLTask:
    """Task for Ralph Wiggum Loop execution, generated from an insight."""
    id: str                                   # "research-<intel id>-<epoch>"
    title: str
    description: str                          # markdown body built from the insight
    acceptance_criteria: List[Dict[str, str]]  # [{"id", "description", "verification"}]
    source_insight_id: str                    # InsightReport.raw_intel_id
    priority: int                             # 1-10, lower = higher priority
    estimated_effort: str                     # "small" or "medium"
    tags: List[str]
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())
    passes: bool = False  # set True once the task has been completed/verified


class AIResearchScout:
    """
    Autonomous AI research intelligence gathering.

    Scans multiple sources for breakthrough AI developments
    and converts them into actionable insights and RWL tasks.
    """

    CONFIG_PATH = Path("E:/genesis-system/config/research_sources.yaml")
    DATA_DIR = Path("E:/genesis-system/data/research_intel")
    CACHE_PATH = Path("E:/genesis-system/data/research_intel/cache.json")
    INSIGHTS_PATH = Path("E:/genesis-system/data/research_intel/insights.jsonl")
    TASKS_PATH = Path("E:/genesis-system/loop/research_tasks.json")

    # Genesis-relevant keywords for filtering
    GENESIS_KEYWORDS = [
        # Core tech
        "llm", "large language model", "gpt", "claude", "gemini", "anthropic",
        "openai", "google ai", "mistral", "llama", "qwen",

        # Techniques
        "prompt engineering", "rag", "retrieval augmented", "fine-tuning",
        "lora", "qlora", "quantization", "inference", "chain of thought",
        "reasoning", "agent", "agentic", "tool use", "function calling",
        "mcp", "model context protocol",

        # Voice/Audio
        "voice ai", "speech to text", "text to speech", "whisper",
        "realtime audio", "voice agent", "conversation ai",

        # Frameworks
        "langchain", "llamaindex", "autogen", "crewai", "dspy",
        "semantic kernel", "guidance", "instructor",

        # Infra
        "vector database", "embedding", "qdrant", "pinecone", "weaviate",
        "ollama", "vllm", "tgi", "local llm",

        # Business
        "ai automation", "workflow", "n8n", "make", "zapier",
        "receptionist", "booking", "lead", "crm", "gohighlevel",

        # Breakthroughs
        "breakthrough", "state of the art", "sota", "new model",
        "benchmark", "performance", "speed", "cost reduction"
    ]

    # Sources configuration
    SOURCES = {
        "reddit": {
            "subreddits": ["MachineLearning", "LocalLLaMA", "artificial", "ClaudeAI", "Oobabooga"],
            "min_score": 50,
            "max_age_hours": 48
        },
        "github": {
            "topics": ["llm", "ai-agents", "langchain", "voice-ai"],
            "min_stars": 100,
            "language": "python"
        },
        "hackernews": {
            "tags": ["ai", "llm", "machine-learning", "gpt", "claude"],
            "min_points": 50
        },
        "arxiv": {
            "categories": ["cs.AI", "cs.LG", "cs.CL"],
            "max_results": 20
        }
    }

    def __init__(self):
        self.DATA_DIR.mkdir(parents=True, exist_ok=True)
        self.cache = self._load_cache()
        self.session_insights: List[InsightReport] = []

    def _load_cache(self) -> Dict:
        """Load the seen-items cache, falling back to defaults if missing or corrupt."""
        if self.CACHE_PATH.exists():
            # A truncated/corrupt cache file must not abort the whole scan;
            # worst case we re-process some items.
            try:
                with open(self.CACHE_PATH) as f:
                    return json.load(f)
            except (json.JSONDecodeError, OSError) as e:
                print(f"Warning: cache unreadable ({e}); starting with empty cache")
        return {"seen_ids": [], "last_scan": {}}

    def _save_cache(self):
        """Save cache to disk."""
        with open(self.CACHE_PATH, "w") as f:
            json.dump(self.cache, f, indent=2)

    def _is_seen(self, intel_id: str) -> bool:
        """Check if item was already processed."""
        return intel_id in self.cache.get("seen_ids", [])

    def _mark_seen(self, intel_id: str):
        """Mark item as processed (bounded to the most recent 10000 IDs)."""
        if "seen_ids" not in self.cache:
            self.cache["seen_ids"] = []
        self.cache["seen_ids"].append(intel_id)
        # Keep only last 10000 IDs
        self.cache["seen_ids"] = self.cache["seen_ids"][-10000:]

    def _fetch_url(self, url: str, headers: Optional[Dict] = None) -> Optional[str]:
        """Fetch URL content as UTF-8 text; return None on any failure.

        Errors are deliberately swallowed (printed) so one dead source
        cannot abort a multi-source scan cycle.
        """
        try:
            req = urllib.request.Request(
                url,
                headers=headers or {"User-Agent": "Genesis-Research-Scout/1.0"}
            )
            with urllib.request.urlopen(req, timeout=30) as response:
                return response.read().decode('utf-8')
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None

    def scan_reddit(self) -> List[RawIntel]:
        """Scan configured subreddits' hot listings for AI discussions.

        Returns only items that pass the score/age filters and have not
        been seen before.
        """
        intel_items = []
        config = self.SOURCES["reddit"]

        for subreddit in config["subreddits"]:
            url = f"https://www.reddit.com/r/{subreddit}/hot.json?limit=50"
            content = self._fetch_url(url)

            if not content:
                continue

            try:
                data = json.loads(content)
                posts = data.get("data", {}).get("children", [])

                for post in posts:
                    post_data = post.get("data", {})

                    # Filter by score
                    if post_data.get("score", 0) < config["min_score"]:
                        continue

                    # Filter by age
                    created = post_data.get("created_utc", 0)
                    age_hours = (time.time() - created) / 3600
                    if age_hours > config["max_age_hours"]:
                        continue

                    intel = RawIntel(
                        source=SourceType.REDDIT,
                        source_url=f"https://reddit.com{post_data.get('permalink', '')}",
                        title=post_data.get("title", ""),
                        content=post_data.get("selftext", "")[:2000],
                        author=post_data.get("author", "unknown"),
                        timestamp=datetime.fromtimestamp(created).isoformat(),
                        score=post_data.get("score", 0),
                        comments=post_data.get("num_comments", 0),
                        tags=[subreddit],
                        metadata={"subreddit": subreddit, "flair": post_data.get("link_flair_text")}
                    )

                    if not self._is_seen(intel.id):
                        intel_items.append(intel)

            except json.JSONDecodeError as e:
                print(f"Error parsing Reddit JSON for r/{subreddit}: {e}")

        return intel_items

    def scan_hackernews(self) -> List[RawIntel]:
        """Scan Hacker News top stories for AI-related posts."""
        intel_items = []

        # Get top stories
        url = "https://hacker-news.firebaseio.com/v0/topstories.json"
        content = self._fetch_url(url)

        if not content:
            return intel_items

        try:
            story_ids = json.loads(content)[:100]  # Top 100

            for story_id in story_ids[:30]:  # Check first 30
                story_url = f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json"
                story_content = self._fetch_url(story_url)

                if not story_content:
                    continue

                story = json.loads(story_content)

                # Filter by score
                if story.get("score", 0) < self.SOURCES["hackernews"]["min_points"]:
                    continue

                title = story.get("title", "").lower()
                # Check if AI-related. Word-boundary matching avoids the
                # substring false positives of the naive check ("ai" inside
                # "maintain"/"email", "ml" inside "html").
                if not (re.search(r"\b(ai|llm|gpt|claude|model|ml)\b", title)
                        or "machine learning" in title):
                    continue

                intel = RawIntel(
                    source=SourceType.HACKERNEWS,
                    source_url=story.get("url", f"https://news.ycombinator.com/item?id={story_id}"),
                    title=story.get("title", ""),
                    content=story.get("text", "")[:2000] if story.get("text") else "",
                    author=story.get("by", "unknown"),
                    timestamp=datetime.fromtimestamp(story.get("time", 0)).isoformat(),
                    score=story.get("score", 0),
                    comments=story.get("descendants", 0),
                    tags=["hackernews"],
                    metadata={"hn_id": story_id}
                )

                if not self._is_seen(intel.id):
                    intel_items.append(intel)

        except json.JSONDecodeError as e:
            print(f"Error parsing HN JSON: {e}")

        return intel_items

    def scan_github_trending(self) -> List[RawIntel]:
        """Scan GitHub for recently-updated popular AI repositories."""
        intel_items = []

        # Use GitHub search API for recent popular repos
        config = self.SOURCES["github"]
        topics = "+".join([f"topic:{t}" for t in config["topics"]])
        query = f"{topics}+language:{config['language']}+stars:>{config['min_stars']}"
        url = f"https://api.github.com/search/repositories?q={query}&sort=updated&order=desc&per_page=30"

        content = self._fetch_url(url, headers={
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Genesis-Research-Scout/1.0"
        })

        if not content:
            return intel_items

        try:
            data = json.loads(content)
            repos = data.get("items", [])

            for repo in repos:
                intel = RawIntel(
                    source=SourceType.GITHUB,
                    source_url=repo.get("html_url", ""),
                    title=f"{repo.get('full_name', '')}: {repo.get('description', '')}",
                    content=repo.get("description", "")[:2000],
                    author=repo.get("owner", {}).get("login", "unknown"),
                    timestamp=repo.get("updated_at", ""),
                    score=repo.get("stargazers_count", 0),
                    comments=repo.get("open_issues_count", 0),
                    tags=repo.get("topics", [])[:5],
                    metadata={
                        "language": repo.get("language"),
                        "forks": repo.get("forks_count"),
                        "watchers": repo.get("watchers_count")
                    }
                )

                if not self._is_seen(intel.id):
                    intel_items.append(intel)

        except json.JSONDecodeError as e:
            print(f"Error parsing GitHub JSON: {e}")

        return intel_items

    def scan_arxiv(self) -> List[RawIntel]:
        """Scan ArXiv for the newest papers in the configured categories."""
        intel_items = []
        config = self.SOURCES["arxiv"]

        categories = "+OR+".join([f"cat:{cat}" for cat in config["categories"]])
        url = f"http://export.arxiv.org/api/query?search_query={categories}&sortBy=submittedDate&sortOrder=descending&max_results={config['max_results']}"

        content = self._fetch_url(url)

        if not content:
            return intel_items

        # Simple XML parsing (avoiding external dependencies).
        # NOTE(review): regex over Atom XML is fragile if arXiv changes
        # formatting; consider xml.etree.ElementTree if this breaks.
        entries = re.findall(r'<entry>(.*?)</entry>', content, re.DOTALL)

        for entry in entries:
            title_match = re.search(r'<title>(.*?)</title>', entry, re.DOTALL)
            summary_match = re.search(r'<summary>(.*?)</summary>', entry, re.DOTALL)
            id_match = re.search(r'<id>(.*?)</id>', entry)
            author_matches = re.findall(r'<name>(.*?)</name>', entry)
            published_match = re.search(r'<published>(.*?)</published>', entry)

            if title_match and id_match:
                title = title_match.group(1).strip().replace('\n', ' ')
                summary = summary_match.group(1).strip().replace('\n', ' ')[:2000] if summary_match else ""

                intel = RawIntel(
                    source=SourceType.ARXIV,
                    source_url=id_match.group(1).strip(),
                    title=title,
                    content=summary,
                    author=author_matches[0] if author_matches else "unknown",
                    timestamp=published_match.group(1).strip() if published_match else "",
                    score=0,  # ArXiv doesn't have scores
                    comments=0,
                    tags=["arxiv", "paper"],
                    metadata={"authors": author_matches[:5]}
                )

                if not self._is_seen(intel.id):
                    intel_items.append(intel)

        return intel_items

    def calculate_relevance(self, intel: RawIntel) -> Tuple[RelevanceLevel, float]:
        """
        Calculate relevance score for Genesis.

        Score = keyword density (capped at 1.0 for 10+ matches) plus
        engagement and breakthrough boosts, capped at 1.0 overall.

        Returns:
            (RelevanceLevel, score 0-1)
        """
        text = f"{intel.title} {intel.content}".lower()
        keyword_matches = sum(1 for kw in self.GENESIS_KEYWORDS if kw.lower() in text)

        # Base score from keyword matches
        score = min(1.0, keyword_matches / 10)

        # Boost for high engagement
        if intel.score > 500:
            score += 0.2
        elif intel.score > 200:
            score += 0.1

        # Boost for comments (discussion = importance)
        if intel.comments > 100:
            score += 0.15
        elif intel.comments > 50:
            score += 0.1

        # Boost for breakthrough keywords
        breakthrough_keywords = ["breakthrough", "state of the art", "new model", "released", "launched", "announcing"]
        if any(kw in text for kw in breakthrough_keywords):
            score += 0.2

        # Cap at 1.0
        score = min(1.0, score)

        # Map to level
        if score >= 0.8:
            level = RelevanceLevel.CRITICAL
        elif score >= 0.6:
            level = RelevanceLevel.HIGH
        elif score >= 0.4:
            level = RelevanceLevel.MEDIUM
        elif score >= 0.2:
            level = RelevanceLevel.LOW
        else:
            level = RelevanceLevel.NOISE

        return level, score

    def filter_by_relevance(
        self,
        intel_items: List[RawIntel],
        min_level: RelevanceLevel = RelevanceLevel.MEDIUM
    ) -> List[Tuple[RawIntel, RelevanceLevel, float]]:
        """Filter intel items by relevance threshold, best-scoring first."""
        filtered = []

        for intel in intel_items:
            level, score = self.calculate_relevance(intel)
            if level.value >= min_level.value:
                filtered.append((intel, level, score))

        # Sort by score descending
        filtered.sort(key=lambda x: x[2], reverse=True)

        return filtered

    def extract_insight(self, intel: RawIntel, relevance: RelevanceLevel, score: float) -> InsightReport:
        """
        Extract actionable insight from raw intel.

        Note: In production, this would use Gemini to analyze.
        For now, uses rule-based extraction.
        """
        # Extract key findings: keyword-bearing sentences from the content
        key_findings = []
        sentences = re.split(r'[.!?]', intel.content)
        for sentence in sentences[:5]:
            sentence = sentence.strip()
            if len(sentence) > 20 and any(kw in sentence.lower() for kw in self.GENESIS_KEYWORDS[:20]):
                key_findings.append(sentence)

        # Determine applicability: map trigger keywords to Genesis areas
        applicability_keywords = {
            "voice": "Voice AI / ReceptionistAI",
            "agent": "Agentic workflows / RWL",
            "rag": "Knowledge retrieval / Memory",
            "prompt": "Prompt engineering / Axioms",
            "cost": "Cost optimization / Budget",
            "speed": "Performance / Latency",
            "local": "Local deployment / Ollama"
        }

        applicability = []
        text = f"{intel.title} {intel.content}".lower()
        for kw, area in applicability_keywords.items():
            if kw in text:
                applicability.append(area)

        genesis_applicability = ", ".join(applicability[:3]) if applicability else "General AI advancement"

        # Generate action items
        action_items = []
        if "released" in text or "new" in text or "launched" in text:
            action_items.append("Evaluate new tool/model for Genesis integration")
        if "benchmark" in text or "performance" in text:
            action_items.append("Compare against current Genesis performance")
        if "code" in text or "github" in text:
            action_items.append("Review source code for implementation patterns")
        if "tutorial" in text or "guide" in text:
            action_items.append("Extract learnings for Genesis documentation")

        if not action_items:
            action_items.append("Monitor for developments")

        return InsightReport(
            raw_intel_id=intel.id,
            title=intel.title[:200],
            summary=intel.content[:500],
            key_findings=key_findings[:5],
            relevance=relevance,
            relevance_score=score,
            genesis_applicability=genesis_applicability,
            action_items=action_items,
            related_topics=intel.tags[:5]
        )

    def generate_rwl_task(self, insight: InsightReport) -> Optional[RWLTask]:
        """
        Generate RWL task from insight if actionable.

        Only generates tasks for HIGH or CRITICAL insights.
        """
        if insight.relevance.value < RelevanceLevel.HIGH.value:
            return None

        # Map relevance to priority (1-10, lower = higher priority)
        priority_map = {
            RelevanceLevel.CRITICAL: 2,
            RelevanceLevel.HIGH: 4,
            RelevanceLevel.MEDIUM: 6,
            RelevanceLevel.LOW: 8
        }

        # Build acceptance criteria from action items
        acceptance_criteria = []
        for i, action in enumerate(insight.action_items[:3]):
            acceptance_criteria.append({
                "id": f"AC{i+1}",
                "description": action,
                "verification": "manual_review"
            })

        task_id = f"research-{insight.raw_intel_id}-{int(time.time())}"

        return RWLTask(
            id=task_id,
            title=f"[Research] {insight.title[:80]}",
            description=f"""## Source Insight
{insight.summary}

## Key Findings
{chr(10).join('- ' + f for f in insight.key_findings)}

## Genesis Applicability
{insight.genesis_applicability}

## Action Required
{chr(10).join('- ' + a for a in insight.action_items)}
""",
            acceptance_criteria=acceptance_criteria,
            source_insight_id=insight.raw_intel_id,
            priority=priority_map.get(insight.relevance, 5),
            estimated_effort="small" if len(insight.action_items) <= 2 else "medium",
            tags=["research", "auto-generated"] + insight.related_topics[:3]
        )

    def scan_all_sources(self) -> List[InsightReport]:
        """
        Scan all configured sources and extract insights.

        Returns:
            List of InsightReport objects
        """
        print(f"\n{'='*60}")
        print(f"AI RESEARCH SCOUT - {datetime.now().isoformat()}")
        print(f"{'='*60}\n")

        all_intel = []

        # Scan each source
        print("Scanning Reddit...")
        reddit_intel = self.scan_reddit()
        print(f"  Found {len(reddit_intel)} new items")
        all_intel.extend(reddit_intel)

        print("Scanning Hacker News...")
        hn_intel = self.scan_hackernews()
        print(f"  Found {len(hn_intel)} new items")
        all_intel.extend(hn_intel)

        print("Scanning GitHub Trending...")
        gh_intel = self.scan_github_trending()
        print(f"  Found {len(gh_intel)} new items")
        all_intel.extend(gh_intel)

        print("Scanning ArXiv...")
        arxiv_intel = self.scan_arxiv()
        print(f"  Found {len(arxiv_intel)} new items")
        all_intel.extend(arxiv_intel)

        print(f"\nTotal raw intel: {len(all_intel)}")

        # Filter by relevance
        filtered = self.filter_by_relevance(all_intel, RelevanceLevel.MEDIUM)
        print(f"Filtered to {len(filtered)} relevant items")

        # Extract insights
        insights = []
        for intel, level, score in filtered:
            insight = self.extract_insight(intel, level, score)
            insights.append(insight)
            self._mark_seen(intel.id)

            # Log insight
            self._log_insight(insight)

        # BUGFIX: record the scan timestamp BEFORE persisting the cache.
        # Previously it was set after _save_cache(), so it was never written
        # to disk. setdefault also guards against a cache missing "last_scan".
        self.cache.setdefault("last_scan", {})["all"] = datetime.now().isoformat()
        self._save_cache()

        print(f"\nExtracted {len(insights)} insights")

        # Summary by relevance
        for level in RelevanceLevel:
            count = sum(1 for i in insights if i.relevance == level)
            if count > 0:
                print(f"  {level.name}: {count}")

        return insights

    def _log_insight(self, insight: InsightReport):
        """Append insight to the JSONL log (enums stringified via default=str)."""
        self.INSIGHTS_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(self.INSIGHTS_PATH, "a") as f:
            f.write(json.dumps(asdict(insight), default=str) + "\n")

    def generate_rwl_tasks(self, insights: List[InsightReport]) -> List[RWLTask]:
        """
        Generate RWL tasks from insights.

        Only creates tasks for HIGH+ relevance items.
        """
        tasks = []

        for insight in insights:
            task = self.generate_rwl_task(insight)
            if task:
                tasks.append(task)

        # Save tasks
        if tasks:
            self._save_tasks(tasks)

        return tasks

    def _save_tasks(self, tasks: List[RWLTask]):
        """Save tasks to JSON file for RWL consumption (deduplicated by ID)."""
        # Load existing tasks; a corrupt file must not lose the new tasks
        existing = []
        if self.TASKS_PATH.exists():
            try:
                with open(self.TASKS_PATH) as f:
                    existing = json.load(f)
            except (json.JSONDecodeError, OSError) as e:
                print(f"Warning: existing tasks file unreadable ({e}); rewriting")
                existing = []

        # Add new tasks (avoid duplicates by ID)
        existing_ids = {t.get("id") for t in existing}
        for task in tasks:
            if task.id not in existing_ids:
                existing.append(asdict(task))

        # Save
        with open(self.TASKS_PATH, "w") as f:
            json.dump(existing, f, indent=2)

        print(f"Saved {len(tasks)} new research tasks to {self.TASKS_PATH}")

    def run_scan_cycle(self) -> Dict[str, Any]:
        """
        Run a complete scan cycle.

        Returns:
            Summary of scan results
        """
        start_time = time.time()

        insights = self.scan_all_sources()
        tasks = self.generate_rwl_tasks(insights)

        elapsed = time.time() - start_time

        return {
            "timestamp": datetime.now().isoformat(),
            "elapsed_seconds": elapsed,
            "total_insights": len(insights),
            "tasks_generated": len(tasks),
            "insights_by_level": {
                level.name: sum(1 for i in insights if i.relevance == level)
                for level in RelevanceLevel
            }
        }


def main():
    """Command-line entry point for the AI Research Scout."""
    import argparse

    arg_parser = argparse.ArgumentParser(description="Genesis AI Research Scout")
    arg_parser.add_argument("command", choices=["scan", "status", "insights", "tasks"])
    arg_parser.add_argument("--source", type=str, help="Specific source to scan")
    arg_parser.add_argument("--limit", type=int, default=10, help="Limit results")
    opts = arg_parser.parse_args()

    scout = AIResearchScout()

    if opts.command == "scan":
        # Full scan cycle, then a JSON summary of what was found.
        summary = scout.run_scan_cycle()
        print(f"\n{'='*40}")
        print("SCAN COMPLETE")
        print(f"{'='*40}")
        print(json.dumps(summary, indent=2))
        return

    if opts.command == "status":
        # Report cache state: number of de-dup IDs and last scan times.
        seen = scout.cache.get('seen_ids', [])
        last = scout.cache.get('last_scan', {})
        print(f"\nCache status:")
        print(f"  Seen items: {len(seen)}")
        print(f"  Last scans: {json.dumps(last, indent=4)}")
        return

    if opts.command == "insights":
        if not scout.INSIGHTS_PATH.exists():
            print("No insights found yet. Run 'scan' first.")
            return
        print(f"\nRecent insights (last {opts.limit}):\n")
        with open(scout.INSIGHTS_PATH) as f:
            recent = f.readlines()[-opts.limit:]
            for raw_line in recent:
                record = json.loads(raw_line)
                print(f"[{record['relevance']}] {record['title'][:60]}")
                print(f"  Applicability: {record['genesis_applicability']}")
                print()
        return

    if opts.command == "tasks":
        if not scout.TASKS_PATH.exists():
            print("No tasks found. Run 'scan' first.")
            return
        with open(scout.TASKS_PATH) as f:
            all_tasks = json.load(f)
            # Only tasks not yet marked as passing are considered pending.
            pending = [t for t in all_tasks if not t.get("passes")]
            print(f"\nResearch tasks: {len(pending)} pending\n")
            for entry in pending[:opts.limit]:
                print(f"[P{entry['priority']}] {entry['title'][:60]}")
                print(f"  Effort: {entry['estimated_effort']}")
                print()


# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()
