#!/usr/bin/env python3
"""
Web Intelligence Skill for Genesis System

This skill provides comprehensive web scraping and intelligence gathering
capabilities using Firecrawl-inspired patterns. It extracts branding information,
captures screenshots, generates site maps, and stores results in a knowledge base.

Usage:
    python web_intelligence.py <url>

    Or import and use programmatically:
    from web_intelligence import WebIntelligence
    intel = WebIntelligence()
    results = intel.analyze_website("https://example.com")
"""

import hashlib
import json
import logging
import os
import re
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
from urllib.parse import urlparse, urljoin

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Optional imports with graceful fallback
try:
    import requests
    REQUESTS_AVAILABLE = True
except ImportError:
    REQUESTS_AVAILABLE = False
    logger.warning("requests not installed. Install with: pip install requests")

try:
    from bs4 import BeautifulSoup
    BS4_AVAILABLE = True
except ImportError:
    BS4_AVAILABLE = False
    logger.warning("beautifulsoup4 not installed. Install with: pip install beautifulsoup4")

try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    logger.warning("Pillow not installed. Install with: pip install Pillow")


@dataclass
class BrandingInfo:
    """Represents extracted branding information from a website.

    All collection fields use ``field(default_factory=...)`` so each
    instance gets its own list/dict (the previous ``None`` +
    ``__post_init__`` pattern is the manual equivalent).
    """
    site_name: str = ""          # from og:site_name or <title>
    tagline: str = ""            # first <h1> text
    logo_url: str = ""           # absolute URL of the detected logo image
    primary_colors: List[str] = field(default_factory=list)  # up to 5 hex colors
    favicon_url: str = ""        # absolute URL of the favicon
    meta_description: str = ""   # <meta name="description"> content
    social_links: Dict[str, str] = field(default_factory=dict)  # platform -> href


@dataclass
class PageInfo:
    """Represents information about a single page.

    Collection fields use ``field(default_factory=...)`` so instances do
    not share mutable state (replaces the ``None`` + ``__post_init__``
    workaround).
    """
    url: str
    title: str
    description: str = ""        # <meta name="description"> content
    headings: List[str] = field(default_factory=list)  # h1-h6 text, in document order
    links: List[str] = field(default_factory=list)     # deduped internal links (absolute)
    images: List[str] = field(default_factory=list)    # absolute image URLs
    text_content: str = ""       # visible text, truncated by the extractor
    word_count: int = 0          # whitespace-split word count of text_content


@dataclass
class SiteMapEntry:
    """Represents an entry in the site map.

    ``children`` uses ``field(default_factory=list)`` so each entry owns
    its own list (replaces the ``None`` + ``__post_init__`` workaround).
    """
    url: str
    title: str
    depth: int                   # 0 for the crawl root
    parent_url: str = ""         # empty for the root entry
    children: List[str] = field(default_factory=list)  # child URLs queued from this page


@dataclass
class WebIntelligenceResult:
    """Aggregated output of a full web intelligence analysis run."""
    url: str
    timestamp: str
    branding: BrandingInfo
    pages: List[PageInfo]
    sitemap: List[SiteMapEntry]
    screenshots: List[str]
    metadata: Dict[str, Any]

    def to_dict(self) -> Dict:
        """Return a JSON-serializable dict view of this result.

        Nested dataclasses are converted via ``dataclasses.asdict``;
        ``screenshots`` and ``metadata`` are passed through as-is.
        """
        return {
            "url": self.url,
            "timestamp": self.timestamp,
            "branding": asdict(self.branding),
            "pages": list(map(asdict, self.pages)),
            "sitemap": list(map(asdict, self.sitemap)),
            "screenshots": self.screenshots,
            "metadata": self.metadata,
        }


class WebIntelligence:
    """
    Web Intelligence gathering system using Firecrawl-inspired patterns.

    Features:
    - Branding extraction (logos, colors, taglines)
    - Screenshot capture
    - Site map generation
    - Content extraction and analysis
    - Knowledge base storage
    """

    def __init__(self, knowledge_base_path: Optional[str] = None, max_depth: int = 2, max_pages: int = 50):
        """
        Initialize the Web Intelligence system.

        Args:
            knowledge_base_path: Path to store results (default:
                ../knowledge_base/web_intel relative to this file)
            max_depth: Maximum crawl depth for site mapping
            max_pages: Maximum pages to crawl
        """
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited_urls = set()
        # Session stays None when `requests` is missing; network helpers
        # check REQUESTS_AVAILABLE before touching it.
        self.session = requests.Session() if REQUESTS_AVAILABLE else None

        # Set up knowledge base directory eagerly so later writes succeed.
        if knowledge_base_path is None:
            knowledge_base_path = os.path.join(os.path.dirname(__file__), "..", "knowledge_base", "web_intel")
        self.knowledge_base_path = Path(knowledge_base_path)
        self.knowledge_base_path.mkdir(parents=True, exist_ok=True)

        # Identify the crawler politely to the sites it visits.
        if self.session:
            self.session.headers.update({
                "User-Agent": "Genesis-WebIntelligence/1.0 (Educational Research Bot)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
            })

    def _fetch_page(self, url: str, timeout: int = 30) -> Optional[str]:
        """Fetch a page's HTML content, or None on any failure (logged)."""
        if not REQUESTS_AVAILABLE:
            logger.error("requests library not available")
            return None

        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logger.error(f"Failed to fetch {url}: {e}")
            return None

    def _parse_html(self, html: str) -> Optional[Any]:
        """Parse HTML into a BeautifulSoup tree, or None if bs4 is missing."""
        if not BS4_AVAILABLE:
            logger.error("BeautifulSoup not available")
            return None
        return BeautifulSoup(html, 'html.parser')

    def extract_branding(self, url: str, soup: Any) -> BrandingInfo:
        """
        Extract branding information from a webpage.

        Uses multiple signals to identify brand elements:
        - Meta tags (og:site_name, description)
        - Logo images (common patterns)
        - Color extraction from inline CSS
        - Social media links

        Args:
            url: Page URL, used to resolve relative asset URLs.
            soup: Parsed BeautifulSoup tree (None yields empty branding).

        Returns:
            BrandingInfo with whatever fields could be found.
        """
        branding = BrandingInfo()

        if soup is None:
            return branding

        # Site name: prefer og:site_name, fall back to <title>.
        og_site_name = soup.find("meta", property="og:site_name")
        if og_site_name:
            branding.site_name = og_site_name.get("content", "")
        elif soup.title:
            branding.site_name = soup.title.string or ""

        # Tagline heuristic: first <h1> text.
        h1 = soup.find("h1")
        if h1:
            branding.tagline = h1.get_text(strip=True)

        # Meta description.
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            branding.meta_description = meta_desc.get("content", "")

        # Logo detection: try common markup patterns in priority order.
        logo_patterns = [
            soup.find("img", class_=lambda x: x and "logo" in x.lower() if x else False),
            soup.find("img", id=lambda x: x and "logo" in x.lower() if x else False),
            soup.find("img", alt=lambda x: x and "logo" in x.lower() if x else False),
            soup.find("a", class_="logo"),
            soup.find("div", class_="logo"),
        ]

        for logo in logo_patterns:
            if not logo:
                continue
            if logo.name == "img":
                src = logo.get("src", "")
            else:
                img = logo.find("img")
                src = img.get("src", "") if img else ""
            # Only accept a pattern that actually yields an image URL;
            # otherwise keep trying the remaining patterns (an empty src
            # would urljoin() to the page URL itself, which is wrong).
            if src:
                branding.logo_url = urljoin(url, src)
                break

        # Favicon: any <link rel="...icon...">.
        favicon = soup.find("link", rel=lambda x: x and "icon" in str(x).lower() if x else False)
        if favicon:
            branding.favicon_url = urljoin(url, favicon.get("href", ""))

        # Social links: keep the last anchor seen per platform.
        social_platforms = ["twitter", "facebook", "linkedin", "instagram", "youtube", "github"]
        for link in soup.find_all("a", href=True):
            href = link.get("href", "").lower()
            for platform in social_platforms:
                if platform in href:
                    branding.social_links[platform] = link.get("href")
                    break

        # Primary colors (simplified): hex literals found in inline styles.
        colors = set()
        hex_color = re.compile(r'#[0-9a-fA-F]{3,6}')
        for tag in soup.find_all(style=True):
            colors.update(hex_color.findall(tag.get("style", "")))
        branding.primary_colors = list(colors)[:5]  # Limit to 5 colors

        return branding

    def extract_page_info(self, url: str, soup: Any) -> PageInfo:
        """Extract detailed information from a single page.

        NOTE: this mutates the passed-in soup — script/style/nav/footer/
        header elements are decomposed before text extraction. Callers
        that need the full tree should use it before calling this.
        """
        page_info = PageInfo(url=url, title="")

        if soup is None:
            return page_info

        # Title
        if soup.title:
            page_info.title = soup.title.string or ""

        # Meta description
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            page_info.description = meta_desc.get("content", "")

        # Headings h1-h6, in document order per level.
        for level in range(1, 7):
            for heading in soup.find_all(f"h{level}"):
                text = heading.get_text(strip=True)
                if text:
                    page_info.headings.append(text)

        # Internal links only (same host as the page itself).
        parsed_base = urlparse(url)
        for link in soup.find_all("a", href=True):
            full_url = urljoin(url, link.get("href"))
            if urlparse(full_url).netloc == parsed_base.netloc:
                page_info.links.append(full_url)
        page_info.links = list(set(page_info.links))[:100]  # Dedupe and limit

        # Images (absolute URLs).
        for img in soup.find_all("img", src=True):
            page_info.images.append(urljoin(url, img.get("src")))
        page_info.images = page_info.images[:50]  # Limit

        # Text content: strip non-content elements first (mutates soup).
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        page_info.text_content = soup.get_text(separator=" ", strip=True)[:10000]  # Limit
        page_info.word_count = len(page_info.text_content.split())

        return page_info

    def generate_sitemap(self, start_url: str, depth: int = 0, parent_url: str = "") -> List[SiteMapEntry]:
        """
        Recursively generate a site map starting from the given URL.

        Uses depth-first crawling, bounded by ``max_depth`` and
        ``max_pages``; only same-host links are followed and obvious
        binary assets are skipped.
        """
        if depth > self.max_depth or len(self.visited_urls) >= self.max_pages:
            return []

        if start_url in self.visited_urls:
            return []

        self.visited_urls.add(start_url)
        logger.info(f"Mapping: {start_url} (depth: {depth})")

        html = self._fetch_page(start_url)
        if not html:
            return []

        soup = self._parse_html(html)
        if not soup:
            return []

        # Entry for the current page; fall back to the URL when the page
        # has no <title> or an empty one (soup.title.string can be None).
        title = (soup.title.string if soup.title else None) or start_url
        entry = SiteMapEntry(
            url=start_url,
            title=title,
            depth=depth,
            parent_url=parent_url
        )

        entries = [entry]

        # Collect candidate child links (internal, unvisited, non-asset).
        parsed_base = urlparse(start_url)
        child_urls = []

        for link in soup.find_all("a", href=True):
            full_url = urljoin(start_url, link.get("href"))
            parsed_link = urlparse(full_url)

            if (parsed_link.netloc == parsed_base.netloc and
                full_url not in self.visited_urls and
                not full_url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip'))):
                child_urls.append(full_url)

        # Limit fan-out per page and dedupe.
        child_urls = list(set(child_urls))[:10]
        entry.children = child_urls

        # Recurse into children while the page budget lasts.
        for child_url in child_urls:
            if len(self.visited_urls) < self.max_pages:
                entries.extend(self.generate_sitemap(child_url, depth + 1, start_url))

        return entries

    def capture_screenshot(self, url: str, output_path: str = None) -> Optional[str]:
        """
        Capture a screenshot of a webpage.

        Note: This is a placeholder — no file is actually written. In
        production, you would use:
        - Playwright
        - Selenium
        - Puppeteer
        - Or a screenshot API service

        Returns:
            The path where the screenshot would be saved.
        """
        logger.info(f"Screenshot capture requested for: {url}")

        if output_path is None:
            # md5 is used only to derive a short, stable filename — not for security.
            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
            output_path = str(self.knowledge_base_path / f"screenshot_{url_hash}.png")

        logger.warning("Screenshot capture requires headless browser integration (Playwright/Selenium)")
        return output_path

    def analyze_website(self, url: str) -> WebIntelligenceResult:
        """
        Perform comprehensive analysis of a website.

        This is the main entry point for web intelligence gathering.

        Args:
            url: The URL to analyze

        Returns:
            WebIntelligenceResult containing all gathered intelligence
            (with ``metadata["error"]`` set if the main page fetch fails)
        """
        logger.info(f"Starting web intelligence analysis for: {url}")

        # Fetch and parse main page
        html = self._fetch_page(url)
        if not html:
            return WebIntelligenceResult(
                url=url,
                timestamp=datetime.now().isoformat(),
                branding=BrandingInfo(),
                pages=[],
                sitemap=[],
                screenshots=[],
                metadata={"error": "Failed to fetch main page"}
            )

        soup = self._parse_html(html)

        # Extract branding (before extract_page_info, which prunes the soup).
        branding = self.extract_branding(url, soup)
        logger.info(f"Extracted branding: {branding.site_name}")

        # Extract main page info
        main_page = self.extract_page_info(url, soup)
        pages = [main_page]

        # Generate sitemap; reset crawl state so repeated analyze_website
        # calls on the same instance start fresh.
        self.visited_urls = set()
        sitemap = self.generate_sitemap(url)
        logger.info(f"Generated sitemap with {len(sitemap)} entries")

        # Capture screenshots (placeholder path only — see capture_screenshot)
        screenshots = []
        screenshot_path = self.capture_screenshot(url)
        if screenshot_path:
            screenshots.append(screenshot_path)

        # Compile metadata
        metadata = {
            "total_pages_crawled": len(sitemap),
            "total_links_found": len(main_page.links),
            "total_images_found": len(main_page.images),
            "word_count": main_page.word_count,
            "crawl_depth": self.max_depth,
            "analysis_time": datetime.now().isoformat()
        }

        result = WebIntelligenceResult(
            url=url,
            timestamp=datetime.now().isoformat(),
            branding=branding,
            pages=pages,
            sitemap=sitemap,
            screenshots=screenshots,
            metadata=metadata
        )

        # Store in knowledge base
        self.store_results(result)

        return result

    def store_results(self, result: WebIntelligenceResult) -> str:
        """
        Store analysis results in the knowledge base.

        Args:
            result: The WebIntelligenceResult to store

        Returns:
            Path to the stored file
        """
        # Filename from host + timestamp; also replace ':' so hosts with
        # an explicit port produce valid filenames on Windows.
        parsed = urlparse(result.url)
        safe_name = parsed.netloc.replace(".", "_").replace(":", "_")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{safe_name}_{timestamp}.json"

        filepath = self.knowledge_base_path / filename

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(result.to_dict(), f, indent=2, ensure_ascii=False)

        logger.info(f"Results stored at: {filepath}")
        return str(filepath)

    def search_knowledge_base(self, query: str) -> List[Dict]:
        """
        Search the knowledge base for previous analysis results.

        Args:
            query: Search term (matched against site names and URLs)

        Returns:
            List of matching results
        """
        results = []
        query_lower = query.lower()

        for filepath in self.knowledge_base_path.glob("*.json"):
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    data = json.load(f)

                # Match against URL and site name; `or ""` guards against
                # JSON null values in hand-edited or foreign files.
                url_text = (data.get("url") or "").lower()
                site_text = (data.get("branding", {}).get("site_name") or "").lower()
                if query_lower in url_text or query_lower in site_text:
                    results.append({
                        "file": str(filepath),
                        "url": data.get("url"),
                        "site_name": data.get("branding", {}).get("site_name"),
                        "timestamp": data.get("timestamp")
                    })
            except (json.JSONDecodeError, IOError) as e:
                logger.warning(f"Failed to read {filepath}: {e}")

        return results


def main():
    """Command-line entry point for the web intelligence skill.

    Reads the target URL from ``sys.argv[1]``, runs the analysis, prints
    a human-readable report, and returns the WebIntelligenceResult so
    the function can also be driven programmatically. Exits with status
    1 on missing arguments or missing dependencies.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        print("\nUsage: python web_intelligence.py <url>")
        print("\nExample: python web_intelligence.py https://example.com")
        sys.exit(1)

    url = sys.argv[1]

    # Default to HTTPS when no scheme was supplied.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # Fail fast with an actionable message when hard dependencies are missing.
    if not REQUESTS_AVAILABLE or not BS4_AVAILABLE:
        print("Missing dependencies. Install with:")
        print("  pip install requests beautifulsoup4")
        sys.exit(1)

    # Run analysis
    intel = WebIntelligence()
    result = intel.analyze_website(url)

    # Human-readable summary report.
    print("\n" + "=" * 60)
    print("WEB INTELLIGENCE REPORT")
    print("=" * 60)
    print(f"\nURL: {result.url}")
    print(f"Timestamp: {result.timestamp}")
    print("\n--- Branding ---")
    print(f"Site Name: {result.branding.site_name}")
    print(f"Tagline: {result.branding.tagline}")
    print(f"Logo URL: {result.branding.logo_url}")
    print(f"Favicon: {result.branding.favicon_url}")
    print(f"Social Links: {result.branding.social_links}")
    print(f"\n--- Site Map ({len(result.sitemap)} pages) ---")
    for entry in result.sitemap[:10]:
        indent = "  " * entry.depth
        print(f"{indent}- {entry.title} ({entry.url})")
    if len(result.sitemap) > 10:
        print(f"  ... and {len(result.sitemap) - 10} more pages")
    print("\n--- Metadata ---")
    for key, value in result.metadata.items():
        print(f"{key}: {value}")

    print("\nResults stored in knowledge base.")
    return result


if __name__ == "__main__":
    main()
