#!/usr/bin/env python3
"""
CSS Selector Sanitizer (UVS-H01)
================================
Prevents XSS injection via CSS selector arguments.

Security Features:
- CSS.escape() equivalent for Python
- Blocklist for dangerous patterns (JS-like syntax)
- Allowlist validation for trusted selectors
- Logging of blocked attempts

VERIFICATION_STAMP
Story: UVS-H01
Verified By: Claude Opus 4.5
Verified At: 2026-02-03
Tests: See verification/test_selector_sanitizer.py
"""

import re
import logging
from typing import Optional, Tuple

logger = logging.getLogger(__name__)

class SelectorValidationError(Exception):
    """Signals that a CSS selector was rejected by the sanitizer.

    Carries a human-readable reason as its message.
    """


# OWASP-based dangerous patterns that could indicate XSS attempts.
# Each entry is a regex fragment. The fragments are OR-ed into a single
# case-insensitive pattern below, so keep entries free of numbered groups.
DANGEROUS_PATTERNS = [
    r'javascript\s*:',           # javascript: protocol
    r'data\s*:',                 # data: protocol
    r'vbscript\s*:',             # vbscript: protocol
    # Event handlers (onclick, onerror, etc.). The leading \b is required:
    # without it the pattern also fires inside ordinary attribute names
    # such as aria-controls=, content= or data-controller=, rejecting
    # perfectly legitimate attribute selectors.
    r'\bon\w+\s*=',
    r'<\s*script',               # Script tags
    r'<\s*/\s*script',           # Closing script tags
    r'expression\s*\(',          # CSS expression()
    r'url\s*\(\s*["\']?\s*javascript', # url(javascript:...)
    r'\\u00[0-9a-fA-F]{2}',      # Unicode escapes that could hide JS
    r'eval\s*\(',                # eval()
    r'alert\s*\(',               # alert()
    r'prompt\s*\(',              # prompt()
    r'confirm\s*\(',             # confirm()
    r'document\s*\.',            # document.* access
    r'window\s*\.',              # window.* access
    r'fetch\s*\(',               # fetch()
    r'XMLHttpRequest',           # XHR
    r'\.innerHTML',              # innerHTML manipulation
    r'\.outerHTML',              # outerHTML manipulation
    r'\.textContent\s*=',        # textContent assignment
    r'Function\s*\(',            # Function constructor
    r'setTimeout\s*\(',          # setTimeout
    r'setInterval\s*\(',         # setInterval
    r'import\s*\(',              # Dynamic import
    r'require\s*\(',             # CommonJS require
    r'\}\s*\)',                  # Potential injection breakout '})'
    r'\)\s*\{',                  # Potential injection breakout '){'
    r';\s*[a-zA-Z]',             # Statement injection '; code'
]

# Compile once at import time; every check reuses this single alternation.
_dangerous_re = re.compile('|'.join(DANGEROUS_PATTERNS), re.IGNORECASE)


def css_escape(value: str) -> str:
    """
    Serialize a string as a CSS identifier, in the manner of CSS.escape().

    Modeled on https://drafts.csswg.org/cssom/#serialize-an-identifier

    Rules, applied to each character in turn:
    - NUL becomes U+FFFD (the replacement character).
    - Control characters (U+0001-U+001F, U+007F) become a backslash, the
      lowercase hex code point and one trailing space.
    - A digit in first position, or in second position after a leading
      hyphen, is written in that same hex form.
    - A leading hyphen is backslash-escaped when it is the whole string,
      or when it is followed by a digit or another hyphen.
    - ASCII letters, digits, hyphen, underscore and every character at or
      above U+0080 are copied through unchanged.
    - Any other character is prefixed with a single backslash.

    Args:
        value: The string to escape

    Returns:
        The escaped string; an empty string when the input is empty
    """
    if not value:
        return ''

    def as_hex(ch):
        # "Escaped as code point": backslash, hex digits, terminating space.
        return f'\\{ord(ch):x} '

    hyphen_first = value[0] == '-'
    pieces = []
    for pos, ch in enumerate(value):
        if ch == '\x00':
            piece = '\uFFFD'
        elif ch <= '\x1f' or ch == '\x7f':
            piece = as_hex(ch)
        elif '0' <= ch <= '9':
            # Digits are only a problem where they would start the identifier.
            needs_hex = pos == 0 or (pos == 1 and hyphen_first)
            piece = as_hex(ch) if needs_hex else ch
        elif ch == '-':
            risky_lead = pos == 0 and (
                len(value) == 1 or value[1] in '0123456789-'
            )
            piece = '\\-' if risky_lead else ch
        elif ch == '_' or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or ch >= '\x80':
            piece = ch
        else:
            piece = '\\' + ch
        pieces.append(piece)

    return ''.join(pieces)


def detect_dangerous_patterns(selector: str) -> Optional[str]:
    """
    Scan a selector for any blocklisted (XSS-like) fragment.

    Args:
        selector: CSS selector to check

    Returns:
        The exact text of the first blocklisted match, or None when the
        selector contains no blocklisted fragment
    """
    hit = _dangerous_re.search(selector)
    return hit.group() if hit else None


def sanitize_selector(selector: str, allow_attribute_selectors: bool = True) -> str:
    """
    Vet a CSS selector before it is used in evaluate() calls.

    Checks run in this order, and the first failure raises:
    1. The selector must be non-empty.
    2. It must be at most 1000 characters long.
    3. It must not contain any blocklisted (XSS-like) pattern.
    4. If attribute selectors are disallowed, it must not contain a
       bracketed [...] clause.

    A selector that passes every check is returned unchanged. No escaping
    is applied, because escaping a whole selector would break valid CSS
    syntax.

    Args:
        selector: The CSS selector to sanitize
        allow_attribute_selectors: Whether to allow [attr="value"] syntax

    Returns:
        The original selector, once it has passed validation

    Raises:
        SelectorValidationError: If the selector is empty, too long,
            contains a dangerous pattern, or uses a forbidden attribute
            selector
    """
    # Cheap structural guards first.
    if not selector:
        raise SelectorValidationError("Empty selector not allowed")
    if len(selector) > 1000:
        raise SelectorValidationError(f"Selector too long: {len(selector)} chars (max 1000)")

    # Blocklist scan; blocked attempts are logged with a truncated selector.
    dangerous = detect_dangerous_patterns(selector)
    if dangerous:
        logger.warning(f"[SECURITY] Blocked dangerous selector pattern: {dangerous!r} in selector: {selector[:100]!r}")
        raise SelectorValidationError(f"Dangerous pattern detected: {dangerous!r}")

    # A lone '[' is not enough: only a complete, non-empty [...] clause
    # counts as an attribute selector.
    if (
        not allow_attribute_selectors
        and '[' in selector
        and re.search(r'\[[^\]]+\]', selector)
    ):
        logger.warning(f"[SECURITY] Blocked attribute selector: {selector[:100]!r}")
        raise SelectorValidationError("Attribute selectors not allowed in this context")

    return selector


def validate_selector(selector: str, allowlist: Optional[list] = None) -> Tuple[bool, str]:
    """
    Report whether a CSS selector is acceptable, without raising.

    The selector is first run through sanitize_selector(). If an allowlist
    is supplied, the selector must additionally match (from its start, via
    re.match) at least one of the allowlist's regex patterns.

    Args:
        selector: Selector to validate
        allowlist: Optional list of allowed selector patterns (regex);
            None skips the allowlist check, an empty list rejects everything

    Returns:
        Tuple of (is_valid, reason); reason is "Valid" on success
    """
    try:
        sanitize_selector(selector)
    except SelectorValidationError as exc:
        return False, str(exc)

    if allowlist is None:
        return True, "Valid"

    if any(re.match(rule, selector) for rule in allowlist):
        return True, "Valid"

    return False, "Selector does not match any allowed pattern"


def escape_selector_value(value: str) -> str:
    """
    Escape a value for safe interpolation into a CSS selector.

    Use this when building selectors dynamically:
        selector = f'[data-id="{escape_selector_value(user_input)}"]'

    All escaping is delegated to css_escape(), which backslash-escapes
    every ASCII character other than letters, digits, hyphen and
    underscore. Quotes and backslashes are therefore escaped exactly once.

    Args:
        value: The value to escape

    Returns:
        Escaped value safe for selector interpolation
    """
    # Do NOT pre-escape quotes or backslashes here. css_escape() already
    # handles them, and escaping twice corrupts the value: a double quote
    # would come out as backslash-backslash-backslash-quote, which CSS
    # decodes to a literal backslash followed by a quote, so the resulting
    # selector would no longer match the original attribute value.
    return css_escape(value)


# Commonly used safe selectors for GHL operations.
#
# Patterns end in \Z rather than $: in Python, $ also matches just before
# a trailing newline, which would let "#id\n" through the allowlist.
# Quoted values spell out the double- and single-quoted forms separately
# so that the opening and closing quote must agree. Non-capturing groups
# are used so the patterns stay safe to combine with '|'.
GHL_SELECTOR_ALLOWLIST = [
    r'^#[\w-]+\Z',                                               # ID selectors
    r'^\.[\w-]+\Z',                                              # Class selectors
    r'^[\w-]+\Z',                                                # Tag selectors
    r'^input\[name=(?:"[\w-]+"|\'[\w-]+\')\]\Z',                 # Input by name
    r'^button\[data-[\w-]+=(?:"[\w-]+"|\'[\w-]+\')\]\Z',         # Buttons by data attr
    r'^\[data-testid=(?:"[\w-]+"|\'[\w-]+\')\]\Z',               # Test ID selectors
]
