#!/usr/bin/env python3
"""
Genesis Skill Security Scanner — ClawHavoc Defense (AGENT-009)

Scans skill files for supply chain attack patterns:
- API key / credential exfiltration
- Suspicious network calls (data theft)
- Dangerous system commands (rm, format, shutdown)
- Prompt injection vectors
- Obfuscated code (base64, eval, exec)
- Unauthorized file access patterns

Verdicts: SAFE / SUSPICIOUS / MALICIOUS

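Usage:
    python3 tools/skill_security_scanner.py <path>           # Scan a single file or directory
    python3 tools/skill_security_scanner.py --skill <name>   # Scan skill directory
    python3 tools/skill_security_scanner.py --dir <path>     # Scan arbitrary directory
    python3 tools/skill_security_scanner.py --all            # Scan all skills
    python3 tools/skill_security_scanner.py --report         # Full JSON security report

Options:
    --json      Output in JSON format
    --strict    Fail with exit code 1 if any HIGH+ findings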
"""

import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Project root is two levels above this file (the script's directory's parent).
GENESIS_ROOT = Path(__file__).resolve().parent.parent
# Directory whose sub-directories are scanned as individual skills.
SKILLS_DIR = GENESIS_ROOT.joinpath(".claude", "skills")
# Destination directory for JSON reports written by --report.
REPORT_DIR = GENESIS_ROOT.joinpath("data", "security")

# ─── Detection Rules ─────────────────────────────────────────────────────────

@dataclass
class Finding:
    """A single rule match recorded for one line of a scanned file."""

    severity: str  # CRITICAL / HIGH / MEDIUM / LOW / INFO
    category: str  # Rule category tag taken from the pattern table (e.g. "backdoor")
    pattern: str  # Regex that matched (scan_file stores at most the first 60 characters)
    description: str  # Human-readable explanation taken from the pattern table
    file_path: str  # Path of the scanned file as reported by scan_file
    line_number: int  # 1-based line number of the matching line
    line_content: str  # Whitespace-stripped line text (scan_file stores at most 120 characters)


# Rules are (regex, category, description) tuples. The scanner applies each
# regex to one line at a time with re.IGNORECASE.
CRITICAL_PATTERNS = [
    # API Key Exfiltration
    (r'requests\.(post|put|patch)\s*\(.*(api.key|token|secret|credential|password)',
     "credential_exfiltration", "HTTP request containing credential data — potential exfiltration"),
    (r'(urllib|httpx|aiohttp)\..*(api.key|token|secret|password)',
     "credential_exfiltration", "HTTP library with credential access — potential exfiltration"),
    (r'os\.environ\[.*(KEY|TOKEN|SECRET|PASS|CRED).*\].*requests\.',
     "credential_exfiltration", "Environment variable read + HTTP send — classic exfiltration pattern"),

    # Backdoor Patterns
    # Accept constructor arguments: socket.socket(AF_INET, SOCK_STREAM) is the
    # usual spelling, and the empty-parentheses-only form missed it.
    (r'socket\.socket\([^)]*\).*\.bind\(',
     "backdoor", "Opening a network listener — potential backdoor/reverse shell"),
    # \b keeps "nc" from matching inside other words; without it the very
    # common "rsync -e ssh" was reported as a CRITICAL reverse shell.
    (r'(reverse.shell|bind.shell|\bnc\s+-[el])',
     "backdoor", "Reverse/bind shell pattern detected"),
    (r'__import__\s*\(\s*["\'](?:socket|subprocess|ctypes)',
     "backdoor", "Dynamic import of dangerous module via __import__"),

    # Obfuscation
    (r'exec\s*\(\s*(base64\.b64decode|codecs\.decode|bytes\.fromhex)',
     "obfuscation", "Executing decoded/deobfuscated code — hiding malicious payload"),
    (r'eval\s*\(\s*compile\s*\(',
     "obfuscation", "eval(compile()) — dynamic code execution, likely obfuscated"),
    (r'(\\x[0-9a-fA-F]{2}){10,}',
     "obfuscation", "Long hex-encoded string — potential obfuscated payload"),
]

# Rules are (regex, category, description) tuples. The scanner applies each
# regex to one line at a time with re.IGNORECASE.
HIGH_PATTERNS = [
    # Dangerous System Commands
    # Matches any short-flag cluster containing "r" (-rf, -fr, -r, -Rf ...) or
    # --recursive, in both shell-string form ("rm -rf x") and argv-list form
    # (["rm", "-rf", x]). \b stops "rm" matching inside words such as "xterm".
    (r'(subprocess|os\.system|os\.popen).*\brm["\',\s]+(-[a-z]*r[a-z]*|--recursive)',
     "destructive_command", "Recursive delete command — can destroy filesystem"),
    (r'(subprocess|os\.system).*format\s+[A-Z]:',
     "destructive_command", "Disk format command — catastrophic data loss"),
    (r'(subprocess|os\.system).*shutdown',
     "destructive_command", "System shutdown command"),
    (r'shutil\.rmtree\s*\(\s*["\']/',
     "destructive_command", "Recursive delete of root-level directory"),

    # Credential Access
    (r'open\s*\(.*\.(ssh|gnupg|aws|kube|docker).*credentials',
     "credential_access", "Reading credential files from sensitive directories"),
    (r'open\s*\(.*\.env\b',
     "credential_access", "Reading .env file — may contain secrets"),
    (r'keyring\.(get_password|get_credential)',
     "credential_access", "Accessing system keyring credentials"),
    (r'open\s*\(.*(passwd|shadow|id_rsa|\.pem)',
     "credential_access", "Reading system authentication files"),

    # Network Exfiltration
    (r'requests\.(post|put)\s*\(\s*["\']https?://(?!localhost|127\.0\.0\.1)',
     "network_exfiltration", "HTTP POST/PUT to external URL — potential data exfiltration"),
    (r'(smtplib|email\.mime)',
     "network_exfiltration", "Email library — potential data exfiltration via email"),
    (r'ftplib\.FTP',
     "network_exfiltration", "FTP library — potential data exfiltration"),

    # Prompt Injection
    (r'(ignore\s+previous|disregard\s+all|forget\s+your|new\s+instructions)',
     "prompt_injection", "Prompt injection attempt — overriding agent instructions"),
    (r'system\s*:\s*you\s+are\s+now',
     "prompt_injection", "System prompt override — jailbreak attempt"),
]

# Rules are (regex, category, description) tuples. The scanner applies each
# regex to one line at a time with re.IGNORECASE.
MEDIUM_PATTERNS = [
    # Potentially Dangerous
    (r'\beval\s*\(',
     "code_execution", "eval() — dynamic code execution, review carefully"),
    (r'\bexec\s*\(',
     "code_execution", "exec() — dynamic code execution, review carefully"),
    (r'__import__\s*\(',
     "code_execution", "Dynamic import — potential for loading unexpected modules"),
    (r'compile\s*\(.*exec',
     "code_execution", "compile() with exec mode — dynamic code generation"),

    # File System Access
    (r'glob\.glob\s*\(\s*["\']/',
     "filesystem_scan", "Globbing from root — scanning entire filesystem"),
    (r'os\.walk\s*\(\s*["\']/',
     "filesystem_scan", "Walking from root — scanning entire filesystem"),
    # The closing quote is required: Path("/") is quote, slash, quote. Without
    # it the rule could only match syntactically invalid code.
    (r'Path\s*\(\s*["\']/["\']\s*\)',
     "filesystem_scan", "Path object at root — potential full filesystem access"),

    # Unsafe Deserialization
    (r'pickle\.(load|loads)\s*\(',
     "unsafe_deserialization", "pickle.load — arbitrary code execution via deserialization"),
    # Flag every yaml.load( call unless SafeLoader/CSafeLoader appears later on
    # the same line. A call split across lines is still flagged on its first
    # line because the Loader argument is not visible there.
    (r'yaml\.load\s*\((?!.*SafeLoader)',
     "unsafe_deserialization", "yaml.load without SafeLoader — code execution risk"),
    (r'marshal\.(load|loads)',
     "unsafe_deserialization", "marshal.load — arbitrary code execution"),
]

# Informational rules: each entry is (regex, category, description). Matches
# are not dangerous by themselves but are worth a manual look.
LOW_PATTERNS = [
    (
        r'(base64\.b64encode|base64\.b64decode)',
        "encoding",
        "Base64 encoding/decoding — verify not used for obfuscation",
    ),
    (
        r'(subprocess\.run|subprocess\.Popen|subprocess\.call)',
        "subprocess",
        "Subprocess usage — verify commands are safe",
    ),
    (
        r'os\.(environ|getenv)',
        "env_access",
        "Environment variable access — verify no sensitive data leakage",
    ),
    (
        r'(tempfile\.mktemp|tempfile\.NamedTemporaryFile)',
        "temp_files",
        "Temporary file creation — verify cleanup and no sensitive data",
    ),
]


# ─── Scanner Engine ───────────────────────────────────────────────────────────

# File suffixes the scanner inspects; every other file is skipped.
SCANNABLE_EXTENSIONS = {
    '.py',
    '.js', '.ts',
    '.sh', '.bash', '.ps1',
    '.md',
    '.json', '.yaml', '.yml',
}


# Line-comment prefixes by file suffix. Suffixes that are absent (Markdown,
# JSON) have no comment syntax, so no line is skipped there: "# heading" and
# "* bullet" lines are real content that the prompt-injection rules must see.
_COMMENT_PREFIXES = {
    '.py': ('#',),
    '.sh': ('#',),
    '.bash': ('#',),
    '.ps1': ('#',),
    '.yaml': ('#',),
    '.yml': ('#',),
    '.js': ('//', '*'),
    '.ts': ('//', '*'),
}


def scan_file(file_path: str) -> list[Finding]:
    """Scan a single file line by line against every severity rule table.

    Args:
        file_path: Path of the file to scan.

    Returns:
        One Finding per (line, rule) match, in line order and, within a line,
        in CRITICAL, HIGH, MEDIUM, LOW rule order. The list is empty when the
        suffix is not in SCANNABLE_EXTENSIONS or the file cannot be read.
    """
    path = Path(file_path)

    # Compare suffixes case-insensitively so "PAYLOAD.PY" or "README.MD" are
    # not silently excluded from scanning.
    suffix = path.suffix.lower()
    if suffix not in SCANNABLE_EXTENSIONS:
        return []

    try:
        content = path.read_text(encoding='utf-8', errors='ignore')
    except OSError as exc:  # PermissionError is a subclass of OSError
        # Still best-effort, but say so: an unreadable file must not look
        # like a clean one.
        print(f"WARNING: could not read {path}: {exc}", file=sys.stderr)
        return []

    # Path-component comparison; a plain string-prefix test also accepts
    # sibling directories such as "<root>-backup", where relative_to() raises.
    if path.is_relative_to(GENESIS_ROOT):
        display_path = str(path.relative_to(GENESIS_ROOT))
    else:
        display_path = str(path)

    # Flatten the rule tables once per file, keeping severity order.
    rules = [
        (severity, re.compile(pattern, re.IGNORECASE), pattern, category, description)
        for severity, table in (
            ("CRITICAL", CRITICAL_PATTERNS),
            ("HIGH", HIGH_PATTERNS),
            ("MEDIUM", MEDIUM_PATTERNS),
            ("LOW", LOW_PATTERNS),
        )
        for pattern, category, description in table
    ]
    comment_prefixes = _COMMENT_PREFIXES.get(suffix, ())

    findings = []
    for line_num, line in enumerate(content.split('\n'), 1):
        stripped = line.strip()
        # Skip only lines that are comments in this file's own language.
        if comment_prefixes and stripped.startswith(comment_prefixes):
            continue

        for severity, regex, pattern, category, description in rules:
            if regex.search(line):
                findings.append(Finding(
                    severity=severity,
                    category=category,
                    pattern=pattern[:60],
                    description=description,
                    file_path=display_path,
                    line_number=line_num,
                    line_content=stripped[:120],
                ))

    return findings


def scan_directory(dir_path: str) -> list[Finding]:
    """Walk dir_path recursively and collect the findings of every file in it.

    Directories named node_modules, .git, __pycache__, venv or .venv are
    pruned from the walk and never entered.
    """
    ignored_dirs = {'node_modules', '.git', '__pycache__', 'venv', '.venv'}
    collected = []
    for current, subdirs, filenames in os.walk(dir_path):
        # In-place slice assignment is what makes os.walk skip the pruned names.
        subdirs[:] = [name for name in subdirs if name not in ignored_dirs]
        for filename in filenames:
            collected += scan_file(os.path.join(current, filename))
    return collected


def scan_skill(skill_name: str) -> list[Finding]:
    """Scan the skill directory SKILLS_DIR/<skill_name>.

    Args:
        skill_name: Name of the skill's directory under SKILLS_DIR.

    Returns:
        All findings from a recursive scan of the skill directory.

    Exits the process with status 1 (after printing an error to stderr) when
    no such directory exists.
    """
    skill_dir = SKILLS_DIR / skill_name
    # Require a directory, as scan_all_skills does: a regular file passes
    # exists(), but os.walk yields nothing for it, so it would be reported
    # as clean without anything having been scanned.
    if not skill_dir.is_dir():
        print(f"ERROR: Skill '{skill_name}' not found at {skill_dir}", file=sys.stderr)
        sys.exit(1)
    return scan_directory(str(skill_dir))


def scan_all_skills() -> dict[str, list[Finding]]:
    """Scan each non-hidden sub-directory of SKILLS_DIR.

    Returns a mapping of skill directory name to its findings, in sorted
    directory order; the mapping is empty when SKILLS_DIR does not exist.
    """
    if not SKILLS_DIR.exists():
        return {}

    return {
        entry.name: scan_directory(str(entry))
        for entry in sorted(SKILLS_DIR.iterdir())
        if entry.is_dir() and not entry.name.startswith('.')
    }


# ─── Verdict Engine ───────────────────────────────────────────────────────────

def compute_verdict(findings: list[Finding]) -> tuple[str, float]:
    """
    Turn a list of findings into a verdict label and an additive risk score.

    Each finding adds a weight chosen by its severity (CRITICAL 25, HIGH 10,
    MEDIUM 3, LOW 1; any other severity adds 0). The total maps to:
    - SAFE: total < 10
    - SUSPICIOUS: 10 <= total < 50
    - MALICIOUS: total >= 50

    Returns the (verdict, risk_score) pair.
    """
    points_by_severity = {"CRITICAL": 25, "HIGH": 10, "MEDIUM": 3, "LOW": 1}

    total = 0
    for finding in findings:
        total += points_by_severity.get(finding.severity, 0)

    if total < 10:
        label = "SAFE"
    elif total < 50:
        label = "SUSPICIOUS"
    else:
        label = "MALICIOUS"
    return label, total


# ─── Output Formatters ────────────────────────────────────────────────────────

# ANSI escape sequences used to colorize terminal output.
RESET = "\033[0m"  # restores the terminal's default attributes

# Color applied to each severity heading.
SEVERITY_COLORS = dict(
    CRITICAL="\033[91m",  # Red
    HIGH="\033[93m",      # Yellow
    MEDIUM="\033[33m",    # Orange
    LOW="\033[36m",       # Cyan
    INFO="\033[90m",      # Gray
)

# Color applied to the overall verdict tag.
VERDICT_COLORS = dict(
    SAFE="\033[92m",        # Green
    SUSPICIOUS="\033[93m",  # Yellow
    MALICIOUS="\033[91m",   # Red
)


def format_findings(findings: list[Finding], name: str = "scan") -> str:
    """Render findings as colorized, human-readable terminal text.

    An empty list produces a single SAFE line. Otherwise the text starts with
    the verdict and risk score from compute_verdict(), followed by the
    findings grouped under CRITICAL, HIGH, MEDIUM and LOW headings.
    """
    if not findings:
        score = 0.0
        return f"\n{VERDICT_COLORS['SAFE']}[SAFE]{RESET} {name} — No security issues found (score: {score})\n"

    verdict, score = compute_verdict(findings)
    verdict_color = VERDICT_COLORS.get(verdict, "")
    out = [f"\n{verdict_color}[{verdict}]{RESET} {name} — Risk score: {score}\n"]

    # Bucket findings by severity once, preserving their original order.
    buckets = {}
    for finding in findings:
        buckets.setdefault(finding.severity, []).append(finding)

    for level in ("CRITICAL", "HIGH", "MEDIUM", "LOW"):
        group = buckets.get(level)
        if not group:
            continue
        level_color = SEVERITY_COLORS.get(level, "")
        out.append(f"\n  {level_color}{level} ({len(group)}){RESET}")
        for finding in group:
            out.extend((
                f"    {finding.file_path}:{finding.line_number} [{finding.category}]",
                f"      {finding.description}",
                f"      > {finding.line_content[:100]}",
            ))

    return "\n".join(out)


def generate_report(all_results: dict[str, list[Finding]]) -> dict:
    """Build the JSON-serializable security report for a set of scanned skills.

    The report holds a UTC timestamp, the scanner version, the project root,
    per-verdict and per-severity totals, and for each skill its verdict, risk
    score and list of findings.
    """
    generated_at = datetime.now(timezone.utc).isoformat()
    verdict_totals = {"SAFE": 0, "SUSPICIOUS": 0, "MALICIOUS": 0}
    severity_totals = {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0}
    per_skill = {}

    for skill_name, findings in all_results.items():
        verdict, score = compute_verdict(findings)
        verdict_totals[verdict] += 1

        entries = []
        for finding in findings:
            # .get() keeps counting even for a severity outside the four preset keys.
            severity_totals[finding.severity] = severity_totals.get(finding.severity, 0) + 1
            entries.append({
                "severity": finding.severity,
                "category": finding.category,
                "description": finding.description,
                "file": finding.file_path,
                "line": finding.line_number,
            })

        per_skill[skill_name] = {
            "verdict": verdict,
            "risk_score": score,
            "finding_count": len(findings),
            "findings": entries,
        }

    return {
        "timestamp": generated_at,
        "scanner_version": "1.0.0",
        "genesis_root": str(GENESIS_ROOT),
        "total_skills_scanned": len(all_results),
        "summary": verdict_totals,
        "total_findings": severity_totals,
        "skills": per_skill,
    }


# ─── CLI ──────────────────────────────────────────────────────────────────────

def _findings_to_json(findings):
    """Return findings as JSON-serializable dicts (severity, category, description, file, line)."""
    return [
        {"severity": f.severity, "category": f.category, "description": f.description,
         "file": f.file_path, "line": f.line_number}
        for f in findings
    ]


def _has_serious_findings(findings):
    """Return True when any finding is CRITICAL or HIGH (the --strict failure condition)."""
    return any(f.severity in ("CRITICAL", "HIGH") for f in findings)


def _print_report_summary(report):
    """Print the human-readable summary of a report built by generate_report()."""
    rule = '=' * 60
    print(f"\n{rule}")
    print("GENESIS SKILL SECURITY REPORT")
    print(rule)
    print(f"Skills scanned: {report['total_skills_scanned']}")
    print(f"  SAFE:       {report['summary']['SAFE']}")
    print(f"  SUSPICIOUS: {report['summary']['SUSPICIOUS']}")
    print(f"  MALICIOUS:  {report['summary']['MALICIOUS']}")
    print("\nFindings:")
    print(f"  CRITICAL: {report['total_findings']['CRITICAL']}")
    print(f"  HIGH:     {report['total_findings']['HIGH']}")
    print(f"  MEDIUM:   {report['total_findings']['MEDIUM']}")
    print(f"  LOW:      {report['total_findings']['LOW']}")

    if report['summary']['SUSPICIOUS'] > 0 or report['summary']['MALICIOUS'] > 0:
        print("\nSkills requiring review:")
        for name, data in report['skills'].items():
            if data['verdict'] != 'SAFE':
                print(f"  [{data['verdict']}] {name} (score: {data['risk_score']}, findings: {data['finding_count']})")


def main():
    """Command-line entry point.

    Modes, in order of precedence:
      --all / --report   scan every skill; --report also writes a JSON file
                         under REPORT_DIR and prints a summary
      --skill NAME       scan one skill directory
      --dir PATH         scan an arbitrary directory
      PATH               scan a single file or directory

    --json switches stdout to JSON (for --report the summary stays textual).
    --strict exits with status 1 in every mode when a CRITICAL or HIGH
    finding exists. A missing target also exits with status 1 instead of
    being reported as SAFE.
    """
    parser = argparse.ArgumentParser(
        description="Genesis Skill Security Scanner — ClawHavoc Defense (AGENT-009)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s .claude/skills/memory/SKILL.md        Scan single file
  %(prog)s --skill alpha-evolve                   Scan skill directory
  %(prog)s --all                                  Scan all skills
  %(prog)s --report                               Full JSON security report
  %(prog)s --dir tools/                           Scan arbitrary directory
        """,
    )
    parser.add_argument("path", nargs="?", help="File or directory to scan")
    parser.add_argument("--skill", help="Scan a specific skill by name")
    parser.add_argument("--dir", help="Scan an arbitrary directory")
    parser.add_argument("--all", action="store_true", help="Scan all skills")
    parser.add_argument("--report", action="store_true", help="Generate full JSON security report")
    parser.add_argument("--json", action="store_true", help="Output in JSON format")
    parser.add_argument("--strict", action="store_true", help="Fail with exit code 1 if any HIGH+ findings")

    args = parser.parse_args()

    if args.all or args.report:
        all_results = scan_all_skills()

        if args.report:
            report = generate_report(all_results)
            REPORT_DIR.mkdir(parents=True, exist_ok=True)
            report_path = REPORT_DIR / f"skill_scan_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(report_path, 'w', encoding='utf-8') as fh:
                json.dump(report, fh, indent=2)
            print(f"Report saved: {report_path}")

            # Also print summary
            _print_report_summary(report)
        elif args.json:
            # --json must produce machine-readable output, not ANSI-colored text.
            print(json.dumps(generate_report(all_results), indent=2))
        else:
            for name, findings in all_results.items():
                print(format_findings(findings, name))

        serious = any(_has_serious_findings(found) for found in all_results.values())

    else:
        if args.skill:
            target_key, target = "skill", args.skill
            findings = scan_skill(args.skill)
        elif args.dir:
            target_key, target = "directory", args.dir
            # os.walk yields nothing for a missing path, which would otherwise
            # be reported as SAFE without anything having been scanned.
            if not Path(args.dir).is_dir():
                print(f"ERROR: Directory '{args.dir}' not found", file=sys.stderr)
                sys.exit(1)
            findings = scan_directory(args.dir)
        elif args.path:
            p = Path(args.path)
            target_key, target = "path", str(p)
            if not p.exists():
                print(f"ERROR: Path '{p}' not found", file=sys.stderr)
                sys.exit(1)
            findings = scan_directory(str(p)) if p.is_dir() else scan_file(str(p))
        else:
            parser.print_help()
            sys.exit(1)

        if args.json:
            verdict, score = compute_verdict(findings)
            print(json.dumps({
                target_key: target,
                "verdict": verdict,
                "risk_score": score,
                "finding_count": len(findings),
                "findings": _findings_to_json(findings),
            }, indent=2))
        else:
            print(format_findings(findings, target))

        serious = _has_serious_findings(findings)

    # Exit code for CI/CD, honored in every scan mode.
    if args.strict:
        sys.exit(1 if serious else 0)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()