#!/usr/bin/env python3
"""
TITAN FILE SCANNER
==================
Scans target directories for Python files to include in cache.

VERIFICATION_STAMP
Story: TITAN-001 (File Scanner)
Verified By: Claude
Verified At: 2026-01-23
"""

import os
import fnmatch
from pathlib import Path
from typing import List, Optional
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Genesis root directory: the directory three levels above this file.
# FileScanner uses it as the scan root when no root_path is supplied.
GENESIS_ROOT = Path(__file__).parent.parent.parent


class FileScanner:
    """
    Scans directories for Python files to cache.

    Default targets: core/, skills/, tools/, rlm/, swarms/

    A file is included when it:
      * has a ``.py`` suffix and lives under one of the target directories,
      * does not match any exclude pattern (see ``_should_exclude``),
      * is not a symlink (unless ``follow_symlinks`` is set),
      * is no larger than ``max_file_size`` bytes, and
      * can be opened and decoded as UTF-8 (first 100 characters are probed).
    """

    # Default directories for full-stack scan
    DEFAULT_DIRECTORIES = ['core', 'skills', 'tools', 'rlm', 'swarms']

    # Default exclude patterns (glob patterns matched against path components)
    DEFAULT_EXCLUDES = [
        '__pycache__',
        '*.pyc',
        '.git',
        '.venv',
        'venv',
        'node_modules',
        'archives',
        '.cache',
    ]

    # Rough estimate: ~4 characters per token for code
    CHARS_PER_TOKEN = 4

    def __init__(
        self,
        directories: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
        root_path: Optional[Path] = None,
        follow_symlinks: bool = False,
        max_file_size: int = 1_000_000,  # 1MB max per file
    ):
        """
        Initialize the file scanner.

        Args:
            directories: List of directory names to scan (relative to root).
                ``None`` or an empty list selects ``DEFAULT_DIRECTORIES``.
            exclude_patterns: Glob patterns to exclude. ``None`` or an empty
                list selects ``DEFAULT_EXCLUDES``.
            root_path: Root path (defaults to GENESIS_ROOT)
            follow_symlinks: Whether to include files that are symbolic links
            max_file_size: Maximum file size in bytes to include
        """
        # Copy the lists so that mutating an instance's configuration can
        # never alter the shared class-level defaults (or the caller's list).
        self.directories = list(directories or self.DEFAULT_DIRECTORIES)
        self.exclude_patterns = list(exclude_patterns or self.DEFAULT_EXCLUDES)
        # Path() also lets callers pass a plain string root.
        self.root_path = Path(root_path or GENESIS_ROOT)
        self.follow_symlinks = follow_symlinks
        self.max_file_size = max_file_size

    def scan(self) -> List[Path]:
        """
        Scan all target directories for Python files.

        Directories that do not exist (or are not directories) are logged
        with a warning and skipped.

        Returns:
            Sorted list of unique Path objects, one per Python file found
        """
        found = set()

        for dir_name in self.directories:
            dir_path = self.root_path / dir_name
            # is_dir() is already False for paths that do not exist.
            if not dir_path.is_dir():
                logger.warning("Directory not found: %s", dir_path)
                continue
            found.update(self._scan_directory(dir_path))

        # The set removes duplicates (e.g. overlapping target directories).
        unique_files = sorted(found)

        logger.info(
            "Scanned %d directories, found %d Python files",
            len(self.directories),
            len(unique_files),
        )
        return unique_files

    def _scan_directory(self, directory: Path) -> List[Path]:
        """
        Recursively scan a single directory for Python files.

        Scanning is best-effort: an error raised while walking the tree is
        logged and the files collected up to that point are returned.

        Args:
            directory: Directory path to scan

        Returns:
            List of Python file paths
        """
        files = []

        try:
            for item in directory.rglob('*.py'):
                # Skip if matches exclude pattern
                if self._should_exclude(item):
                    continue

                # Skip symlinks unless configured to follow
                if item.is_symlink() and not self.follow_symlinks:
                    continue

                # Skip files that are too large (or that cannot be stat'ed)
                try:
                    if item.stat().st_size > self.max_file_size:
                        logger.debug("Skipping large file: %s", item)
                        continue
                except OSError:
                    continue

                # Skip files we can't read
                if not self._is_readable(item):
                    continue

                files.append(item)

        except PermissionError:
            logger.warning("Permission denied: %s", directory)
        except Exception as e:
            # Deliberately broad: one unreadable subtree must not abort the
            # whole cache build.
            logger.error("Error scanning %s: %s", directory, e)

        return files

    def _should_exclude(self, path: Path) -> bool:
        """
        Check if a path matches any exclude pattern.

        Patterns are glob patterns matched against whole path components
        (directory names and the file name), not raw substrings, so ``venv``
        excludes ``.../venv/lib.py`` but not ``.../venv_helper.py``.
        Components are taken relative to ``root_path`` so that the location
        of the root itself can never trigger an exclusion; paths outside the
        root fall back to all of their components.

        Args:
            path: Path to check

        Returns:
            True if path should be excluded
        """
        try:
            parts = path.relative_to(self.root_path).parts
        except ValueError:
            # Not located under root_path.
            parts = path.parts

        return any(
            self._matches_pattern(parts, pattern)
            for pattern in self.exclude_patterns
        )

    @staticmethod
    def _matches_pattern(parts: tuple, pattern: str) -> bool:
        """
        Check whether a glob pattern matches a run of path components.

        A pattern containing path separators (e.g. ``build/out``) is split
        into segments that must match consecutive components.

        Args:
            parts: Path components to test
            pattern: Glob pattern, optionally with path separators

        Returns:
            True if some consecutive run of components matches the pattern
        """
        segments = [s for s in pattern.replace(os.sep, '/').split('/') if s]
        if not segments:
            # An empty pattern excludes nothing.
            return False

        width = len(segments)
        return any(
            all(
                fnmatch.fnmatch(part, segment)
                for part, segment in zip(parts[start:start + width], segments)
            )
            for start in range(len(parts) - width + 1)
        )

    def _is_readable(self, path: Path) -> bool:
        """
        Check if a file is readable.

        Only the first 100 characters are read, so invalid UTF-8 further
        into the file is not detected here.

        Args:
            path: File path

        Returns:
            True if file can be opened and its start decoded as UTF-8
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                f.read(100)  # Try reading first 100 chars
        except (OSError, ValueError):
            # OSError: missing file / permissions / is a directory.
            # ValueError: covers UnicodeDecodeError for non-UTF-8 content.
            return False
        return True

    def _estimate_tokens(self, content: str) -> int:
        """
        Estimate token count for content.

        Rough estimate: ~4 characters per token for code.

        Args:
            content: Text content

        Returns:
            Estimated token count
        """
        return len(content) // self.CHARS_PER_TOKEN

    def get_total_size(self, files: Optional[List[Path]] = None) -> int:
        """
        Get total size of files in bytes.

        Files that cannot be stat'ed (e.g. removed since the scan) are
        counted as zero bytes.

        Args:
            files: List of files (or scan if None)

        Returns:
            Total size in bytes
        """
        if files is None:
            files = self.scan()

        total = 0
        for f in files:
            try:
                total += f.stat().st_size
            except OSError:
                continue
        return total

    def get_estimated_tokens(self, files: Optional[List[Path]] = None) -> int:
        """
        Estimate total token count for files.

        The estimate is derived from the on-disk byte size (used as a proxy
        for the character count) divided by ``CHARS_PER_TOKEN``; file
        contents are not read.

        Args:
            files: List of files (or scan if None)

        Returns:
            Estimated total tokens
        """
        return self.get_total_size(files) // self.CHARS_PER_TOKEN

if __name__ == '__main__':
    # Manual smoke test: scan the default directories under the default root
    # and print a short summary of what was found.
    logging.basicConfig(level=logging.INFO)

    file_scanner = FileScanner()
    found_files = file_scanner.scan()

    print(f"\nFound {len(found_files)} Python files")

    size_in_bytes = file_scanner.get_total_size(found_files)
    print(f"Total size: {size_in_bytes / 1024 / 1024:.2f} MB")

    token_estimate = file_scanner.get_estimated_tokens(found_files)
    print(f"Estimated tokens: {token_estimate:,}")

    # Show at most the first ten results as a sample
    print("\nSample files:")
    sample_files = found_files[:10]
    for sample in sample_files:
        print(f"  {sample}")
