#!/usr/bin/env python3
"""
Analyze Source - Extract skill-relevant information from source documents.

Part of the genesis-architect skill.

Usage:
    python analyze_source.py <source_file> [--output json|markdown]

Arguments:
    source_file     Path to source document to analyze
    --output        Output format (json or markdown, default: json)

Examples:
    python analyze_source.py INDYDEVDAN_AGENTIC_MASTERY.md
    python analyze_source.py protocol.md --output markdown

Output:
    Analysis report with:
    - Identified topics/domains
    - Potential skill boundaries
    - Suggested triggers
    - Recommended structure
"""

import argparse
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional
from dataclasses import dataclass, asdict


@dataclass
class SkillCandidate:
    """A potential skill identified from source analysis."""
    name: str  # kebab-case identifier derived from the section title
    domain: str  # domain bucket (e.g. "agentic-coding"); "general" when no keyword matched
    triggers: List[str]  # example phrases that should activate the skill
    sections: List[str]  # source section titles covered by this candidate
    estimated_tokens: int  # rough size estimate (~4 characters per token)
    confidence: float  # 0-1 heuristic score; the analyzer caps it at 0.95


@dataclass
class AnalysisResult:
    """Result of source analysis."""
    source_file: str  # path of the analyzed document, as given on the CLI
    total_lines: int  # line count of the source (split on '\n')
    total_chars: int  # total character count of the source
    heading_count: int  # number of markdown headings found
    code_block_count: int  # number of fenced ``` code blocks (fence pairs)
    skill_candidates: List[SkillCandidate]  # candidates sorted by descending confidence
    recommended_structure: Dict  # summary produced by generate_recommended_structure()


def parse_args() -> argparse.Namespace:
    """Build the CLI parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Positional: the document to analyze.
    parser.add_argument("source_file", type=str, help="Path to source document")

    # Report format selector.
    parser.add_argument(
        "--output", "-o",
        type=str,
        choices=["json", "markdown"],
        default="json",
        help="Output format",
    )

    # Sections smaller than this are ignored by the candidate search.
    parser.add_argument(
        "--min-section-size",
        type=int,
        default=500,
        help="Minimum characters for a section to be skill candidate",
    )

    return parser.parse_args()


def extract_headings(content: str) -> List[Dict]:
    """Extract markdown headings with their levels and positions."""
    # ATX headings: 1-6 hashes, whitespace, then a non-empty title.
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    found = []

    for line_no, text in enumerate(content.split('\n')):
        m = heading_re.match(text)
        if m is None:
            continue
        raw_title = m.group(2).strip()
        found.append({
            "level": len(m.group(1)),
            "title": raw_title,
            "line": line_no,
            # Title with all punctuation stripped, for name generation.
            "clean_title": re.sub(r'[^a-zA-Z0-9\s]', '', raw_title).strip(),
        })

    return found


def extract_sections(content: str, headings: List[Dict]) -> List[Dict]:
    """Extract content sections based on headings."""
    lines = content.split('\n')
    total = len(lines)
    result = []

    for idx, head in enumerate(headings):
        begin = head["line"]
        # A section runs until the next heading at the same depth or
        # shallower; if none exists it runs to end of file.
        finish = next(
            (nh["line"] for nh in headings[idx + 1:] if nh["level"] <= head["level"]),
            total,
        )

        body = '\n'.join(lines[begin:finish])
        result.append({
            "title": head["title"],
            "level": head["level"],
            "content": body,
            "char_count": len(body),
            "line_count": finish - begin,
        })

    return result


def identify_skill_candidates(
    sections: List[Dict],
    min_size: int = 500
) -> List[SkillCandidate]:
    """Identify potential skills from sections.

    Args:
        sections: Section dicts as produced by extract_sections().
        min_size: Minimum character count for a section to qualify.

    Returns:
        SkillCandidate list sorted by descending confidence.
    """
    candidates: List[SkillCandidate] = []

    # Keywords that suggest skill-worthy content.
    skill_keywords = [
        "protocol", "pattern", "framework", "workflow", "system",
        "architecture", "strategy", "process", "method", "approach"
    ]

    # Title keyword -> domain label; first match (in this order) wins.
    domain_keywords = {
        "agent": "agentic-coding",
        "memory": "memory-systems",
        "prompt": "prompt-engineering",
        "multi": "multi-agent",
        "workflow": "workflows",
        "pattern": "patterns",
        "tool": "tooling"
    }

    for section in sections:
        # Skip small sections and top-level overview sections.
        if section["char_count"] < min_size or section["level"] == 1:
            continue

        title_lower = section["title"].lower()

        # Base confidence, boosted 0.1 per skill keyword in the title.
        confidence = 0.5
        for keyword in skill_keywords:
            if keyword in title_lower:
                confidence += 0.1

        # A fenced code block needs two ``` markers, so halve the marker
        # count; at least one complete block indicates actionable content.
        # (The raw marker count previously stood in for the block count.)
        fence_markers = len(re.findall(r'```', section["content"]))
        if fence_markers // 2 >= 1:
            confidence += 0.15

        # Cap confidence.
        confidence = min(confidence, 0.95)

        # Skill name: title stripped to alphanumerics, kebab-cased.
        skill_name = re.sub(r'[^a-zA-Z0-9\s]', '', section["title"])
        skill_name = re.sub(r'\s+', '-', skill_name.lower().strip())
        if not skill_name:
            # A title made entirely of symbols would otherwise yield "".
            skill_name = "unnamed-skill"

        # Extract domain from title.
        domain = "general"
        for keyword, domain_name in domain_keywords.items():
            if keyword in title_lower:
                domain = domain_name
                break

        # Generate trigger phrases.
        triggers = [
            f"Use {title_lower}",
            f"Apply the {title_lower} approach",
            f"Implement {skill_name} pattern"
        ]

        # Estimate tokens (rough: 4 chars = 1 token).
        estimated_tokens = section["char_count"] // 4

        candidates.append(SkillCandidate(
            name=skill_name,
            domain=domain,
            triggers=triggers,
            sections=[section["title"]],
            estimated_tokens=estimated_tokens,
            confidence=confidence
        ))

    # Highest confidence first; the stable sort keeps input order for ties.
    candidates.sort(key=lambda c: c.confidence, reverse=True)

    return candidates


def generate_recommended_structure(
    candidates: List[SkillCandidate]
) -> Dict:
    """Generate recommended skill structure based on analysis."""
    # Bucket candidate names by domain, preserving input order.
    by_domain: Dict[str, List[str]] = {}
    for cand in candidates:
        by_domain.setdefault(cand.domain, []).append(cand.name)

    top_names = [cand.name for cand in candidates[:15]]  # Top 15
    strong = sum(1 for cand in candidates if cand.confidence > 0.7)

    return {
        "recommended_skills": top_names,
        "domains": by_domain,
        "total_candidates": len(candidates),
        "high_confidence_count": strong,
    }


def analyze_source(source_path: str, min_section_size: int) -> AnalysisResult:
    """Main analysis function."""
    source = Path(source_path)
    text = source.read_text(encoding='utf-8')

    # Each fenced block contributes an opening and a closing ``` marker,
    # hence the halving.
    fenced_blocks = len(re.findall(r'```', text)) // 2

    # Structure extraction, then candidate identification.
    headings = extract_headings(text)
    sections = extract_sections(text, headings)
    candidates = identify_skill_candidates(sections, min_section_size)

    return AnalysisResult(
        source_file=str(source),
        total_lines=len(text.split('\n')),
        total_chars=len(text),
        heading_count=len(headings),
        code_block_count=fenced_blocks,
        skill_candidates=candidates,
        recommended_structure=generate_recommended_structure(candidates),
    )


def format_markdown(result: AnalysisResult) -> str:
    """Format analysis result as markdown."""
    parts = [
        f"# Source Analysis: {Path(result.source_file).name}",
        "",
        "## Metrics",
        f"- **Lines**: {result.total_lines:,}",
        f"- **Characters**: {result.total_chars:,}",
        f"- **Headings**: {result.heading_count}",
        f"- **Code Blocks**: {result.code_block_count}",
        "",
        "## Recommended Skills",
        "",
    ]

    # One subsection per candidate, capped at the top 15.
    for rank, cand in enumerate(result.skill_candidates[:15], start=1):
        parts.append(f"### {rank}. {cand.name}")
        parts.append(f"- **Domain**: {cand.domain}")
        parts.append(f"- **Confidence**: {cand.confidence:.0%}")
        parts.append(f"- **Est. Tokens**: {cand.estimated_tokens:,}")
        parts.append("- **Triggers**:")
        parts.extend(f'  - "{trigger}"' for trigger in cand.triggers)
        parts.append("")

    structure = result.recommended_structure
    parts.append("## Structure Recommendation")
    parts.append(f"- **Total Candidates**: {structure['total_candidates']}")
    parts.append(f"- **High Confidence**: {structure['high_confidence_count']}")
    parts.append("")
    parts.append("### By Domain")
    for domain, skills in structure['domains'].items():
        parts.append(f"- **{domain}**: {', '.join(skills[:5])}")

    return '\n'.join(parts)


def main():
    """CLI entry point: analyze the source file and print a report.

    Exits with status 1 on a missing input file or any analysis error.
    """
    args = parse_args()

    try:
        result = analyze_source(args.source_file, args.min_section_size)

        if args.output == "json":
            # asdict() recursively converts the dataclass tree (including
            # the nested SkillCandidate instances) into plain dicts, so
            # there is no need to rebuild the payload field by field.
            print(json.dumps(asdict(result), indent=2))
        else:
            print(format_markdown(result))

    except FileNotFoundError:
        print(f"Error: File not found - {args.source_file}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Catch-all boundary for a CLI tool: report and exit non-zero.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
