#!/usr/bin/env python3
"""
Component Interface Contracts
==============================
Shared dataclasses and Protocol classes for the KB ingestion pipeline.
Every module MUST use these types for inter-module communication.
"""

from typing import Protocol, List, Dict, Any, Optional
from dataclasses import dataclass, field


# ──────────────────────────────────────────────────────────────────────
# Dataclasses — shared data structures
# ──────────────────────────────────────────────────────────────────────

@dataclass
class FetchedPage:
    """Raw page record produced by the Fetcher module.

    A plain data holder: the class performs no validation and every one
    of its six fields is required.

    Attributes:
        url: Address the page was fetched from.
        html: Page body as text.
        status_code: Integer status of the fetch (presumably the HTTP
            response code -- confirm against the Fetcher implementation).
        content_type: Content type reported for the page.
        headers: String-to-string header mapping.
        fetched_at: Fetch timestamp as an ISO 8601 string.
    """

    url: str
    html: str
    status_code: int
    content_type: str
    headers: Dict[str, str]
    fetched_at: str


@dataclass
class ExtractedContent:
    """Structured page content produced by the Extractor module.

    Attributes:
        url: Address of the page the content came from.
        title: Page title.
        text: Clean text content.
        headings: H1-H6 heading hierarchy, as a list of strings.
        code_blocks: Extracted code snippets.
        tables: Extracted tables, rendered as text.
        metadata: Free-form extra data. Optional; each instance gets its
            own fresh empty dict when the argument is omitted.
    """

    url: str
    title: str
    text: str
    headings: List[str]
    code_blocks: List[str]
    tables: List[str]
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class Chunk:
    """One slice of a source page, produced by the Chunker module.

    None of the documented limits below are enforced by this class; it
    is a plain data holder.

    Attributes:
        chunk_id: Deterministic hash of the content.
        source_url: Address of the page the chunk was cut from.
        platform: Platform key, e.g. "hubspot", "ghl", "xero".
        customer_id: Tenant identifier used for multi-tenant isolation.
            May be None, but has no default, so it must always be passed.
        title: Title associated with the chunk.
        text: Chunk text (<=1500 chars).
        heading_context: Parent heading hierarchy.
        chunk_index: Position of this chunk within its source page.
        total_chunks: Total number of chunks cut from the source page.
        metadata: Free-form extra data. Optional; each instance gets its
            own fresh empty dict when the argument is omitted.
    """

    chunk_id: str
    source_url: str
    platform: str
    customer_id: Optional[str]
    title: str
    text: str
    heading_context: str
    chunk_index: int
    total_chunks: int
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class EmbeddedChunk:
    """A chunk paired with its embedding, produced by the Embedder module.

    The dimensionality and model name documented below are part of the
    written contract only; this class does not check them.

    Attributes:
        chunk: The Chunk that was embedded.
        vector: Embedding values (3072-dim gemini-embedding-001).
        embedding_model: Name of the embedding model; always
            "gemini-embedding-001".
    """

    chunk: Chunk
    vector: List[float]
    embedding_model: str


@dataclass
class PlatformConfig:
    """Platform registry entry.

    Only the first three fields are required; everything else carries a
    default. No validation is performed on any value.

    Attributes:
        name: Platform key, e.g. "hubspot".
        display_name: Human-readable name, e.g. "HubSpot".
        docs_base_url: Documentation root, e.g.
            "https://knowledge.hubspot.com".
        sitemap_url: Optional sitemap address; None by default.
        url_patterns: Patterns selecting URLs; defaults to ["*"].
        exclude_patterns: Patterns for URLs to leave out; empty by default.
        auth_type: One of "none", "api_key", "cookie"; defaults to "none".
        auth_config: String-to-string settings for the chosen auth type;
            empty by default.
        chunk_size: Defaults to 1500 (unit not stated here; presumably
            characters -- confirm with the Chunker).
        chunk_overlap: Defaults to 200 (presumably the same unit as
            chunk_size -- confirm with the Chunker).
        max_pages: Page limit; defaults to 5000.
        refresh_hours: Refresh interval in hours; 168 means weekly.
        use_browserless: Off by default (presumably switches fetching to
            a Browserless-style renderer -- confirm with the Fetcher).
    """

    # Required identity fields.
    name: str
    display_name: str
    docs_base_url: str

    # URL discovery and filtering. Mutable defaults go through factories
    # so that instances never share a list.
    sitemap_url: Optional[str] = None
    url_patterns: List[str] = field(default_factory=lambda: ["*"])
    exclude_patterns: List[str] = field(default_factory=list)

    # Authentication.
    auth_type: str = "none"
    auth_config: Dict[str, str] = field(default_factory=dict)

    # Chunking, crawl limit and refresh cadence.
    chunk_size: int = 1500
    chunk_overlap: int = 200
    max_pages: int = 5000
    refresh_hours: int = 168

    use_browserless: bool = False


# ──────────────────────────────────────────────────────────────────────
# Protocol classes — interface contracts
# ──────────────────────────────────────────────────────────────────────

class IFetcher(Protocol):
    """Structural contract for the Fetcher module.

    All three methods are coroutines. Implementations satisfy the
    contract by shape and do not need to inherit from this class.
    """

    async def fetch_page(
        self,
        url: str,
        config: PlatformConfig,
    ) -> FetchedPage:
        """Fetch one URL under the given platform config.

        Args:
            url: Address to fetch.
            config: Registry entry for the platform being fetched.

        Returns:
            The fetched page record.
        """
        ...

    async def fetch_sitemap(self, url: str) -> List[str]:
        """Fetch the sitemap at ``url``.

        Returns:
            A list of strings (presumably the page URLs the sitemap
            lists -- confirm with the implementation).
        """
        ...

    async def fetch_pages(
        self,
        urls: List[str],
        config: PlatformConfig,
        concurrency: int = 5,
    ) -> List[FetchedPage]:
        """Fetch several URLs under the given platform config.

        Args:
            urls: Addresses to fetch.
            config: Registry entry for the platform being fetched.
            concurrency: Defaults to 5 (presumably the cap on
                simultaneous fetches -- implementation-defined).

        Returns:
            The fetched page records.
        """
        ...


class IExtractor(Protocol):
    """Structural contract for the Extractor module.

    Both methods are synchronous. Implementations satisfy the contract
    by shape and do not need to inherit from this class.
    """

    def extract(self, page: FetchedPage) -> ExtractedContent:
        """Turn one fetched page into its extracted content."""
        ...

    def extract_batch(
        self,
        pages: List[FetchedPage],
    ) -> List[ExtractedContent]:
        """Turn a list of fetched pages into a list of extracted contents."""
        ...


class IChunker(Protocol):
    """Structural contract for the Chunker module.

    Both methods are synchronous and take the platform config alongside
    the content. Implementations satisfy the contract by shape and do
    not need to inherit from this class.
    """

    def chunk(
        self,
        content: ExtractedContent,
        config: PlatformConfig,
    ) -> List[Chunk]:
        """Split one extracted content record into chunks.

        Args:
            content: Extracted page content to split.
            config: Registry entry for the platform the content belongs to.

        Returns:
            The chunks produced from ``content``.
        """
        ...

    def chunk_batch(
        self,
        contents: List[ExtractedContent],
        config: PlatformConfig,
    ) -> List[Chunk]:
        """Split several extracted content records into chunks.

        Args:
            contents: Extracted page contents to split.
            config: Registry entry applied to every item in ``contents``.

        Returns:
            A single flat list of chunks.
        """
        ...


class IEmbedder(Protocol):
    """Structural contract for the Embedder module.

    Both methods are synchronous. Implementations satisfy the contract
    by shape and do not need to inherit from this class.
    """

    def embed(self, text: str) -> List[float]:
        """Embed a single string and return its vector as a list of floats."""
        ...

    def embed_batch(
        self,
        chunks: List[Chunk],
        batch_size: int = 50,
    ) -> List[EmbeddedChunk]:
        """Embed a list of chunks.

        Args:
            chunks: Chunks to embed.
            batch_size: Defaults to 50 (presumably how many chunks go
                into one embedding request -- implementation-defined).

        Returns:
            Embedded chunk records.
        """
        ...


class IStore(Protocol):
    """Structural contract for the storage module.

    All methods are synchronous and return an int; the meaning of that
    int is not defined here (presumably the number of records affected
    -- confirm with the implementation). Implementations satisfy the
    contract by shape and do not need to inherit from this class.
    """

    def upsert_vectors(
        self,
        embedded_chunks: List[EmbeddedChunk],
    ) -> int:
        """Upsert the given embedded chunks."""
        ...

    def upsert_metadata(self, chunks: List[Chunk]) -> int:
        """Upsert metadata for the given chunks."""
        ...

    def delete_platform(
        self,
        platform: str,
        customer_id: Optional[str] = None,
    ) -> int:
        """Delete stored data for a platform.

        Args:
            platform: Platform key whose data is to be deleted.
            customer_id: Optional tenant identifier; defaults to None
                (how None is interpreted is implementation-defined).
        """
        ...


class IOrchestrator(Protocol):
    """Structural contract for the pipeline orchestrator.

    The two ingest methods are coroutines; ``get_status`` is synchronous.
    Every method returns a ``Dict[str, Any]`` whose keys are not defined
    by this contract. Implementations satisfy the contract by shape and
    do not need to inherit from this class.
    """

    async def ingest_platform(
        self,
        platform: str,
        customer_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Run ingestion for a platform.

        Args:
            platform: Platform key to ingest.
            customer_id: Optional tenant identifier; defaults to None.
        """
        ...

    async def ingest_url(
        self,
        url: str,
        platform: str,
        customer_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Run ingestion for a single URL.

        Args:
            url: Address to ingest.
            platform: Platform key the URL belongs to.
            customer_id: Optional tenant identifier; defaults to None.
        """
        ...

    def get_status(self, platform: str) -> Dict[str, Any]:
        """Return status information for a platform."""
        ...
