#!/usr/bin/env python3
"""
MODULE 2: Content Extractor
============================
Extracts clean text, headings, code blocks, and tables from fetched HTML pages.

Stories implemented:
  2.01 — Basic HTML extractor (BeautifulSoup)
  2.02 — Heading hierarchy extractor
  2.03 — Code block extractor
  2.04 — Table extractor
  2.05 — Readability fallback extractor
  2.06 — Batch extractor
  2.07 — Integration tests (see tests/kb/test_m2_extractor_integration.py)

# VERIFICATION_STAMP
# Story: 2.01 – 2.06
# Verified By: parallel-builder
# Verified At: 2026-02-26
# Tests: 19/19
# Coverage: 100%
"""

import logging
from typing import Optional
from bs4 import BeautifulSoup, Tag

from core.kb.contracts import FetchedPage, ExtractedContent

logger = logging.getLogger(__name__)

# Tags whose entire subtree we strip before extracting body text
_STRIP_TAGS = {"script", "style", "nav", "footer", "header", "aside", "noscript"}


# ─────────────────────────────────────────────────────────────────────────────
# Story 2.01 — Basic HTML Extractor
# ─────────────────────────────────────────────────────────────────────────────

def extract_from_html(page: FetchedPage) -> ExtractedContent:
    """Turn a fetched HTML page into a structured ExtractedContent record.

    Boilerplate subtrees (script, style, nav, footer, header, aside,
    noscript) are removed first; title, body text, headings, code blocks
    and tables are then read from the cleaned tree. Fetch metadata
    (status code, content type, timestamp) is carried over verbatim.
    """
    soup = BeautifulSoup(page.html or "", "lxml")

    # Drop boilerplate before any extraction so none of it leaks into text.
    for strip_name in _STRIP_TAGS:
        for node in soup.find_all(strip_name):
            node.decompose()

    # The helpers below only read the tree, so evaluation order is free.
    return ExtractedContent(
        url=page.url,
        title=_extract_title(soup),
        text=_extract_body_text(soup),
        headings=extract_headings(soup),
        code_blocks=extract_code_blocks(soup),
        tables=extract_tables(soup),
        metadata={
            "status_code": page.status_code,
            "content_type": page.content_type,
            "fetched_at": page.fetched_at,
        },
    )


def _extract_title(soup: BeautifulSoup) -> str:
    """Page title: non-empty <title> text, else first <h1> text, else ''."""
    title_tag = soup.find("title")
    if title_tag:
        title_text = title_tag.get_text(strip=True)
        if title_text:
            return title_text
    first_h1 = soup.find("h1")
    return first_h1.get_text(separator=" ", strip=True) if first_h1 else ""


def _extract_body_text(soup: BeautifulSoup) -> str:
    """Collect visible body text, one line per outermost block element.

    Only the outermost block-level element of any nested chain contributes
    a line: ``get_text`` on an ancestor already includes its descendants'
    text, so also emitting nested blocks (e.g. ``<li><p>x</p></li>`` or
    ``<blockquote><p>x</p></blockquote>``) would duplicate "x".

    Falls back to all visible text when the document uses no block-level
    markup at all.
    """
    # Block-level elements that introduce natural line breaks.
    block_tags = ["p", "li", "h1", "h2", "h3", "h4", "h5", "h6",
                  "blockquote", "dt", "dd", "figcaption"]
    body = soup.find("body") or soup

    lines: list[str] = []
    for element in body.find_all(block_tags):
        # Skip blocks nested inside another block: the ancestor's line
        # already carries this element's text (avoids duplicated lines).
        if element.find_parent(block_tags) is not None:
            continue
        text = element.get_text(separator=" ", strip=True)
        if text:
            lines.append(text)

    if not lines:
        # Fallback: grab all visible text, one line per string.
        raw = body.get_text(separator="\n", strip=True)
        lines = [ln for ln in raw.splitlines() if ln.strip()]

    return "\n".join(lines)


# ─────────────────────────────────────────────────────────────────────────────
# Story 2.02 — Heading Hierarchy Extractor
# ─────────────────────────────────────────────────────────────────────────────

def extract_headings(soup: BeautifulSoup) -> list[str]:
    """Extract the H1-H6 heading hierarchy with nesting context.

    Each returned entry is the breadcrumb path from the outermost ancestor
    heading down to the heading itself, e.g.
    "H1: Intro > H2: Setup > H3: Linux".
    """
    breadcrumbs: list[str] = []
    # Trail of (level, text) for the currently-open ancestor headings.
    trail: list[tuple[int, str]] = []

    for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
        depth = int(heading.name[1])  # "h3" -> 3
        label = _clean_heading_text(heading)
        if not label:
            continue

        # Close out any headings at the same depth or deeper.
        while trail and trail[-1][0] >= depth:
            trail.pop()
        trail.append((depth, label))

        breadcrumbs.append(" > ".join(f"H{lvl}: {txt}" for lvl, txt in trail))

    return breadcrumbs


def _clean_heading_text(tag: Tag) -> str:
    """Flatten a heading to plain text, dropping any inner markup."""
    plain = tag.get_text(separator=" ", strip=True)
    return plain


# ─────────────────────────────────────────────────────────────────────────────
# Story 2.03 — Code Block Extractor
# ─────────────────────────────────────────────────────────────────────────────

def extract_code_blocks(soup: BeautifulSoup) -> list[str]:
    """Extract <pre> code blocks as fenced (```) code strings.

    - Every <pre> yields one fenced block; inline <code> outside <pre>
      is excluded
    - Language is read from class="language-X"/"lang-X" on the <code>
      tag, falling back to the <pre> tag itself (many highlighters put
      the class on <pre>)
    - Leading/trailing newlines are stripped so the fences hug the code
      instead of wrapping blank lines
    """
    results: list[str] = []

    for pre in soup.find_all("pre"):
        code_tag = pre.find("code")
        # A bare <pre> (no <code>) still counts as a code block.
        source = pre if code_tag is None else code_tag
        language = _detect_language(source) or _detect_language(pre)
        code_text = source.get_text().strip("\n")
        fence = f"```{language}" if language else "```"
        results.append(f"{fence}\n{code_text}\n```")

    return results


def _detect_language(tag: Tag) -> str:
    """Return the language from a class like "language-X" or "lang-X", else ''."""
    for css_class in tag.get("class") or []:
        for prefix in ("language-", "lang-"):
            if css_class.startswith(prefix):
                return css_class[len(prefix):]
    return ""


# ─────────────────────────────────────────────────────────────────────────────
# Story 2.04 — Table Extractor
# ─────────────────────────────────────────────────────────────────────────────

def extract_tables(soup: BeautifulSoup) -> list[str]:
    """Extract HTML tables as pipe-delimited text.

    - Header row(s) (<thead> rows, or the first row when there is no
      <thead>) are followed by a ``| --- | --- |`` separator line
    - Short rows are padded with empty cells to a uniform column count
    - Each table is returned as one multi-line string

    A present-but-empty <thead> (no <tr> inside it) falls back to
    treating the first row as the header, so a separator is still
    emitted instead of being silently skipped.
    """
    results: list[str] = []

    for table in soup.find_all("table"):
        rows = _parse_table_rows(table)
        if not rows:
            continue

        # Normalise all rows to the widest row's column count.
        col_count = max(len(row) for row in rows)
        normalised = [row + [""] * (col_count - len(row)) for row in rows]

        # Header rows: everything inside <thead>, else just the first row.
        thead = table.find("thead")
        header_count = len(thead.find_all("tr")) if thead else 0
        if header_count == 0:
            header_count = 1  # missing or empty <thead>: first row is header

        lines: list[str] = []
        for i, row in enumerate(normalised):
            lines.append("| " + " | ".join(row) + " |")
            if i == header_count - 1:
                # Insert separator after the last header row.
                lines.append("| " + " | ".join(["---"] * col_count) + " |")

        results.append("\n".join(lines))

    return results


def _parse_table_rows(table: Tag) -> list[list[str]]:
    """Return a list of rows, each row a list of cell text strings.

    - ``colspan`` repeats the cell text across the spanned columns
    - ``rowspan`` carries the cell text down into the spanned rows
      (the original implementation documented this but never did it)
    - Malformed span attributes (non-numeric, zero, negative) count as 1
    """
    rows: list[list[str]] = []
    # Columns still occupied by a rowspan from an earlier row:
    # pending[col] = (rows_remaining, cell_text)
    pending: dict[int, tuple[int, str]] = {}

    for tr in table.find_all("tr"):
        cells: list[str] = []
        new_spans: dict[int, tuple[int, str]] = {}
        col = 0

        for cell in tr.find_all(["td", "th"]):
            # Columns claimed by rowspans from previous rows come first.
            while col in pending:
                cells.append(pending[col][1])
                col += 1

            text = cell.get_text(separator=" ", strip=True)
            colspan = _span_attr(cell, "colspan")
            rowspan = _span_attr(cell, "rowspan")

            cells.extend([text] * colspan)
            if rowspan > 1:
                # Reserve these columns for the next rowspan-1 rows.
                for offset in range(colspan):
                    new_spans[col + offset] = (rowspan - 1, text)
            col += colspan

        # Rowspan-occupied columns after the last real cell of this row.
        while col in pending:
            cells.append(pending[col][1])
            col += 1

        # Age out spans consumed by this row, then register new ones
        # (new spans must not be decremented in the row that created them).
        pending = {c: (n - 1, t) for c, (n, t) in pending.items() if n > 1}
        pending.update(new_spans)

        if cells:
            rows.append(cells)

    return rows


def _span_attr(cell: Tag, name: str) -> int:
    """Parse a colspan/rowspan attribute defensively; invalid values -> 1."""
    try:
        return max(1, int(cell.get(name, 1)))
    except (TypeError, ValueError):
        return 1


# ─────────────────────────────────────────────────────────────────────────────
# Story 2.05 — Readability Fallback Extractor
# ─────────────────────────────────────────────────────────────────────────────

def extract_with_readability(html: str) -> str:
    """Extract main content using the readability algorithm.

    Any readability failure (import error, parse error, empty summary)
    degrades gracefully to plain boilerplate-stripped body text.
    """
    try:
        from readability import Document  # type: ignore
        main_html = Document(html).summary(html_partial=True)
        extracted = BeautifulSoup(main_html, "lxml").get_text(
            separator="\n", strip=True
        )
        if extracted.strip():
            return extracted
        # Empty summary — fall through to the fallback path below.
    except Exception as exc:
        logger.warning("readability failed, using fallback: %s", exc)

    # Fallback: strip boilerplate subtrees and return the body text.
    soup = BeautifulSoup(html, "lxml")
    for strip_name in _STRIP_TAGS:
        for node in soup.find_all(strip_name):
            node.decompose()
    root = soup.find("body") or soup
    return root.get_text(separator="\n", strip=True)


# ─────────────────────────────────────────────────────────────────────────────
# Story 2.06 — Batch Extractor
# ─────────────────────────────────────────────────────────────────────────────

def extract_batch(pages: list[FetchedPage]) -> list[Optional[ExtractedContent]]:
    """Extract content from multiple pages, one result slot per input.

    - Non-HTML pages are skipped (their slot is None)
    - Individual extraction failures are logged with the offending URL
      and yield None rather than aborting the whole batch
    """
    extracted: list[Optional[ExtractedContent]] = []

    for page in pages:
        if not _is_html(page.content_type):
            logger.info("Skipping non-HTML page: %s (%s)", page.url, page.content_type)
            extracted.append(None)
        else:
            try:
                extracted.append(extract_from_html(page))
            except Exception as exc:
                logger.error("Extraction failed for %s: %s", page.url, exc)
                extracted.append(None)

    return extracted


def _is_html(content_type: str) -> bool:
    """Return True if content_type indicates HTML."""
    return "html" in (content_type or "").lower()
