
    i0                     X   d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	  ej                  e      Zh dZdede	fd	Zd
edefdZd
edefdZd
edee   fdZdedefdZd
edee   fdZdedefdZd
edee   fdZdedeee      fdZdedefdZdee   deee	      fdZdedefdZy)uK  
MODULE 2: Content Extractor
============================
Extracts clean text, headings, code blocks, and tables from fetched HTML pages.

Stories implemented:
  2.01 — Basic HTML extractor (BeautifulSoup)
  2.02 — Heading hierarchy extractor
  2.03 — Code block extractor
  2.04 — Table extractor
  2.05 — Readability fallback extractor
  2.06 — Batch extractor
  2.07 — Integration tests (see tests/kb/test_m2_extractor_integration.py)

# VERIFICATION_STAMP
# Story: 2.01 – 2.06
# Verified By: parallel-builder
# Verified At: 2026-02-26
# Tests: 19/19
# Coverage: 100%
    N)Optional)BeautifulSoupTag)FetchedPageExtractedContent>   navasidestylefooterheaderscriptnoscriptpagereturnc                    | j                   xs d}t        |d      }t        D ](  }|j                  |      D ]  }|j	                           * t        |      }t        |      }t        |      }t        |      }t        |      }	t        | j                  ||	|||| j                  | j                  | j                  d      S )a  Extract clean text content from HTML using BeautifulSoup.

    - Strips scripts, styles, nav, footer, header, aside
    - Extracts title from <title> or first <h1>
    - Preserves paragraph structure with newlines
    - Populates all ExtractedContent fields
     lxml)status_codecontent_type
fetched_at)urltitletextheadingscode_blockstablesmetadata)htmlr   _STRIP_TAGSfind_all	decompose_extract_titleextract_headingsextract_code_blocksextract_tables_extract_body_textr   r   r   r   r   )
r   r   souptag_nametagr   r   r   r   r   s
             */mnt/e/genesis-system/core/kb/extractor.pyextract_from_htmlr+   (   s     99?Dv&D   ==* 	CMMO	 4 E%H%d+KD!Fd#DHH++ --//
     r'   c                     | j                  d      }|r$|j                  d      r|j                  d      S | j                  d      }|r|j                  dd      S y)z?Prefer <title> tag; fall back to first <h1>; else empty string.r   T)striph1 	separatorr.   r   )findget_text)r'   	title_tagr/   s      r*   r"   r"   M   s]    		'"IY''d'3!!!--	4B	{{S{55r,   c                    | j                  d      xs | }g }|j                  D ]I  }t        |t              s|j                  dv s#|j                  dd      }|s9|j                  |       K |sA|j                  dd      }|j                         D cg c]  }|j                         s| }}dj                  |      S c c}w )zHWalk the body (or full soup) and collect text with paragraph separation.body>   pdddtr/   h2h3h4h5h6li
blockquote
figcaptionr0   Tr1   
)
r3   descendants
isinstancer   namer4   append
splitlinesr.   join)r'   r7   lineselementr   rawlns          r*   r&   r&   X   s    99V$D E## #'3'<< E E##c#>DT"# mmd$m7!nn.="((*==99U >s   C-Cc                    d}t        dd      D ci c]  }d| |
 }}g }g }| j                  |      D ]  }||j                     }t        |      }|s |r)|d   d   |k\  r|j	                          |r|d   d   |k\  r|j                  ||f       dj                  d |D              }	|j                  |	        |S c c}w )	zExtract H1-H6 heading hierarchy with nesting context.

    Each entry is formatted as the breadcrumb path of ancestors plus the
    heading itself, e.g. "H1: Intro > H2: Setup > H3: Linux".
    )r/   r;   r<   r=   r>   r?         hr   z > c              3   2   K   | ]  \  }}d | d|   yw)Hz: N ).0lvltxts      r*   	<genexpr>z#extract_headings.<locals>.<genexpr>   s&       
!)cauBse 
   )ranger    rF   _clean_heading_textpoprG   rI   )
r'   heading_tagsi	level_mapresultsstackr)   levelr   
breadcrumbs
             r*   r#   r#   t   s     8L%*1a[11QC!1I1G#%E}}\* ##((#"3' b	!-IIK b	!- 	eT]# ZZ  
-2 
 

 	z"!#$ N1 2s   C r)   c                 (    | j                  dd      S )z9Strip inner HTML tags and return plain text of a heading.r0   Tr1   )r4   )r)   s    r*   r\   r\      s    <<#T<22r,   c                    g }| j                  d      D ]v  }|j                  d      }|&|j                         }|j                  d| d       <t	        |      }|j                         }|rd| nd}|j                  | d| d       x |S )zExtract <pre><code> blocks with language annotation.

    - Only block-level <pre> tags are included (inline <code> excluded)
    - Language detected from class attribute, e.g. class="language-python"
    - Prefix: ```python\ncode\n```
    precodez```
z
```z```rC   )r    r3   r4   rG   _detect_language)r'   ra   rg   code_tag	code_textlanguagefences          r*   r$   r$      s     G}}U# 588F#INNU9+U34#H-%%'	$,#hZ %%9+U345 Nr,   c                     | j                  d      xs g }|D ]D  }|j                  d      r|t        d      d c S |j                  d      s6|t        d      d c S  y)zHReturn language name from class="language-X" or class="lang-X", else ''.classz	language-Nzlang-r   )get
startswithlen)r)   classesclss      r*   ri   ri      se    ggg$"G &>>+&s;'())>>'"s7|}%%	&
 r,   c           
      H   g }| j                  d      D ]  }t        |      }|st        d |D              }|D cg c]  }|dg|t        |      z
  z  z    }}|j	                  d      }d}|rt        |j                  d            }nd}g }	t        |      D ]^  \  }
}|	j                  dd	j                  |      z   d
z          |
|dz
  k(  s5|	j                  dd	j                  dg|z        z   d
z          ` |j                  dj                  |	              |S c c}w )zExtract HTML tables as pipe-delimited text.

    - Header row (<thead> or first <tr> inside <table>) followed by separator
    - Colspan/rowspan cells have their text repeated for each span unit
    - Each table returned as one string in the list
    tablec              3   2   K   | ]  }t        |        y w)N)rr   )rV   rows     r*   rY   z!extract_tables.<locals>.<genexpr>   s     1SC1rZ   r   theadr   trrO   z| z | z |z---rC   )r    _parse_table_rowsmaxrr   r3   	enumeraterG   rI   )r'   ra   rv   rows	col_countrx   
normalisedry   header_countrJ   r_   s              r*   r%   r%      s3    Gw' ) ' 1D11	 FJJccRDIC$899J
J 

7#u~~d34LL
+ 	LFAsLL

3/$67L1$$TEJJw/B$CCdJK		L 	tyy'(5)8 N' Ks   Drv   c                    g }| j                  d      D ]s  }g }|j                  ddg      D ]E  }|j                  dd      }t        |j                  dd            }|j	                  |g|z         G |sc|j                  |       u |S )	zReturn a list of rows, each row a list of cell text strings.

    Colspan and rowspan are handled by repeating/inserting cell text.
    rz   tdthr0   Tr1   colspanrO   )r    r4   intrp   extendrG   )rv   r~   rz   cellscellr   r   s          r*   r{   r{      s    
 DnnT" KKt- 	+D==3d=;D$((9a01GLL$')*		+
 KK Kr,   r   c                    	 ddl m}  ||       }|j                  d      }t        |d      }|j	                  dd      }|j                         r|S 	 t        | d      }t        D ](  }|j                  |      D ]  }|j                           * |j                  d
      xs |}	|	j	                  dd      S # t        $ r }t        j                  d|       Y d	}~d	}~ww xY w)zExtract main content using the readability algorithm.

    Falls back to full body text if readability raises any exception.
    r   )DocumentT)html_partialr   rC   r1   z&readability failed, using fallback: %sNr7   )readabilityr   summaryr   r4   r.   	Exceptionloggerwarningr   r    r!   r3   )
r   r   docsummary_htmlr'   r   excr(   r)   r7   s
             r*   extract_with_readabilityr     s    

F(tn{{{5\62}}t4}8::<K  v&D ==* 	CMMO	 99V$D==4t=44  F?EEFs   AB9 9	C"CC"pagesc                    g }| D ]q  }t        |j                        s=t        j                  d|j                  |j                         |j                  d       U	 t        |      }|j                  |       s |S # t        $ r<}t        j                  d|j                  |       |j                  d       Y d}~d}~ww xY w)zExtract content from multiple pages.

    - Returns None for individual failures (does not crash the batch)
    - Skips pages whose content_type does not contain 'html'
    - Logs errors with the offending URL
    zSkipping non-HTML page: %s (%s)NzExtraction failed for %s: %s)	_is_htmlr   r   infor   rG   r+   r   error)r   ra   r   contentr   s        r*   extract_batchr   #  s     13G !))*KK9488TEVEVWNN4 	!'-GNN7#! N	  	!LL73GNN4  	!s   A;;	C 2B;;C r   c                 .    d| xs dj                         v S )z+Return True if content_type indicates HTML.r   r   )lower)r   s    r*   r   r   <  s    l(b//111r,   )__doc__loggingtypingr   bs4r   r   core.kb.contractsr   r   	getLogger__name__r   r   r+   strr"   r&   listr#   r\   r$   ri   r%   r{   r   r   boolr   rU   r,   r*   <module>r      s>  ,   " ;			8	$ R"K ",< "J 3 ] s 8= T#Y D3S 3S 3m S	 2# # % %49 %PS T$s)_ ,53 53 5<k* tH=M4N/O 223 24 2r,   