
    i7                        d Z ddlZddlZddlmZmZ ddlmZmZmZm	Z	 ddl
mZmZmZ  ej                  d      Z	 	 ddeded	ed
ee   fdZ ej                  dej(                        Z ej                  dej(                        Zdededed
efdZdeded	ed
ee   fdZdee   d
efdZ	 	 	 ddedede	e   ded	ed
ee   fdZ	 	 d dee   dede	e   de	eeef      d
ee   f
dZ	 d!dee   dede	e   d
ee   fdZy)"u5  
MODULE 3: SMART CHUNKER
========================
Splits ExtractedContent into overlapping Chunk objects, preserving
heading hierarchy context and keeping code blocks intact.

VERIFICATION_STAMP
Story: M3 (Stories 3.01–3.05)
Verified By: parallel-builder
Verified At: 2026-02-26
Tests: 17/17
Coverage: 100%
    N)datetimetimezone)AnyDictListOptional)ChunkExtractedContentPlatformConfigz(?<=[.!?])\s+text
chunk_sizeoverlapreturnc                    | j                         } | sg S t        |       |k  r| gS t        j                  |       }|D cg c]#  }|j                         s|j                         % }}g }g }d}d}	|	t        |      k  rQ||	   }
t        |
      }||kD  r`|r$|j	                  dj                  |             g }d}d}||k  r-||z   }|
|| }|j	                  |       ||k  r||z
  n|}||k  r-|	dz  }	||rdndz   |z   }||kD  r|r|j	                  dj                  |             g }d}t        |      D ],  }|j                  d|       |t        |      dz   z  }||k\  s, n |}t        d |D              t        dt        |      dz
        z   }|j	                  |
       |t        |      dkD  rdnd|z   z  }|	dz  }	|	t        |      k  rQ|r |j	                  dj                  |             |S c c}w )a  Split text into overlapping chunks at sentence boundaries.

    Rules:
    - Prefer splitting at sentence endings (. ! ? followed by whitespace).
    - Each chunk is at most ``chunk_size`` characters.
    - Consecutive chunks share up to ``overlap`` characters at their boundary.
    - Empty input returns an empty list.
    - Text shorter than chunk_size returns a single chunk.
    r       c              3   2   K   | ]  }t        |        y wN)len).0ss     (/mnt/e/genesis-system/core/kb/chunker.py	<genexpr>zchunk_text.<locals>.<genexpr>f   s     <c!f<s   )
stripr   _SENTENCE_END_REsplitappendjoinreversedinsertsummax)r   r   r   partsp	sentenceschunkscurrent_charscurrent_lenisentencesentence_lenstartendpiece	projectedoverlap_sentencesoverlap_lenr   s                      r   
chunk_textr2      s0    ::<D	
4yJv
 ""4(E/4B!	AGGIBIBF!MK	A
c)n
Q<8} *$chh}56 " E,&j( s+e$),|);g	 ,&
 FA  11=L	z!mMM#((=12 ,.Km, !((A.s1vz)')	 .M<m<<s1c-FX[\F\?]]KX&S/!3lJJ	QW c)n
Z chh}-.Mo Cs   G5G5^(#{1,6})\s+(.+)$z```[\s\S]*?```
source_urlchunk_indexc                 z    |  d| d| }t        j                  |j                  d            j                         S )z@Deterministic SHA-256 hash of (source_url + chunk_index + text). zutf-8)hashlibsha256encode	hexdigest)r4   r5   r   payloads       r   _compute_chunk_idr=   z   s9    DT$8G>>'..12<<>>    c                    g }d}t         j                  |       D ][  }| ||j                          }|r|j                  d|f       |j                  d|j	                         f       |j                         }] | |d }|r|j                  d|f       g g ddfd}|D ]  \  }	}
t        |
      }|	rJ||kD  r |        j                  |
       1|z   |kD  r |        |
g|Fj                  |
       |z  ]t        |
||      }|D ]4  }t        |      }|z   |kD  r	r |        j                  |       |z  6   |        D cg c]  }|j                         s| c}S c c}w )u  
    Split *text* into chunks while keeping fenced code blocks (```…```) intact.

    Strategy:
    1. Tokenise the text into alternating prose / code-fence segments.
    2. Pack segments greedily into chunks of at most chunk_size chars.
    3. When a code block fits in the current chunk, add it whole.
    4. When a code block is too large for even an empty chunk, emit it as its
       own (oversized) chunk rather than splitting it mid-block.
    5. Apply overlap at sentence boundaries between consecutive prose chunks.
    r   FTNc                  R    r  j                  dj                               g dy )N r   )r   r   )r&   r(   current_partss   r   flushz,_split_preserving_code_blocks.<locals>.flush   s%    MM"''-01r>   )r   r   )r   N)	_CODE_FENCE_REfinditerr,   r   groupr-   r   r2   r   )r   r   r   segmentslast_endmprosetailrC   is_codesegseg_lenprose_chunkspcpc_lencr&   r(   rB   s                   @@@r   _split_preserving_code_blocksrS      s   " (*HH$$T* Xaggi(OOUEN+qwwy)*557 	?D&F!MK ! &c(#c"w&3!$%$$S)w& &cj'RL" &R'*4G$$R(v%&'&4 
G+!A+++s   E3,E3lines_beforec                 N   g }| D ]  }t        j                  d|      }|st        |j                  d            }|j                  d      j	                         }|D cg c]  }|d   |k  s| }}|j                  ||f        |sydj                  d |D              S c c}w )zDReturn the active heading hierarchy as a string like '# H1 > ## H2'.r3   r      r   rA   z > c              3   6   K   | ]  \  }}d |z   d|   yw)#r   N )r   lvltxts      r   r   z+_extract_heading_context.<locals>.<genexpr>   s#     IxsCs1SE*Is   )rematchr   rF   r   r   r   )rT   heading_stacklinerI   leveltitlehs          r   _extract_heading_contextrc      s    !M 1HH)40
OEGGAJ$$&E(5F11QFMF  %01 ::I=III	 Gs   B",B"contentplatformcustomer_idc                 n   | j                   j                         }|sg S |j                         }g }g }g }	|D ]`  }
t        j                  d|
      }|r$|r"t        |	      }|j                  ||f       |
g}n|j                  |
       |	j                  |
       b |rt        |	      }|j                  ||f       g }|D ]m  \  }}dj                  |      j                         }|s(t        |||      }|D ]4  }|j                         s|j                  ||j                         f       6 o t        |      }g }t        |      D ]l  \  }\  }}t        | j                  ||      }t        || j                  ||| j                  ||||t        | j                         
      }|j                  |       n |S )a+  Chunk content preserving heading hierarchy context.

    - Each Chunk's ``heading_context`` contains the nearest parent headings.
    - Prefers to split at heading boundaries.
    - Keeps fenced code blocks intact.
    - chunk_id is a deterministic SHA-256 of (source_url + chunk_index + text).
    z
^#{1,6}\s+

chunk_idr4   re   rf   ra   r   heading_contextr5   total_chunksmetadata)r   r   
splitlinesr\   r]   rc   r   r   rS   r   	enumerater=   urlr	   ra   dictrm   )rd   re   rf   r   r   r   linessectionscurrent_section_lines
seen_linesr_   
is_headingctx
raw_chunksheading_ctx	sec_linessection_text
sub_chunkssctotalresultidxchunk_text_valrj   chunks                            r   chunk_with_headingsr      s    <<D	OOE -/H')J  XXmT2
/*:6COOS"789%)F!!((.$  &z2345 )+J"* =Yyy+1132<WU
 	=Bxxz!!;
";<	== 
OEF.7
.C **k>$W[[#~F{{#--''**+
 	e  Mr>   r&   extra_metadatac                    t        j                  t        j                        j	                  d      }g }| D ]  }t        |j                        }||d<   |j                  |d<   |r|j                  |       t        |j                  |j                  |||j                  |j                  |j                  |j                  |j                  |
      }|j!                  |        |S )a  Apply platform and customer tags to chunks for multi-tenant isolation.

    - Returns NEW Chunk objects (originals are not mutated).
    - Adds ``ingested_at`` (ISO 8601) and ``source_url`` to metadata.
    - Merges ``extra_metadata`` into each chunk's metadata.
    )tzz%Y-%m-%dT%H:%M:%SZingested_atr4   ri   )r   nowr   utcstrftimerq   rm   r4   updater	   rj   ra   r   rk   r5   rl   r   )	r&   re   rf   r   now_isor   r   new_metataggeds	            r   
tag_chunksr   &  s     llhll+445IJGF #'#7")!&!1!1OON+^^''#++!11))++
 	f'* Mr>   contentsconfigc                     g }| D ]k  }||j                   r|j                   j                         s-t        ||j                  ||j                  |j
                        }|j                  |       m |S )zChunk multiple extracted contents using platform config.

    - Uses config.chunk_size and config.chunk_overlap for all documents.
    - Skips None or empty-text contents gracefully.
    - Returns a flat list of all resulting Chunks.
    )rd   re   rf   r   r   )r   r   r   namer   chunk_overlapextend)r   r   rf   
all_chunksrd   page_chunkss         r   chunk_batchr   Q  sz     !J '?||7<<#5#5#7)[[#((((
 	+&' r>   )     )Nr   r   )NNr   )__doc__r8   r\   r   r   typingr   r   r   r   core.kb.contractsr	   r
   r   compiler   strintlistr2   	MULTILINE_HEADING_RErD   r=   rS   rc   r   r   r   rY   r>   r   <module>r      s    	 ' , , E E 2::./ 
 P
PP P 
#Y	Pn bjj-r||<-r||<?# ?C ?s ?s ?C,
C,C, C, 
#Y	C,LJ49 J J. "&EEE #E 	E
 E 
%[E^ "&/3	$K$$ #$ T#s(^,	$
 
%[$\ "&#$ # 
%[	r>   