
    ZiF4                       d Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	 ddl
m
Z ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZ erdd	lmZ d
Z G d de      Z ed      dd       Z ed      dd       Z  ed      dd       Z! ed      dd       Z" ed      dd       Z# G d de	      Z$ ed      dd       Z%	 	 	 	 	 	 	 	 	 	 	 	 d dZ&	 	 d!	 	 	 	 	 	 	 d"dZ'd#dZ(d$dZ)	 	 d!	 	 	 	 	 	 	 d"dZ*y)%z
Grapheme cluster segmentation following Unicode Standard Annex #29.

This module provides pure-Python implementation of the grapheme cluster boundary algorithm as
defined in UAX #29: Unicode Text Segmentation.

https://www.unicode.org/reports/tr29/
    )annotations)IntEnum)	lru_cache)TYPE_CHECKING
NamedTuple   )bisearch)
GRAPHEME_L
GRAPHEME_T
GRAPHEME_VGRAPHEME_LVINCB_EXTENDINCB_LINKERGRAPHEME_LVTINCB_CONSONANTGRAPHEME_EXTENDGRAPHEME_CONTROLGRAPHEME_PREPENDGRAPHEME_SPACINGMARKEXTENDED_PICTOGRAPHICGRAPHEME_REGIONAL_INDICATOR)Iterator    c                  H    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZy)GCBz'Grapheme Cluster Break property values.r   r                        	   
            N)__name__
__module____qualname____doc__OTHERCRLFCONTROLEXTENDZWJREGIONAL_INDICATORPREPENDSPACING_MARKLVTLVLVT     L/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/wcwidth/grapheme.pyr   r   ,   sL    1E	
B	
BGF
CGL	A
A
A	B
Cr;   r   i   )maxsizec                    | dk(  rt         j                  S | dk(  rt         j                  S | dk(  rt         j                  S t	        | t
              rt         j                  S t	        | t              rt         j                  S t	        | t              rt         j                  S t	        | t              rt         j                  S t	        | t              rt         j                  S t	        | t              rt         j                   S t	        | t"              rt         j$                  S t	        | t&              rt         j(                  S t	        | t*              rt         j,                  S t	        | t.              rt         j0                  S t         j2                  S )z;Return the Grapheme_Cluster_Break property for a codepoint.r'   r$   i   )r   r-   r.   r1   	_bisearchr   r/   r   r0   r   r2   r   r3   r   r4   r
   r5   r   r6   r   r7   r   r8   r   r9   r,   ucss    r<   _grapheme_cluster_breakrB   B   s	   
 f}vv
f}vv
f}ww&'{{o&zz12%%%&'{{*+j!uuj!uuj!uuk"vvl#ww99r;   c                4    t        t        | t                    S )z6Check if codepoint has Extended_Pictographic property.)boolr?   r   r@   s    r<   _is_extended_pictographicrE   e   s     	#4566r;   c                4    t        t        | t                    S )z,Check if codepoint has InCB=Linker property.)rD   r?   r   r@   s    r<   _is_incb_linkerrG   k        	#{+,,r;   c                4    t        t        | t                    S )z/Check if codepoint has InCB=Consonant property.)rD   r?   r   r@   s    r<   _is_incb_consonantrJ   q   s     	#~.//r;   c                4    t        t        | t                    S )z,Check if codepoint has InCB=Extend property.)rD   r?   r   r@   s    r<   _is_incb_extendrL   w   rH   r;   c                  &    e Zd ZU dZded<   ded<   y)BreakResultz*Result of grapheme cluster break decision.rD   should_breakintri_countN)r(   r)   r*   r+   __annotations__r:   r;   r<   rN   rN   }   s    4Mr;   rN   c                   | t         j                  k(  r |t         j                  k(  rt        dd      S | t         j                  t         j                  t         j                  fv rt        dd      S |t         j                  t         j                  t         j                  fv rt        dd      S | t         j
                  k(  rM|t         j
                  t         j                  t         j                  t         j                  fv rt        dd      S | t         j                  t         j                  fv r/|t         j                  t         j                  fv rt        dd      S | t         j                  t         j                  fv r |t         j                  k(  rt        dd      S |t         j                  k(  rt        dd      S |t         j                  k(  rt        dd      S | t         j                  k(  rt        dd      S y)z
    Check simple GCB-pair-based break rules (cacheable).

    Returns BreakResult for rules that can be determined from GCB properties alone, or None if
    complex lookback rules (GB9c, GB11) need to be checked.
    Fr   rO   rQ   TN)r   r-   r.   rN   r/   r5   r6   r8   r9   r7   r0   r4   r3   )prev_gcbcurr_gcbs     r<   _simple_break_checkrW      st    366h#&&0:: CKK00q99 CKK00q99 355X#%%)HH:: CFFCEE?"xCEE355>'A:: CGGSUU##CEE(9:: 3:::: 3###:: 3;;:: r;   c                $   t        | |      }||S |t        j                  k(  rt        dd      S t	        ||         }t        |      r`d}|dz
  }|dk\  rTt	        ||         }	t        |	      rd}|dz  }n-t        |	      r|dz  }nt        |	      r|rt        dd      S nn|dk\  rT| t        j                  k(  rft        |      r[|dz
  }|dk\  rQt	        ||         }	t        |	      }
|
t        j                  k(  r|dz  }nt        |	      rt        dd      S n|dk\  rQ| t        j                  k(  r8|t        j                  k(  r%|dz  dk(  rt        d|dz         S t        dd      S |t        j                  k(  rdnd}t        d|      S )z
    Determine if there should be a grapheme cluster break between prev and curr.

    Implements UAX #29 grapheme cluster boundary rules.
    Fr   rT   r   Tr   )rW   r   r1   rN   ordrJ   rG   rL   rE   rB   r0   r2   )rU   rV   textcurr_idxrQ   resultcurr_ucs
has_linkeriprev_ucs	prev_props              r<   _should_breakrb      s    !84F 377::
 4>"H(#
qL1f47|Hx(!
Q *Q#H-&EAFF 1f 3778BqL1f47|H/9ICJJ&Q*84"BB 1f 3)))h#:P:P.Pa<1EHqLIIq99  6 66qAHD8<<r;   Nc              #    K   | syt        |       }||}||k\  s||k\  ryt        ||      }|}d}t        t        | |               }|t        j
                  k(  rd}t        |dz   |      D ]K  }t        t        | |               }t        ||| ||      }	|	j                  }|	j                  r	| ||  |}|}M | ||  yw)aP  
    Iterate over grapheme clusters in a Unicode string.

    Grapheme clusters are "user-perceived characters" - what a user would
    consider a single character, which may consist of multiple Unicode
    codepoints (e.g., a base character with combining marks, emoji sequences).

    :param unistr: The Unicode string to segment.
    :param start: Starting index (default 0).
    :param end: Ending index (default len(unistr)).
    :yields: Grapheme cluster substrings.

    Example::

        >>> list(iter_graphemes('cafe\u0301'))
        ['c', 'a', 'f', 'e\u0301']
        >>> list(iter_graphemes('\U0001F468\u200D\U0001F469\u200D\U0001F467'))
        ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']
        >>> list(iter_graphemes('\U0001F1FA\U0001F1F8'))
        ['o', 'k', '\U0001F1FA\U0001F1F8']

    .. versionadded:: 0.3.0
    Nr   r   )
lenminrB   rY   r   r2   rangerb   rQ   rO   )
unistrstartendlengthcluster_startrQ   rU   idxrV   r\   s
             r<   iter_graphemesrm      s     8 [F
{|u
c6
C MH 's6%='9:H 3)))UQY$ 
*3vc{+;<x63I??s++M
 s
##s   B?Cc                   t        | |dz
           }|dk(  r|dk\  r| |dz
     dk(  r|dz
  S |dk  rP|dk\  rF|dk\  rAt        | |dz
           }|dk\  r+t        |      t        j                  k(  rt	        | |dz
        S |dz
  S |dz
  }|dkD  r]||z
  t
        k  rQt        | |         }d|cxk  rdk  rn nn4t        |      t        j                  k(  rn|dz  }|dkD  r||z
  t
        k  rQ|}t        t        | |               }|t        j                  k(  rdnd}t        |dz   |      D ]D  }	t        t        | |	               }
t        ||
| |	|      }|j                  }|j                  r|	}|
}F |S )a  
    Find the start of the grapheme cluster containing the character before pos.

    Scans backwards from pos to find a safe starting point, then iterates forward using standard
    break rules to find the actual cluster boundary.

    :param text: The Unicode string.
    :param pos: Position to search before (exclusive).
    :returns: Start position of the grapheme cluster.
    r   r$   r      r   r   )rY   rB   r   r3   _find_cluster_startMAX_GRAPHEME_SCANr/   r2   rf   rb   rQ   rO   )rZ   pos	target_cpprev_cp
safe_startcprk   left_gcbrQ   r_   	right_gcbr\   s               r<   rq   rq   <  s    DqM"I DSAX$sQw-4*?Qw 4!8	T)$sQw-(G$#:7#Cs{{#R*4q99Qw qJ
q.cJ.2CCj!"2"2&#++5a
 q.cJ.2CC M&s4
+;'<=H 6 66qAH:>3' +CQL9	xD!XF??M r;   c           	     L    |dk  ryt        | t        |t        |                   S )a  
    Find the grapheme cluster boundary immediately before a position.

    :param unistr: The Unicode string to search.
    :param pos: Position in the string (0 < pos <= len(unistr)).
    :returns: Start index of the grapheme cluster containing the character at pos-1.

    Example::

        >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8)
        6
        >>> grapheme_boundary_before('a\r\nb', 3)
        1

    .. versionadded:: 0.3.6
    r   )rq   re   rd   )rg   rs   s     r<   grapheme_boundary_beforer{   p  s&    " axvs3F'<==r;   c              #     K   | syt        |       }||nt        ||      }t        |d      }||k\  s||k\  ry|}||kD  r"t        | |      }||k  ry| ||  |}||kD  r!yyw)a  
    Iterate over grapheme clusters in reverse order (last to first).

    :param unistr: The Unicode string to segment.
    :param start: Starting index (default 0).
    :param end: Ending index (default len(unistr)).
    :yields: Grapheme cluster substrings in reverse order.

    Example::

        >>> list(iter_graphemes_reverse('cafe\u0301'))
        ['e\u0301', 'f', 'a', 'c']

    .. versionadded:: 0.3.6
    Nr   )rd   re   maxrq   )rg   rh   ri   rj   rs   rk   s         r<   iter_graphemes_reverser~     s     ( [FK&Sf%5CqME|u
C
++FC85 ]3'' +s   AA" A")rA   rP   returnr   )rA   rP   r   rD   )rU   r   rV   r   r   zBreakResult | None)rU   r   rV   r   rZ   strr[   rP   rQ   rP   r   rN   )r   N)rg   r   rh   rP   ri   z
int | Noner   zIterator[str])rZ   r   rs   rP   r   rP   )rg   r   rs   rP   r   rP   )+r+   
__future__r   enumr   	functoolsr   typingr   r   r	   r?   table_graphemer
   r   r   r   r   r   r   r   r   r   r   r   r   r   collections.abcr   rr   r   rB   rE   rG   rJ   rL   rN   rW   rb   rm   rq   r{   r~   r:   r;   r<   <module>r      s   #   , ,: : : : (  ' , 4 D 47 7
 4- -
 40 0
 4- -
*  4- -`@=@=@= @= 	@=
 @= @=J A$A$A$ 
A$ 	A$H1h>0 &&& 
& 	&r;   