
    [i                     4    d dl Z  G d d      Z G d d      Zy)    Nc                   ,    e Zd Zededee   fd       Zy)SimpleTokenizertextreturnc                     t        j                  dd| j                               } t        j                  dd|       } | j                         j	                         S )Nz[^\w] z\s+)resublowerstripsplit)r   s    \/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/fastembed/sparse/utils/tokenizer.pytokenizezSimpleTokenizer.tokenize   sA    vvhTZZ\2vvfc4(zz|!!##    N)__name__
__module____qualname__staticmethodstrlistr    r   r   r   r      s&    $s $tCy $ $r   r   c                      e Zd ZdZ ej
                  dej                        df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d	ej                        d
fgZ ej
                  dej                        df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      dfgZ ej
                  dej                        df ej
                  d      df ej
                  d      df ej
                  dej                        df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  dej                        dfg	Z	 ej
                  d       dfZ
 ej
                  d!      d"fZd#D  cg c]  }t	        j
                  |       c}}} Zd$D  cg c]  }t	        j
                  |       c}}} Zed%ed&ee   fd'       Zy(c c}}} w c c}}} w ))WordTokenizerzThe tokenizer is "destructive" such that the regexes applied will munge the
    input string to a state beyond re-construction.
    u   ([«“‘„]|[`]+)z \1 z^\"z``z(``)z([ \(\[{<])(\"|\'{2})z\1 `` z$(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\bz\1 \2u   ([»”’])z''z '' "z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) u&   ([^\.])(\.)([\]\)}>"\'»”’ ]*)\s*$z	\1 \2 \3 z([:,])([^\d])z \1 \2z([:,])$z\.{2,}z \g<0> z[;@#$%&]z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[*]z[\]\[\(\)\{\}\<\>]z--z -- )z(?i)\b(can)(?#X)(not)\bz(?i)\b(d)(?#X)('ye)\bz(?i)\b(gim)(?#X)(me)\bz(?i)\b(gon)(?#X)(na)\bz(?i)\b(got)(?#X)(ta)\bz(?i)\b(lem)(?#X)(me)\bz(?i)\b(more)(?#X)('n)\bz(?i)\b(wan)(?#X)(na)(?=\s))z(?i) ('t)(?#X)(is)\bz(?i) ('t)(?#X)(was)\br   r   c                 &   | j                   D ]  \  }}|j                  ||      } | j                  D ]  \  }}|j                  ||      } | j                  \  }}|j                  ||      }| j                  \  }}|j                  ||      }d|z   dz   }| j
                  D ]  \  }}|j                  ||      } | j                  D ]  }|j                  d|      } | j                  D ]  }|j                  d|      } |j                         S )a  Return a tokenized copy of `text`.

        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)
in New York.'''
        >>> WordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36', 'euros', ')', 'in', 'New', 'York', '.']

        Args:
            text: The text to be tokenized.

        Returns:
            A list of tokens.
        r   z \1 \2 )	STARTING_QUOTESr
   PUNCTUATIONPARENS_BRACKETSDOUBLE_DASHESENDING_QUOTESCONTRACTIONS2CONTRACTIONS3r   )clsr   regexpsubstitutions       r   r   zWordTokenizer.tokenizeR   s.    %($7$7 	2 FL::lD1D	2 %(OO 	2 FL::lD1D	2  #22zz,-  #00zz,- TzC$'$5$5 	2 FL::lD1D	2 '' 	0F::j$/D	0'' 	0F::j$/D	0zz|r   N)r   r   r   __doc__r	   compileUr   r    r   r   r   r!   r"   classmethodr   r   r   ).0patternr	   s   000r   r   r      so    
*BDD	17;	F	U#	G	g&	,	-y9	;RTT	BHMO 
NBDD	)73	E	F#	D	6"	4	5yA	@	A9MM 
Dbdd	K\Z	$	%y1	J	)BJJy"$$'	
 
K	 *-BJJ78	
 
G	j)	K	 (+BJJvrtt$	
K, "rzz"78*EORZZ&0M
	
  	

7M ,_  '

7M %C %DI % %%s   &H?Ir   )r	   r   r   r   r   r   <module>r,      s     
$ $i ir   