from collections import defaultdict
from typing import Iterable

import numpy as np
from py_rust_stemmers import SnowballStemmer
from tokenizers import Tokenizer
from numpy.typing import NDArray

from fastembed.common.types import NumpyArray


class VocabTokenizerBase:
    def tokenize(self, sentence: str) -> NumpyArray:
        raise NotImplementedError()

    def convert_ids_to_tokens(self, token_ids: NumpyArray) -> list[str]:
        raise NotImplementedError()


class VocabTokenizer(VocabTokenizerBase):
    def __init__(self, tokenizer: Tokenizer):
        self.tokenizer = tokenizer

    def tokenize(self, sentence: str) -> NumpyArray:
        return np.array(self.tokenizer.encode(sentence).ids)

    def convert_ids_to_tokens(self, token_ids: NumpyArray) -> list[str]:
        return [self.tokenizer.id_to_token(token_id) for token_id in token_ids]
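

# Illustrative usage (an assumption, not part of this module's API): wrap a
# HuggingFace `tokenizers.Tokenizer` for use with `VocabResolver` below.
# "bert-base-uncased" is an arbitrary example checkpoint.
#
#   tokenizer = VocabTokenizer(Tokenizer.from_pretrained("bert-base-uncased"))
#   ids = tokenizer.tokenize("Hello world")        # np.ndarray of token ids
#   tokens = tokenizer.convert_ids_to_tokens(ids)  # e.g. ["[CLS]", "hello", ...]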
defd	Zd
edee   fdZde
fdZdeddfdZdeddfdZdeddfdZdeddfdZdeddfdZedeee
ef      deeeee
   f      fd       Zd
eej4                     deeej4                     ee
e
f   eee
f   eeee   f   f   fdZy)VocabResolverr#   	stopwordsstemmerc                 X    i | _         g | _        i | _        || _        || _        || _        y r   )vocabwordsstem_mappingr#   r1   r0   )r   r#   r0   r1   s       r   r%   zVocabResolver.__init__    s-    %'
 "
,.-6#,r   r   r   c                 8    | j                   j                  |      S r   )r#   r   r   s     r   r   zVocabResolver.tokenize+   s    ~~&&x00r   word_idc                 2    |dk(  ry| j                   |dz
     S )Nr   UNK   )r4   )r   r7   s     r   lookup_wordzVocabResolver.lookup_word.   s    a<zz'A+&&r   r   c                 8    | j                   j                  |      S r   )r#   r   r   s     r   r   z#VocabResolver.convert_ids_to_tokens3   s    ~~33I>>r   c                 2    t        | j                        dz   S Nr:   )lenr3   )r   s    r   
vocab_sizezVocabResolver.vocab_size6   s    4::""r   pathNc                     t        |d      5 }| j                  D ]  }|j                  |dz           	 d d d        y # 1 sw Y   y xY w)Nw
)openr4   write)r   rA   fwords       r   
save_vocabzVocabResolver.save_vocab:   sE    $_ 	%

 %t$%	% 	% 	%s	   &=Ac                     dd l }t        |d      5 }|j                  | j                  | j                  d|d       d d d        y # 1 sw Y   y xY w)Nr   rC   )r3   r5      )indent)jsonrE   dumpr4   r5   )r   rA   rM   rG   s       r   save_json_vocabzVocabResolver.save_json_vocab?   sL    $_ 	]II

D<M<MNPQZ[I\	] 	] 	]s   ,AAc                    dd l }t        |d      5 }|j                  |      }|d   | _        t	        | j                        D ci c]  \  }}||dz    c}}| _        |d   | _        d d d        y c c}}w # 1 sw Y   y xY w)Nr   rr3   r:   r5   )rM   rE   loadr4   	enumerater3   r5   )r   rA   rM   rG   dataidxrH   s          r   load_json_vocabzVocabResolver.load_json_vocabE   s|    $_ 	599Q<DgDJ9B4::9NOIC$a-ODJ $^ 4D		5 	5 P	5 	5s   4A7A1A71A77B rH   c                    || j                   vrt        | j                         dz   | j                   |<   | j                  j                  |       | j                  j                  |      }|| j                  vr|| j                  |<   y | j                  |   }t        |      t        |      kD  r|| j                  |<   y y y r>   )r3   r?   r4   appendr1   	stem_wordr5   )r   rH   stemexisting_words       r   add_wordzVocabResolver.add_wordN   s    tzz!"4::2DJJtJJd#<<))$/D4,,,*.!!$' $ 1 1$ 7}%D	1 /3D%%d+ 2 "r   c                     t        |d      5 }|D ]!  }| j                  |j                                # 	 d d d        y # 1 sw Y   y xY w)NrQ   )rE   r\   strip)r   rA   rG   lines       r   
load_vocabzVocabResolver.load_vocab\   sC    $_ 	, ,djjl+,	, 	, 	,s	   '>A
    @classmethod
    def _reconstruct_bpe(
        cls, bpe_tokens: Iterable[tuple[int, str]]
    ) -> list[tuple[str, list[int]]]:
        result: list[tuple[str, list[int]]] = []
        acc: str = ""
        acc_idx: list[int] = []

        continuing_subword_prefix = "##"
        continuing_subword_prefix_len = len(continuing_subword_prefix)

        for idx, token in bpe_tokens:
            if token.startswith(continuing_subword_prefix):
                # Continuation piece: strip "##" and extend the current word
                acc += token[continuing_subword_prefix_len:]
                acc_idx.append(idx)
            else:
                if acc:
                    result.append((acc, acc_idx))
                    acc_idx = []
                acc = token
                acc_idx.append(idx)

        if acc:
            result.append((acc, acc_idx))

        return result
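
    # Worked example for `_reconstruct_bpe` above: WordPiece "##" continuation
    # pieces are merged back into whole words, keeping the token positions
    # they were built from:
    #
    #   VocabResolver._reconstruct_bpe(enumerate(["hel", "##lo", "world"]))
    #   -> [("hello", [0, 1]), ("world", [2])]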

    def resolve_tokens(
        self, token_ids: NDArray[np.int64]
    ) -> tuple[
        NDArray[np.int64], dict[int, int], dict[str, int], dict[str, list[str]]
    ]:
        """
        Mark known tokens (including composed tokens) with vocab ids.

        Args:
            token_ids: (seq_len) - list of ids of tokens
                Example:
                    [
                        101,  3897, 19332, 12718, 23348,
                        1010,  1996,  7151,  2296, 4845,
                        2359,  2005,  4234,  1010,  4332,
                        2871,  3191,  2062, 102
                    ]

        Returns:
                - token_ids with vocab ids
                    [
                        0,  151, 151, 0, 0,
                        912,  0,  0,  0, 332,
                        332,  332,  0,  7121,  191,
                        0,  0,  332, 0
                    ]
                - counts of each token
                    {
                        151: 1,
                        332: 3,
                        7121: 1,
                        191: 1,
                        912: 1
                    }
                - oov counts of each token
                    {
                        "the": 1,
                        "a": 1,
                        "[CLS]": 1,
                        "[SEP]": 1,
                        ...
                    }
                - forms of each token
                    {
                        "hello": ["hello"],
                        "world": ["worlds", "world", "worlding"],
                    }

        """
        tokens = self.convert_ids_to_tokens(token_ids)
        tokens_mapping = self._reconstruct_bpe(enumerate(tokens))

        counts: dict[int, int] = defaultdict(int)
        oov_count: dict[str, int] = defaultdict(int)
        forms: dict[str, list[str]] = defaultdict(list)

        for token, mapped_token_ids in tokens_mapping:
            vocab_id = 0
            if token in self.stopwords:
                vocab_id = 0
            elif token in self.vocab:
                # Exact match with a known word
                vocab_id = self.vocab[token]
                forms[token].append(token)
            elif token in self.stem_mapping:
                # The token is itself a known stem
                vocab_id = self.vocab[self.stem_mapping[token]]
                forms[self.stem_mapping[token]].append(token)
            else:
                # Fall back to stemming the token
                stem = self.stemmer.stem_word(token)
                if stem in self.stem_mapping:
                    vocab_id = self.vocab[self.stem_mapping[stem]]
                    forms[self.stem_mapping[stem]].append(token)

            for token_id in mapped_token_ids:
                token_ids[token_id] = vocab_id

            if vocab_id == 0:
                oov_count[token] += 1
            else:
                counts[vocab_id] += 1

        return token_ids, counts, oov_count, forms