
    [i~:                        U d dl mZ d dlmZmZmZmZmZmZ d dl	Z
d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$ d dl%m&Z&m'Z' dZ(dZ)dZ* eddddd ed      de*e(e)gd	      gZ+e,e   e-d<   ddiZ.e.j_                         D  ci c]  \  } }| ja                         | c}} Z1de2de2fd Z3 G d! d"ee&e         Z4 G d# d$e'e         Z5yc c}} w )%    )Path)AnyOptionalSequenceIterableUnionTypeN)NDArray)SnowballStemmer)	Tokenizer)SparseModelDescriptionModelSource)OnnxOutputContext)OnnxProvider)define_cache_dir)SparseEmbeddingSparseTextEmbeddingBase)Encoder)SparseVectorConverterWordEmbedding)VocabResolverVocabTokenizer)OnnxTextModelTextEmbeddingWorkerzminicoil.triplet.model.npyzminicoil.triplet.model.vocabzstopwords.txtzQdrant/minicoil-v1iJ  zSparse embedding model, that resolves semantic meaning of the words, while keeping exact keyword match behavior. Based on jinaai/jina-embeddings-v2-small-en-tokensz
apache-2.0g
ףp=
?)hfzonnx/model.onnxT)	model
vocab_sizedescriptionlicense
size_in_GBsources
model_fileadditional_filesrequires_idfsupported_minicoil_modelsenglish
model_namereturnc                 0    t         | j                            S N)MODEL_TO_LANGUAGElower)r'   s    U/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/fastembed/sparse/minicoil.pyget_language_by_model_namer.   9   s    Z--/00    c                       e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 d"dedee   dee   deee      de	de	de	d	e
d
eee      de
dee   dee   def fdZd#dZ	 d$deeee   f   dededefdZ	 	 d%deeee   f   dedee   dedee   f
dZdeeee   f   dedee   fdZededee   fd       Zedee   fd       Z	 d&dede
dedee   fdZeded    fd!       Z xZS )'MiniCOILa  
        MiniCOIL is a sparse embedding model, that resolves semantic meaning of the words,
        while keeping exact keyword match behavior.

        Each vocabulary token is converted into 4d component of a sparse vector, which is then weighted by the token frequency in the corpus.
        If the token is not found in the corpus, it is treated exactly like in BM25.
    `
        The model is based on `jinaai/jina-embeddings-v2-small-en-tokens`
    r'   	cache_dirthreads	providerskbavg_lencuda
device_ids	lazy_load	device_idspecific_model_pathkwargsc                    t        |   |||fi | || _        |
| _        |	| _        || _        || _        | j                  |      | _        || _	        || _
        || _        d| _        i | _        t               | _        t               | _        t               | _        d| _        d| _        d| _        d| _        | j-                  |      | _        t1        t3        |            | _        || _        | j9                  | j.                  | j4                  | j:                  | j6                        | _        | j                  s| j?                          yy)a  
        Args:
            model_name (str): The name of the model to use.
            cache_dir (str, optional): The path to the cache directory.
                                       Can be set using the `FASTEMBED_CACHE_PATH` env variable.
                                       Defaults to `fastembed_cache` in the system's temp directory.
            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
            providers (Optional[Sequence[OnnxProvider]], optional): The providers to use for onnxruntime.
            k (float, optional): The k parameter in the BM25 formula. Defines the saturation of the term frequency.
                I.e. defines how fast the moment when additional terms stop to increase the score. Defaults to 1.2.
            b (float, optional): The b parameter in the BM25 formula. Defines the importance of the document length.
                Defaults to 0.75.
            avg_len (float, optional): The average length of the documents in the corpus. Defaults to 150.0.
            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
                Defaults to False.
            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else

        Raises:
            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
        N)local_files_onlyr<   ) super__init__r4   r:   r9   r8   r;   _select_exposed_session_options_extra_session_optionsr5   r6   r7   	tokenizerinvert_vocabsetspecial_tokensspecial_tokens_ids	stopwordsvocab_resolverencoder
output_dimsparse_vector_converter_get_model_descriptionmodel_descriptionstrr   r2   _specific_model_pathdownload_model_local_files_only
_model_dirload_onnx_model)selfr'   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   	__class__s                 r-   rA   zMiniCOIL.__init__H   s3   T 	YB6B""$	"&*&J&J6&R# /3,.(+,/E#&57;*.)-HL$!%!<!<Z!H-i89$7!--""NN!33 $ 9 9	 . 
 ~~  " r/   r(   c           	         | j                  | j                  | j                  j                  | j                  | j
                  | j                  | j                  | j                         | j                  J | j                  j                         j                         D ]  \  }}|| j                  |<    t        | j                  j                               | _        t        | j                  j#                               | _        t        | j'                  | j                              | _        t+        t-        | j.                              }t1        t3        | j                        | j(                  |      | _        | j4                  j7                  t9        | j                  t:        z               t=        j>                  t9        | j                  t@        z        d      }tC        |      | _"        | jD                  jF                  | _#        tI        | j(                  || jJ                  | jL                  | jN                        | _(        y )N)	model_dirr"   r3   r4   r8   r;   extra_session_options)rD   rI   stemmerr)	mmap_mode)rI   r[   r5   r6   r7   ))_load_onnx_modelrT   rO   r"   r3   r4   r8   r;   rC   rD   	get_vocabitemsrE   rF   special_token_to_idkeysrG   valuesrH   _load_stopwordsrI   r   r.   r'   r   r   rJ   load_json_vocabrP   MINICOIL_VOCAB_FILEnploadMINICOIL_MODEL_FILEr   rK   rL   r   r5   r6   r7   rM   )rV   tokenidxr[   weightss        r-   rU   zMiniCOIL.load_onnx_model   s   oo--88LLnnnn"&"="= 	 	
 ~~)))..224::< 	+JE3%*Dc"	+!$":":"?"?"AB"%d&>&>&E&E&G"HT11$//BC!"<T__"MN+$T^^4nn

 	++CBU0U,VW''#doo0CCDPSTw',,11'<nnffffLL(
$r/   texts
batch_sizec                 ,     | j                   |fd|i|S )Nrn   )_token_count)rV   rm   rn   r=   s       r-   token_countzMiniCOIL.token_count   s!     !t  H:HHHr/   	documentsparallelc              +   R  K    | j                   d| j                  t        | j                        |||| j                  | j
                  | j                  | j                  | j                  | j                  d| j                  | j                  | j                  d|E d{    y7 w)a  
        Encode a list of documents into list of embeddings.
        We use mean pooling with attention so that the model can handle variable-length inputs.

        Args:
            documents: Iterator of documents or single document to embed
            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
            parallel:
                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
                If 0, use all available cores.
                If None, don't use data-parallel processing, use default onnxruntime threading instead.

        Returns:
            List of embeddings, one per document
        F)r'   r2   rr   rn   rs   r4   r8   r9   r5   r6   r7   is_queryr?   r<   rZ   N )_embed_documentsr'   rP   r2   r4   r8   r9   r5   r6   r7   rS   rQ   rC   )rV   rr   rn   rs   r=   s        r-   embedzMiniCOIL.embed   s     , )4(( 
$..)!nnffffLL!33 $ 9 9"&"="=
  !
 	
 	
s   BB'B% B'queryc              +   8  K    | j                   d| j                  t        | j                        || j                  | j
                  | j                  | j                  | j                  | j                  d| j                  | j                  d|E d{    y7 w)zC
        Encode a list of queries into list of embeddings.
        T)r'   r2   rr   r4   r8   r9   r5   r6   r7   ru   r?   r<   Nrv   )rw   r'   rP   r2   r4   r8   r9   r5   r6   r7   rS   rQ   )rV   ry   r=   s      r-   query_embedzMiniCOIL.query_embed   s      )4(( 
$..)nnffffLL!33 $ 9 9
 
 	
 	
s   BBBBrY   c                     |t         z  }|j                         sg S t        |d      5 }|j                         j	                         cd d d        S # 1 sw Y   y xY w)Nr\   )STOPWORDS_FILEexistsopenread
splitlines)clsrY   stopwords_pathfs       r-   rd   zMiniCOIL._load_stopwords  sP    "^3$$&I.#& 	)!668&&(	) 	) 	)s   AAc                     t         S )zLists the supported models.

        Returns:
            list[SparseModelDescription]: A list of SparseModelDescription objects containing the model information.
        )r%   r   s    r-   _list_supported_modelszMiniCOIL._list_supported_models  s
     )(r/   outputru   c           
   +     K   |j                   t        d      | j                  J | j                  J | j                  J |j
                  }|j                  J |j                  }| j                  j                         }| j                  j                  }t        |j                  d         D ]  }||||   dk(  f   }	|j                   |||   dk(  f   }
| j                  j                  |
      \  }}}}t        j                  |d      }t        j                  |	d      }|j                  d   |j                  d   k(  sJ | j                  j                  ||      \  }}|d d df   j                         }i }|D cg c]  }| j                  j!                  |       }}t#        |||j                               D ]7  \  }}}|dk(  rt%        |||   t'        ||         t'        |      |      ||<   9 |j)                         D ]#  \  }}t%        ||gt'        |      ddg      ||<   % |s"| j                  j+                  |||       | j                  j-                  |||        y c c}w w)Nz7input_ids must be provided for document post-processingr      )axis)wordformscountword_id	embedding)r   embedding_size)	input_ids
ValueErrorrJ   rK   rM   model_outputattention_maskr   rL   rangeshaperesolve_tokensrg   expand_dimsforwardtolistlookup_wordzipr   intr`   embedding_to_vectorembedding_to_vector_query)rV   r   ru   r=   
embeddingsmasksr   r   itoken_embeddings	token_idsword_ids_arraycountsoovr   word_ids_array_expandedtoken_embeddings_arrayids_mappingminicoil_embeddings	words_idssentence_resultr   wordsr   emboov_wordr   s                              r-   _post_process_onnx_outputz"MiniCOIL._post_process_onnx_output  s     #VWW""...||'''++777 ((
$$000%%((335
00 z''*+ =	A)!U1X]*:; ,2+;+;AuQx1}<L+MI151D1D1S1ST]1^.NFC :<]^9_# ;=..IY`a:b"*0037M7S7STU7VVVV 04||/C/C')?0,K,
 $/q!t#4#;#;#=I8:OMVW'T((44W=WEW&)%<O<V<V<X&Y 
"gsa<(5+fWo.L!)%	
 $'99; 
% -:!(3u:r^_]`-)
 22FF#
> G   22LL#
> M  w=	8 Xs   FJ"I<'CJMiniCoilTextEmbeddingWorkerc                     t         S r*   )r   r   s    r-   _get_worker_classzMiniCOIL._get_worker_classh  s    **r/   )NNNg333333?g      ?g     b@FNFNN)r(   N)i   )   N)F)__name__
__module____qualname____doc__rP   r   r   r   r   floatboollistr   rA   rU   r   r   rq   r   rx   r{   classmethodr   rd   r   r   r   r   r	   r   __classcell__)rW   s   @r-   r1   r1   =   sJ    $(!%6:*.#'-1L#L# C=L# #	L#
 H\23L# L# L# L# L# T#Y'L# L# C=L# &c]L# L#\&
R CGI3-.I<?IRUI	I "&	'
hsm+,'
 '
 3-	'

 '
 
/	"'
R
3-.
:=
	/	"
, ) )c ) ) )t,B'C ) ) ;@R'R37RKNR	/	"Rh +$'D"E + +r/   r1   c                   $    e Zd ZdedededefdZy)r   r'   r2   r=   r(   c                      t        d||dd|S )Nr   )r'   r2   r3   rv   )r1   )rV   r'   r2   r=   s       r-   init_embeddingz*MiniCoilTextEmbeddingWorker.init_embeddingn  s'     
!
 	
 	
r/   N)r   r   r   rP   r   r1   r   rv   r/   r-   r   r   m  s$    
 
 
 
PX 
r/   r   )6pathlibr   typingr   r   r   r   r   r	   numpyrg   numpy.typingr
   py_rust_stemmersr   
tokenizersr   "fastembed.common.model_descriptionr   r   fastembed.common.onnx_modelr   fastembed.commonr   fastembed.common.utilsr   &fastembed.sparse.sparse_embedding_baser   r   'fastembed.sparse.utils.minicoil_encoderr   /fastembed.sparse.utils.sparse_vectors_converterr   r   %fastembed.sparse.utils.vocab_resolverr   r   fastembed.text.onnx_text_modelr   r   ri   rf   r}   r%   r   __annotations___MODEL_TO_LANGUAGEr`   r,   r+   rP   r.   r1   r   )r'   languages   00r-   <module>r      s"    A A    ,   R 9 ) 3 < ` O M 3 4   "= 34$

 ; 4 67 * )  >P=U=U=W%9ZJ  
13 13 1m+&o(F m+`	
"5o"F 
s	s   /C4