
    [i$                         d dl mZmZmZ d dlmZ d dlmZmZ d dl	Z	d dl
Z
d dlmZ d dlZd dlmZ dZdZe G d	 d
             Z G d d      Zy)    )DictListSet)SnowballStemmer)get_all_punctuationremove_non_alphanumericN)	dataclass)SparseEmbeddingi }  ic                   L    e Zd ZU eed<   ee   ed<   eed<   eed<   ee   ed<   y)WordEmbeddingwordformscountword_id	embeddingN)__name__
__module____qualname__str__annotations__r   intfloat     k/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/fastembed/sparse/utils/sparse_vectors_converter.pyr   r      s%    
I9JLE{r   r   c                       e Zd Z	 	 	 ddee   dedededef
dZedede	d	e	fd
       Z
de	de	d	efdZedee   d	ee   fd       Z	 ddeeef   de	d	eeef   fdZdeeef   de	de	d	efdZdeeef   de	de	d	efdZy)SparseVectorConverter	stopwordsstemmerkbavg_lenc                     t        t                     }h d}|| _        ||z  |z  | _        || _        || _        || _        y )N>   [CLS][PAD][SEP][UNK][MASK])setr   r   unwanted_tokensr    r!   r"   )selfr   r   r    r!   r"   punctuationspecial_tokenss           r   __init__zSparseVectorConverter.__init__   sE     -/0G*^;iGr   r   shiftreturnc                 d    t        t        j                  |            }t        |z
  }|||z  z   }|S )N)absmmh3hash	INT32_MAX)clsr   r/   
token_hash
range_sizeremapped_hashs         r   unkn_word_token_idz(SparseVectorConverter.unkn_word_token_id,   s5     4)
&
j!89r   num_occurrencessentence_lenc                     || j                   dz   z  }||| j                   d| j                  z
  | j                  |z  | j                  z  z   z  z   z  }|S )N   )r    r!   r"   )r+   r;   r<   ress       r   bm25_tfzSparseVectorConverter.bm25_tf7   sS    !,1tvv:8MPTP\P\8\+\!]]]
r   vectorc                     t        |D cg c]  }|dz  	 c}      dz  }|dk  r|S |D cg c]  }||z  	 c}S c c}w c c}w )N   g      ?g:0yE>)sum)r6   rA   xnorms       r   normalize_vectorz&SparseVectorConverter.normalize_vector<   sK    &)QAqD)*c1$;M"()QD)) * *s   :?sentence_embeddingtoken_max_lengthc                 H   i }|j                         D ]  \  }}|j                  dkD  r|||<   || j                  v r+t        |      j	                         }t        |      dkD  sS|j                         D ]  }| j                  j                  |      }t        |      |k  s-|| j                  vs<||vr#t        j                  |      ||<   |||   _        c||   xj                  |j                  z  c_        ||   xj                  |j                  z  c_          |S )u  
        Clean miniCOIL-produced sentence_embedding, as unknown to the miniCOIL's stemmer tokens should fully resemble
        our BM25 token representation.

        sentence_embedding = {"9°": {"word": "9°", "word_id": -1, "count": 2, "embedding": [1], "forms": ["9°"]},
                "9": {"word": "9", "word_id": -1, "count": 2, "embedding": [1], "forms": ["9"]},
                "bat": {"word": "bat", "word_id": 2, "count": 3, "embedding": [0.2, 0.1, -0.2, -0.2], "forms": ["bats", "bat"]},
                "9°9": {"word": "9°9", "word_id": -1, "count": 1, "embedding": [1], "forms": ["9°9"]},
                "screech": {"word": "screech", "word_id": -1, "count": 1, "embedding": [1], "forms": ["screech"]},
                "screeched": {"word": "screeched", "word_id": -1, "count": 1, "embedding": [1], "forms": ["screeched"]}
                }
        cleaned_embedding_ground_truth = {
                "9": {"word": "9", "word_id": -1, "count": 6, "embedding": [1], "forms": ["9°", "9", "9°9", "9°9"]},
                "bat": {"word": "bat", "word_id": 2, "count": 3, "embedding": [0.2, 0.1, -0.2, -0.2], "forms": ["bats", "bat"]},
                "screech": {"word": "screech", "word_id": -1, "count": 2, "embedding": [1], "forms": ["screech", "screeched"]}
                }
        r   )itemsr   r*   r   striplensplitr   	stem_wordcopydeepcopyr   r   r   )	r+   rH   rI   new_sentence_embeddingr   r   word_cleanedsubwordstemmed_subwords	            r   clean_wordsz!SparseVectorConverter.clean_wordsC   s3   * <>1779 "	aOD)   1$/8&t, 4///  7t<BBD |$q(#/#5#5#7 a/3||/E/Eg/N04DD /t7K7K K.6LLJN--XaJb 6 GO^ 6 G L 6 G M MQZQ`Q` ` M 6 G M MQZQ`Q` ` Ma/"	aH &%r   embedding_size
vocab_sizec                    g }g }||z  t         z  dz   t         z  }| j                  |      }d}|j                         D ]  }	||	j                  z  } |j                         D ]  }	|	j                  }
|	j                  }| j                  ||      }|
dkD  r\|	j                  }| j                  |      }t        |      D ]0  \  }}|j                  |
|z  |z          |j                  ||z         2 |j                  | j                  |	j                  |             |j                  |        t        t        j                  |t        j                        t        j                  |t        j                               S )a  
        Convert miniCOIL sentence embedding to Qdrant sparse vector

        Example input:

        ```
        {
            "vector": WordEmbedding({ // Vocabulary word, encoded with miniCOIL normally
                "word": "vector",
                "forms": ["vector", "vectors"],
                "count": 2,
                "word_id": 1231,
                "embedding": [0.1, 0.2, 0.3, 0.4]
            }),
            "axiotic": WordEmbedding({ // Out-of-vocabulary word, fallback to BM25
                "word": "axiotic",
                "forms": ["axiotics"],
                "count": 1,
                "word_id": -1,
            })
        }
        ```

        rC   r   dtypeindicesvalues)GAPrV   r^   r   r   r@   r   rG   	enumerateappendr:   r   r
   nparrayint32float32)r+   rH   rW   rX   r]   r^   unknown_words_shiftsentence_embedding_cleanedr<   r   r   r;   tfembedding_valuesnormalized_embeddingval_idvalues                    r   embedding_to_vectorz)SparseVectorConverter.embedding_to_vector   sv   >    .(S014 &*%5%56H%I" 3::< 	,IIOO+L	, 4::< 	"I''G'ooOo|<B! $-#6#6 '+'<'<=M'N$%./C%D .MFENN.069 MM%"*-	. t66y~~GZ[\b!%	"( HHWBHH588F"**5
 	
r   c                 ~   g }g }||z  t         z  dz   t         z  }| j                  |      }|j                         D ]  }|j                  }	d}
|	dk\  r\|j                  }| j                  |      }t        |      D ]0  \  }}|j                  |	|z  |z          |j                  ||
z         2 r|j                  | j                  |j                  |             |j                  |
        t        t        j                  |t        j                        t        j                  |t        j                              S )z:
        Same as `embedding_to_vector`, but no TF
        rC   g      ?r   rZ   r\   )r_   rV   r^   r   r   rG   r`   ra   r:   r   r
   rb   rc   rd   re   )r+   rH   rW   rX   r]   r^   rf   rg   r   r   rh   ri   rj   rk   rl   s                  r   embedding_to_vector_queryz/SparseVectorConverter.embedding_to_vector_query   s,       !+^ ;CaG3N%)%5%56H%I"3::< 	"I''GB!|#,#6#6 '+'<'<=M'N$%./C%D .MFENN.069 MM%"*-	. t66y~~GZ[\b!	"" HHWBHH588F"**5
 	
r   N)g333333?g      ?g     b@)(   )r   r   r   r   r   r   r   r.   classmethodr   r:   r@   r   rG   r   r   rV   r
   rm   ro   r   r   r   r   r      s_   
 s8 ! 	
  $ "	 s # % 
 *d5k *d5k * * UW;&"&sM'9":;&NQ;&	c= 	!;&zO
 m!34O
 O
 	O

 
O
b&
 m!34&
 &
 	&

 
&
r   r   )typingr   r   r   py_rust_stemmersr   fastembed.common.utilsr   r   r3   rP   dataclassesr	   numpyrb   &fastembed.sparse.sparse_embedding_baser
   r_   r5   r   r   r   r   r   <module>rx      sO    " " , O   !  B	   ^
 ^
r   