
    [i                     0    d Z ddlZddlmZ  G d d      Zy)z
Pure numpy implementation of encoder model for a single word.

This model is not trainable, and should only be used for inference.
    N)
NumpyArrayc            	       |    e Zd ZdZdefdZededefd       Zededede	eef   fd       Z
dedede	eef   fd	Zy
)Encoderuo  
    Encoder(768, 4, 10000)

    Will look like this:


                                         Per-word
                                         Encoder Matrix
     ┌─────────────────────┐
     │ Token Embedding(768)├──────┐      (10k, 768, 4)
     └─────────────────────┘      │         ┌─────────┐
                                  │         │         │
     ┌─────────────────────┐      │       ┌─┴───────┐ │
     │                     │      │       │         │ │
     └─────────────────────┘      │     ┌─┴───────┐ │ │      ┌─────────┐
                                  └────►│         │ │ ├─────►│Tanh     │
     ┌─────────────────────┐            │         │ │ │      └─────────┘
     │                     │            │         │ ├─┘
     └─────────────────────┘            │         ├─┘
                                        │         │
     ┌─────────────────────┐            └─────────┘
     │                     │
     └─────────────────────┘

     Final linear transformation is accompanied by a non-linear activation function: Tanh.

     Tanh is used to ensure that the output is in the range [-1, 1].
     It would be easier to visually interpret the output of the model, assuming that each dimension
     would need to encode a type of semantic cluster.
    weightsc                     || _         |j                  \  | _        | _        | _        || _        t        j                  | _        y )N)	r   shape
vocab_size	input_dim
output_dimencoder_weightsnptanh
activation)selfr   s     c/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/fastembed/sparse/utils/minicoil_encoder.py__init__zEncoder.__init__+   s6     ;B==8+2 ''    	vocab_idsreturnc                 "   | j                   \  }}t        j                  || j                        j	                  |d      }t        j
                  ||d      }t        j                  | |fd      j                  t        j                        }|S )z
        Convert vocab_ids of shape (batch_size, seq_len) into (batch_size, seq_len, 2)
        by appending batch_id alongside each vocab_id.
        dtype   )axis   )	r   r   aranger   reshaperepeatstackastypeint32)r   
batch_sizeseq_len	batch_idscombineds        r   convert_vocab_idszEncoder.convert_vocab_ids7   so     (oo
GIIj	@HHUVW	IIiq9	!xxI(>QGNNrxxXr   
embeddingsc                    |j                   d   }| j                  |      j                  dd      }|j                  d|      }t        j                  |dd      \  }}|j                   d   }t        j
                  ||ft        j                        }	t        j
                  |t        j                        }
t        j                  j                  |	||       t        j                  j                  |
|d       |	|
dddf   z  }	|j                  t        j                        |	j                  t        j                        fS )	aU  
        Takes:
            vocab_ids: (batch_size, seq_len) int array
            embeddings: (batch_size, seq_len, input_dim) float array

        Returns:
            unique_flattened_vocab_ids: (total_unique, 2) array of [vocab_id, batch_id]
            unique_flattened_embeddings: (total_unique, input_dim) averaged embeddings
        r   r   T)r   return_inverser   r   N)r   r&   r   r   uniquezerosfloat32r!   addatr    )clsr   r'   r
   flattened_vocab_idsflattened_embeddingsunique_flattened_vocab_idsinverse_indicesunique_countunique_flattened_embeddingsunique_flattened_counts              r   avg_by_vocab_idszEncoder.avg_by_vocab_idsD   s    $$Q'	 "33I>FFr1M  *11"i@ 79iia7
3"O
 277:&(hhi/HPRPZPZ&[#!#,bhh!G 			-@TU
		(/1= 	$'=ag'FF#)00:<W<^<^JJ=
 
 	
r   c                 b   | j                  ||      \  }}|dddf   j                  t        j                        }| j                  |   }t        j
                  d||      }| j                  |      j                  t        j                        }|j                  t        j                        |fS )a.  
        Args:
            vocab_ids: (batch_size, seq_len) int array
            embeddings: (batch_size, seq_len, input_dim) float array

        Returns:
            unique_flattened_vocab_ids_and_batch_ids: (total_unique, 2)
            unique_flattened_encoded: (total_unique, output_dim)
        Nr   z
bi,bio->bo)r8   r    r   r!   r   einsumr   r-   )r   r   r'   (unique_flattened_vocab_ids_and_batch_idsr6   r3   unique_encoder_weightsunique_flattened_encodeds           r   forwardzEncoder.forwardo   s     !!)Z8 	N02M
 &NaQRd%S%Z%ZHH&
"
 "&!5!56P!Q
 $&9957M$
 
 $(??3K#L#S#STVT^T^#_ 7>>rxxHJbbbr   N)__name__
__module____qualname____doc__r   r   staticmethodr&   classmethodtupler8   r>    r   r   r   r      s    >
"
" 
Z 
J 
 
 (
"(
0:(
	z:%	&(
 (
T#c##c1;#c	z:%	&#cr   r   )rB   numpyr   fastembed.common.typesr   r   rF   r   r   <module>rI      s      -Gc Gcr   