
    0i(U                        d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$  ejJ                  e&      Z'	 	 d+dejP                  dejR                  dejR                  dejR                  dejR                  dz  de*dz  de*fdZ+ G d dejP                        Z, G d de      Z-e G d d e             Z. ed!"       G d# d$e.             Z/ G d% d&ejP                        Z0 ed'"       G d( d)e.e
             Z1g d*Z2y),    N)Callable)nn   )ACT2FN)Cache)GenerationMixin)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM   )VoxtralConfigVoxtralEncoderConfigmodulequerykeyvalueattention_maskscalingdropoutc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )N      r   r   )dimptrainingr   )
sizetorchmatmul	transposer   
functionalsoftmaxr"   r)   
contiguous)
r   r   r   r   r    r!   r"   kwargsattn_weightsattn_outputs
             h/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/voxtral/modeling_voxtral.pyeager_attention_forwardr5   -   s     **R.D(<<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$    c                   ,    e Zd ZdZ	 	 	 	 	 	 ddedededededed	edz  d
edz  f fdZde	j                  dedefdZ	 	 dde	j                  de	j                  dz  dedee	j                  e	j                  dz  ee	j                     dz  f   fdZ xZS )VoxtralAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsr"   
is_decoderbias	is_causal	layer_idxconfigc	                 z   t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        |/|r-t        j                  d| j                  j                   d       || _        t!        j"                  ||d      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r%   zInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr<   )super__init__r9   r:   r"   head_dimr?   
ValueErrorr!   r;   r=   loggerwarning_once	__class____name__r>   r   Lineark_projv_projq_projout_proj)
selfr9   r:   r"   r;   r<   r=   r>   r?   rH   s
            r4   rC   zVoxtralAttention.__init__J   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"*4>>+B+B*C D, ,
 #ii	95Aii	94@ii	94@		)YTBr6   tensorseq_lenbszc                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   r   )viewr:   rD   r-   r0   )rO   rP   rQ   rR   s       r4   _shapezVoxtralAttention._shaper   s7    {{3GQQRSUVWbbddr6   hidden_statesr    output_attentionsreturnc                 0   |j                         \  }}}| j                  | j                  |      | j                  z  ||      }| j                  | j	                  |      d|      }	| j                  | j                  |      d|      }
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                  d|d|\  }}|j                  ||d      j                         }| j                  |      }||fS )z#Input shape: Batch x Time x Channelr$                 ?)r"   r!   rW   )r*   rU   rM   r!   rK   rL   r   get_interfacer?   _attn_implementationr5   r)   r"   reshaper0   rN   )rO   rV   r    rW   r1   rR   tgt_len_query_states
key_statesvalue_statesattention_interfacer3   r2   s                 r4   forwardzVoxtralAttention.forwardu   s    (,,.Wa {{4;;}#=#LgWZ[[[]!;RE
{{4;;}#=r3G(?(M(MKK,,.E)
 %8
%
  $}}C$,,/
%
 
%
!\ "))#w;FFHmmK0L((r6   )rZ   FTFNNNF)rI   
__module____qualname____doc__intfloatboolr   rC   r+   TensorrU   tuplere   __classcell__rH   s   @r4   r8   r8   G   s	   G   $'+&C&C &C 	&C
 &C &C &C :&C $&CPeU\\ eC ec e /3"'	')||') t+')  	') 
u||U\\D0%2E2LL	M')r6   r8   c            	       |     e Zd Zdef fdZ	 ddej                  dej                  dedej                  fdZ xZ	S )	VoxtralEncoderLayerr?   c                 h   t         |           |j                  | _        t	        | j                  |j
                  |j                  |      | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)r9   r:   r"   r?   )rB   rC   d_modelr9   r8   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normr"   r   activation_functionactivation_fnactivation_dropoutrJ   encoder_ffn_dimfc1fc2final_layer_normrO   r?   rH   s     r4   rC   zVoxtralEncoderLayer.__init__   s    )nn44,,	
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r6   rV   r    rW   rX   c                    |}| j                  |      }| j                  |||      \  }}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  | j                  |            }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }||z   }|j                  t        j                  k(  rEt        j                  |j                        j                  dz
  }t        j                   || |      }||fS )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rV   r    rW   r'   i  )minmax)ry   rw   r   r.   r"   r)   r   r{   r~   r|   r   dtyper+   float16finfor   clamp)rO   rV   r    rW   residualr2   clamp_values          r4   re   zVoxtralEncoderLayer.forward   sP    !11-@&*nn')/ '5 '
#|
 --mt||VZVcVc-d =0 --m<**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0%--/++m&9&9:>>EK!KKK<[YMl**r6   )F)
rI   rg   rh   r   rC   r+   rm   rl   re   ro   rp   s   @r4   rr   rr      sK    =} =, #(	%+||%+ %+  	%+
 
%+r6   rr   c                   D    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdZdZdZy)VoxtralPreTrainedModelr?   model)audiotextTNpast_key_values)rI   rg   rh   r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraph r6   r4   r   r      sI    (&*#"3N "&!r6   r   z:
    The Voxtral encoder, which is a Whisper encoder.
    custom_introc                        e Zd ZU dZeed<   dZdZdgZe	e
dZdef fdZd Zd	ej                  fd
Zdej                  fdZee	 ddee   d	eez  fd              Zdej4                  fdZ xZS )VoxtralEncoderz
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`VoxtralEncoderLayer`].

    Args:
        config: VoxtralEncoderConfig
    r?   input_featuresr   rr   )
attentionsrV   c                 b   t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _        |j                  rt        j                  |      nd| _        t        j                  | j                  |dd      | _        t        j                  ||ddd      | _        t        j                   | j                  |      | _        | j"                  j%                  d       t        j&                  t)        |j*                        D cg c]  }t-        |       c}      | _        t        j0                  |j
                        | _        t        j4                  dd      | _        d| _        | j;                          y c c}w )	Nr[   r   r   )kernel_sizepaddingr   )r   strider   F)r   )rB   rC   r"   encoder_layerdrop	layerdroprt   num_mel_binsmax_source_positionsscale_embeddingmathsqrtembed_scaler   Conv1dconv1conv2	Embeddingembed_positionsrequires_grad_
ModuleListrangeencoder_layersrr   layersrx   
layer_norm	AvgPool1d
avg_poolergradient_checkpointing	post_init)rO   r?   r9   r`   rH   s       r4   rC   zVoxtralEncoder.__init__  s6    ~~11NN	"//$*$?$?!393I3I499Y/sYYt00)TUV
YYy)1VWX
!||D,E,EyQ++E2mm%PVPePeJf$gQ%8%@$gh,,v~~6,,q3&+# %hs   6F,c                 J    | j                         D ]	  }d|_         d| _        y rf   )
parametersrequires_grad_requires_grad)rO   params     r4   _freeze_parametersz!VoxtralEncoder._freeze_parameters  s(    __& 	(E"'E	(#r6   rX   c                     | j                   S Nr   rO   s    r4   get_input_embeddingsz#VoxtralEncoder.get_input_embeddings   s    zzr6   r   c                     || _         y r   r   rO   r   s     r4   set_input_embeddingsz#VoxtralEncoder.set_input_embeddings#  s	    
r6   r1   c           	         | j                   j                  | j                  j                  d   z  | j                  j                  d   z  }|j
                  d   |k7  r"t        d| d|j
                  d    d| d      |j                  | j                  j                  j                  | j                  j                  j                        }t        j                  j                  | j                  |            }t        j                  j                  | j	                  |            }|j                  ddd	      }| j                  j                  }||z   j                  |j                        }t        j                  j!                  || j                   | j"                  
      }t%        | j&                        D ]  \  }}	 |	||      }
|
d   } | j)                  |      }t+        |      S )a  
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        r   r$   z7Voxtral expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to .)r   devicer   r   r'   )r    )last_hidden_state)r?   r   r   r   r   shaperE   toweightr   r   r   r.   gelupermuter   r"   r)   	enumerater   r   r   )rO   r   r    r1   expected_seq_lengthinputs_embeds	embed_posrV   idxencoder_layerlayer_outputss              r4   re   zVoxtralEncoder.forward&  s   ( #kk>>ARARSTAUUX\XbXbXiXijkXll#'::IJ]I^^jkykk  AC  lD  kE  Er  sF  rG  GH  I  (**1B1B1H1HQUQ[Q[QbQbQiQi*j**4::n+EF**4::m+DE%--aA6((//	&266}7J7JK--mt||VZVcVc-d"+DKK"8 	-C)-M *!,M	- 6)+
 	
r6   input_lengthsc                 6    |dz
  dz  dz   }|dz
  dz  dz   }||fS )zs
        Computes the output length of the convolutional layers and the output length of the audio encoder
        r   r   r   )rO   r   output_lengthss      r4    _get_feat_extract_output_lengthsz/VoxtralEncoder._get_feat_extract_output_lengthsW  s7     '*q014'!+1A5n,,r6   r   )rI   rg   rh   ri   r   r   main_input_namer   r   r8   rr   _can_record_outputsrC   r   r   Moduler   r   r   r   r   r   rn   r   re   r+   
LongTensorr   ro   rp   s   @r4   r   r      s     ! &O./&,
3 2$
bii "))    ,
 +,	,

 
+	+,
   ,
^-e>N>N -r6   r   c                   *     e Zd Zdef fdZd Z xZS )VoxtralMultiModalProjectorr?   c                 f   t         |           t        j                  |j                  j
                  |j                  j                  d      | _        t        |j                     | _        t        j                  |j                  j                  |j                  j                  d      | _        y )NFrA   )rB   rC   r   rJ   audio_configintermediate_sizetext_confighidden_sizelinear_1r   projector_hidden_actactlinear_2r   s     r4   rC   z#VoxtralMultiModalProjector.__init__a  sz    		&"5"5"G"GI[I[IgIgnst&556		&"4"4"@"@&BTBTB`B`glmr6   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )rO   audio_featuresrV   s      r4   re   z"VoxtralMultiModalProjector.forwardg  s2    n5/m4r6   )rI   rg   rh   r   rC   re   ro   rp   s   @r4   r   r   `  s    n} nr6   r   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    c                       e Zd ZdgZ fdZd Zd Zd Zd Zd Z	d Z
e ed	
      dej                  dee   deez  fd              Zee	 	 	 	 	 	 	 	 	 	 ddej(                  dz  dej                  dz  dej*                  dz  dej(                  dz  dedz  dej                  dz  dej(                  dz  dedz  dej(                  dz  deej*                  z  dee   defd              Z fdZ xZS )VoxtralForConditionalGenerationr   c                 *   t         |   |       |j                  j                  | _        t	        j
                  |j                        | _        t        j
                  |j                        | _	        t        |      | _        | j                          y r   )rB   rC   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   r   s     r4   rC   z(VoxtralForConditionalGeneration.__init__v  sn      ,,77$001D1DE2>>v?Q?QR%?%G" 	r6   c                 6    | j                   j                         S r   )r   r   r   s    r4   r   z4VoxtralForConditionalGeneration.get_input_embeddings  s    ""7799r6   c                 :    | j                   j                  |       y r   )r   r   r   s     r4   r   z4VoxtralForConditionalGeneration.set_input_embeddings  s    007r6   c                 6    | j                   j                         S r   )r   get_output_embeddingsr   s    r4   r   z5VoxtralForConditionalGeneration.get_output_embeddings  s    ""88::r6   c                 :    | j                   j                  |       y r   )r   set_output_embeddings)rO   new_embeddingss     r4   r   z5VoxtralForConditionalGeneration.set_output_embeddings  s    11.Ar6   c                 :    | j                   j                  |       y r   )r   set_decoder)rO   decoders     r4   r  z+VoxtralForConditionalGeneration.set_decoder  s    ''0r6   c                 6    | j                   j                         S r   )r   get_decoderr   s    r4   r  z+VoxtralForConditionalGeneration.get_decoder  s    ""..00r6   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   r   r1   rX   c                      | j                   |fddi|}|j                  }|j                  d| j                  j                  j
                        }| j                  |      }||_        |S )aa  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        return_dictTr$   )r   r   r^   r?   r   r   r   pooler_output)rO   r   r1   audio_outputsaudio_hidden_statesaudio_embedss         r4   get_audio_featuresz2VoxtralForConditionalGeneration.get_audio_features  sn     )((TTTVT+==199"dkk>V>V>h>hi112EF&2#r6   N	input_idsr    position_idsr   r   labels	use_cachecache_positionlogits_to_keepc                    | | j                         |      }||| j                  |d      j                  }|| j                  j                  k(  j                  d      }|j                  |j                  |j                        |j                  |j                              } | j                  d|||||||	|
d|}|S )aj  
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```T)r  r$   )r    r  r   r   r  r  r  r  r   )
r   r  r  r?   audio_token_id	unsqueezemasked_scatterr   r   r   )rO   r  r   r    r  r   r   r  r  r  r  r1   r  audio_token_maskoutputss                  r4   re   z'VoxtralForConditionalGeneration.forward  s    b  7D557	BM%)*?22>t2TbbL !*T[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M ,?4+>+> 
,
)%+'))
,
 
,
 r6   c                     |j                  dd       }|j                  dd      }t        |   |i |}|s|j                  dd      s||d<   |S )Nr   is_first_iterationFr  T)popgetrB   prepare_inputs_for_generation)rO   argsr1   r   r  model_inputsrH   s         r4   r  z=VoxtralForConditionalGeneration.prepare_inputs_for_generation  s]      $4d;#ZZ(<eDw<dMfMVZZT%B-;L)*r6   )
NNNNNNNNNr   )rI   rg   rh   _keep_in_fp32_modules_strictrC   r   r   r   r   r  r  r   r   r+   FloatTensorr   r   rn   r   r  r   rm   r   rl   rj   r   re   r  ro   rp   s   @r4   r   r   n  s    %6#6 :8;B11  w#//;ABT;U	+	+ &  .237.204(,26*.!%26-.F##d*F ))D0F t+	F
 &&-F F ((4/F   4'F $;F ((4/F ell*F +,F 
 F  FP r6   r   )r   r   r   )NrZ   )3r   collections.abcr   r+   r   activationsr   cache_utilsr   
generationr   modeling_layersr	   modeling_outputsr
   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   autor   r   configuration_voxtralr   r   
get_loggerrI   rF   r   rm   rk   r5   r8   rr   r   r   r   r   __all__r   r6   r4   <module>r1     sg  ,  $   !   ) 9 k k F & R R 7 5 2 F 
		H	% !%II%<<% 
% <<	%
 LL4'% T\% %4U)ryy U)p8+4 8+v "_ " " 
n-+ n-
n-b  
L&<o L
L^ Zr6   