
    1i.                        d dl Z d dl mZ ddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ  G d de      Z  G d de      Z! G d de      Z" ed       G d de             Z# G d dejH                        Z% ed       G d de"e             Z&g d Z'y)!    N)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM)Qwen2AudioAttentionQwen2AudioEncoderQwen2AudioEncoderLayerQwen2AudioPreTrainedModel   )VoxtralConfigc                       e Zd Zy)VoxtralAttentionN__name__
__module____qualname__     g/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/voxtral/modular_voxtral.pyr   r   )       r!   r   c                       e Zd Zy)VoxtralEncoderLayerNr   r    r!   r"   r%   r%   -   r#   r!   r%   c                        e Zd ZdZdZdZdZdZy)VoxtralPreTrainedModelTN)r   r   r   _supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraph_no_split_modulesr    r!   r"   r'   r'   1   s      "&!r!   r'   z:
    The Voxtral encoder, which is a Whisper encoder.
    custom_introc                   J    e Zd ZeedZee	 ddee	   de
ez  fd              Zy)VoxtralEncoder)
attentionshidden_statesNkwargsreturnc           	         | j                   j                  | j                  j                  d   z  | j                  j                  d   z  }|j
                  d   |k7  r"t        d| d|j
                  d    d| d      |j                  | j                  j                  j                  | j                  j                  j                        }t        j                  j                  | j                  |            }t        j                  j                  | j	                  |            }|j                  ddd	      }| j                  j                  }||z   j                  |j                        }t        j                  j!                  || j                   | j"                  
      }t%        | j&                        D ]  \  }}	 |	||      }
|
d   } | j)                  |      }t+        |      S )a  
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        r   z7Voxtral expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to .)dtypedevicer   r   )ptraining)attention_mask)last_hidden_state)configmax_source_positionsconv1strideconv2shape
ValueErrortoweightr8   r9   r   
functionalgelupermuteembed_positionsdropoutr;   	enumeratelayers
layer_normr	   )selfinput_featuresr<   r3   expected_seq_lengthinputs_embeds	embed_posr2   idxencoder_layerlayer_outputss              r"   forwardzVoxtralEncoder.forwardE   s   ( #kk>>ARARSTAUUX\XbXbXiXijkXll#'::IJ]I^^jkykk  AC  lD  kE  Er  sF  rG  GH  I  (**1B1B1H1HQUQ[Q[QbQbQiQi*j**4::n+EF**4::m+DE%--aA6((//	&266}7J7JK--mt||VZVcVc-d"+DKK"8 	-C)-M *!,M	- 6)+
 	
r!   N)r   r   r   r   r%   _can_record_outputsr   r   r   r   tupler	   rW   r    r!   r"   r0   r0   :   sS     ',
   ,
 +,	,

 
+	+,
   ,
r!   r0   c                   *     e Zd Zdef fdZd Z xZS )VoxtralMultiModalProjectorr>   c                 f   t         |           t        j                  |j                  j
                  |j                  j                  d      | _        t        |j                     | _        t        j                  |j                  j                  |j                  j                  d      | _        y )NF)bias)super__init__r   Linearaudio_configintermediate_sizetext_confighidden_sizelinear_1r   projector_hidden_actactlinear_2rO   r>   	__class__s     r"   r`   z#VoxtralMultiModalProjector.__init__w   sz    		&"5"5"G"GI[I[IgIgnst&556		&"4"4"@"@&BTBTB`B`glmr!   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rX   )rf   rh   ri   )rO   audio_featuresr2   s      r"   rW   z"VoxtralMultiModalProjector.forward}   s2    n5/m4r!   )r   r   r   r   r`   rW   __classcell__rk   s   @r"   r\   r\   v   s    n} nr!   r\   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    c                       e Zd ZdgZ fdZd Zd Zd Zd Zd Z	d Z
e ed	
      dej                  dee   deez  fd              Zee	 	 	 	 	 	 	 	 	 	 ddej(                  dz  dej                  dz  dej*                  dz  dej(                  dz  dedz  dej                  dz  dej(                  dz  dedz  dej(                  dz  deej*                  z  dee   defd              Z fdZ xZS )VoxtralForConditionalGenerationrJ   c                 *   t         |   |       |j                  j                  | _        t	        j
                  |j                        | _        t        j
                  |j                        | _	        t        |      | _        | j                          y rX   )r_   r`   rd   
vocab_sizer   from_configrb   audio_towerr   language_modelr\   multi_modal_projector	post_initrj   s     r"   r`   z(VoxtralForConditionalGeneration.__init__   sn      ,,77$001D1DE2>>v?Q?QR%?%G" 	r!   c                 6    | j                   j                         S rX   )rv   get_input_embeddingsrO   s    r"   rz   z4VoxtralForConditionalGeneration.get_input_embeddings   s    ""7799r!   c                 :    | j                   j                  |       y rX   )rv   set_input_embeddings)rO   values     r"   r}   z4VoxtralForConditionalGeneration.set_input_embeddings   s    007r!   c                 6    | j                   j                         S rX   )rv   get_output_embeddingsr{   s    r"   r   z5VoxtralForConditionalGeneration.get_output_embeddings   s    ""88::r!   c                 :    | j                   j                  |       y rX   )rv   set_output_embeddings)rO   new_embeddingss     r"   r   z5VoxtralForConditionalGeneration.set_output_embeddings   s    11.Ar!   c                 :    | j                   j                  |       y rX   )rv   set_decoder)rO   decoders     r"   r   z+VoxtralForConditionalGeneration.set_decoder   s    ''0r!   c                 6    | j                   j                         S rX   )rv   get_decoderr{   s    r"   r   z+VoxtralForConditionalGeneration.get_decoder   s    ""..00r!   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r-   rP   r3   r4   c                      | j                   |fddi|}|j                  }|j                  d| j                  j                  j
                        }| j                  |      }||_        |S )aa  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        return_dictTr6   )ru   r=   reshaper>   rb   rc   rw   pooler_output)rO   rP   r3   audio_outputsaudio_hidden_statesaudio_embedss         r"   get_audio_featuresz2VoxtralForConditionalGeneration.get_audio_features   sn     )((TTTVT+==199"dkk>V>V>h>hi112EF&2#r!   N	input_idsr<   position_idspast_key_valuesrR   labels	use_cachecache_positionlogits_to_keepc                    | | j                         |      }||| j                  |d      j                  }|| j                  j                  k(  j                  d      }|j                  |j                  |j                        |j                  |j                              } | j                  d|||||||	|
d|}|S )aj  
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```T)r   r6   )r<   r   r   rR   r   r   r   r   r    )
rz   r   r   r>   audio_token_id	unsqueezemasked_scatterrE   r9   rv   )rO   r   rP   r<   r   r   rR   r   r   r   r   r3   r   audio_token_maskoutputss                  r"   rW   z'VoxtralForConditionalGeneration.forward   s    b  7D557	BM%)*?22>t2TbbL !*T[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M ,?4+>+> 
,
)%+'))
,
 
,
 r!   c                     |j                  dd       }|j                  dd      }t        |   |i |}|s|j                  dd      s||d<   |S )NrP   is_first_iterationFr   T)popgetr_   prepare_inputs_for_generation)rO   argsr3   rP   r   model_inputsrk   s         r"   r   z=VoxtralForConditionalGeneration.prepare_inputs_for_generation	  s]      $4d;#ZZ(<eDw<dMfMVZZT%B-;L)*r!   )
NNNNNNNNNr   )r   r   r   _keep_in_fp32_modules_strictr`   rz   r}   r   r   r   r   r   r   torchFloatTensorr   r   rZ   r	   r   
LongTensorTensorr   boolintr
   rW   r   rn   ro   s   @r"   rq   rq      s    %6#6 :8;B11  w#//;ABT;U	+	+ &  .237.204(,26*.!%26-.F##d*F ))D0F t+	F
 &&-F F ((4/F   4'F $;F ((4/F ell*F +,F 
 F  FP r!   rq   )r'   r0   rq   )(r   r   activationsr   cache_utilsr   
generationr   modeling_outputsr   r	   r
   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   autor   r    qwen2_audio.modeling_qwen2_audior   r   r   r   configuration_voxtralr   r   r%   r'   r0   Moduler\   rq   __all__r    r!   r"   <module>r      s       !   ) 
 ' I I 7 5 2  1	* 		0 	6  
4
& 4

4
n  
L&<o L
L^ Zr!   