
    iX                     F   d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZ  ed       G d dej>                               Z  G d dej>                        Z! G d dej>                        Z"e ed       G d de                    Z#e ed       G d de                    Z$e G d  d!e             Z% ed"       G d# d$e%             Z& ed%       G d& d'e%e	             Z'g d(Z(y))    )	dataclassN)nn   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringtorch_compilable_check)can_return_tuplemerge_with_config_defaults   )	AutoModel   )Mistral3ConfigRMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	Mistral3RMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z>
        Mistral3RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer   	__class__s      j/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/mistral3/modeling_mistral3.pyr   zMistral3RMSNorm.__init__*   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr   T)keepdim)	dtypetor!   float32powmeanrsqrtr$   r#   )r%   r*   input_dtypevariances       r(   forwardzMistral3RMSNorm.forward2   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r)   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler#   shaper$   r%   s    r(   
extra_reprzMistral3RMSNorm.extra_repr9   s*    ))*+6$2G2G1HIIr)   )gư>)
__name__
__module____qualname__floatr   r!   Tensorr6   r;   __classcell__r'   s   @r(   r   r   (   s7    $ $$ $;U\\ ;ell ;Jr)   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    configc                 "   t         |           || _        |j                  j                  }|j
                  | _        | j                  j                  j                  | _        t        j                  || j
                  dz  z  |d      | _	        y )Nr   Fbias)
r   r   rE   vision_configr&   spatial_merge_size
patch_sizer   Linearmerging_layer)r%   rE   r&   r'   s      r(   r   zMistral3PatchMerger.__init__B   sr    **66"(";";++33>>YY{T5L5La5O'OQ\chir)   image_featuresimage_sizesr   c                    |D cg c]&  }|d   | j                   z  |d   | j                   z  f( }}|D cg c]
  \  }}||z   }}}|j                  d   }g }t        |j                  |            D ]  \  }	}
||	   \  }}|
j	                  |||      j                  ddd      j                  d      }t        j                  j                  j                  || j                  | j                        }|j	                  || j                  dz  z  d      j                         }|j                  |        t        j                  |d      }| j                  |      }|S c c}w c c}}w )Nr   r   r,   r   )kernel_sizestridedim)rK   r9   	enumeratesplitviewpermute	unsqueezer!   r   
functionalunfoldrJ   tappendcatrM   )r%   rN   rO   
image_sizehwtokens_per_imagedpermuted_tensorimage_indeximage_tokens
image_gridgrids                r(   r6   zMistral3PatchMerger.forwardK   sl   cn
U_Z]doo-z!}/OP
 
 /::daAE::  $)2>3G3GHX3Y)Z 	)%K{+DAq%**1a3;;Aq!DNNqQJ88&&--(?(?H_H_ . D 99Q!8!8!!;;R@BBDD""4(	) ?:++N;)
 ;s
   +E"E')
r<   r=   r>   __doc__r   r   r!   r@   r6   rA   rB   s   @r(   rD   rD   =   s?    j~ jell  RWR^R^ r)   rD   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Mistral3MultiModalProjectorrE   c                    t         |           t        |j                  j                  |j
                  j                        | _        t        |      | _	        t        |j                  t              rdnt        |j                        | _        t        j                   |j                  j                  | j                  z  |j
                  j                  |j"                        | _        t&        |j(                     | _        t        j                   |j
                  j                  |j
                  j                  |j"                        | _        y )N)r   r   rG   )r   r   r   rI   r&   text_configrms_norm_epsnormrD   patch_merger
isinstancevision_feature_layerintlennum_feature_layersr   rL   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2r%   rE   r'   s     r(   r   z$Mistral3MultiModalProjector.__init__d   s    #F$8$8$D$D&J\J\JiJij	/7 F77=A3vGbGbCc 	 		  ,,t/F/FF**11

 &556		**F,>,>,J,JQWQqQq
r)   rN   rO   c                     | j                  |      }| j                  ||      }| j                  |      }| j                  |      }| j	                  |      }|S N)ro   rp   rw   ry   rz   )r%   rN   rO   r*   s       r(   r6   z#Mistral3MultiModalProjector.forwardv   sR    >2**>;Gn5/m4r)   )	r<   r=   r>   r   r   r!   r@   r6   rA   rB   s   @r(   rk   rk   c   s*    
~ 
$ell  r)   rk   zT
    Base class for Mistral3 causal language model (or autoregressive) outputs.
    custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	Mistral3CausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valuesr*   
attentionsimage_hidden_states)r<   r=   r>   ri   r   r!   FloatTensor__annotations__r   r   r   r*   r8   r   r    r)   r(   r   r      s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r)   r   zM
    Base class for Mistral3 outputs, with hidden states and attentions.
    c                   :    e Zd ZU dZdZej                  dz  ed<   y)Mistral3ModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr   )r<   r=   r>   ri   r   r!   r   r   r   r)   r(   r   r      s    	 59**T18r)   r   c                   <    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdZy)Mistral3PreTrainedModelrE   model)imagetextTr   N)r<   r=   r>   r   r   base_model_prefixinput_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr   r)   r(   r   r      s=    (&*#"3N!"&r)   r   zx
    The Mistral3 model which consists of a vision backbone and a language model, without a language modeling head.
    c            "           e Zd ZddiZdef fdZd Zd Zee	 e
d      	 	 dd
ej                  dej                  deee   z  d	z  ded	z  dee   deez  fd                     Zdej,                  dej                  dej                  fdZee
	 	 	 	 	 	 	 	 	 	 	 	 	 ddej,                  d	z  d
ej                  d	z  dej                  d	z  dej,                  d	z  ded	z  dej                  d	z  deee   z  d	z  ded	z  ded	z  ded	z  ded	z  dej,                  d	z  dej                  d	z  dee   deez  fd              Z xZS )Mistral3Model^language_model.modellanguage_modelrE   c                     t         |   |       t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        | j                          y r}   )r   r   r   from_configrI   vision_towerrk   multi_modal_projectorrm   r   	post_initr{   s     r(   r   zMistral3Model.__init__   sY     %11&2F2FG%@%H"'33F4F4FGr)   c                 6    | j                   j                         S r}   )r   get_input_embeddingsr:   s    r(   r   z"Mistral3Model.get_input_embeddings   s    ""7799r)   c                 :    | j                   j                  |       y r}   )r   set_input_embeddingsr%   values     r(   r   z"Mistral3Model.set_input_embeddings   s    007r)   zWObtains image last hidden states from the vision tower and apply multimodal projection.r~   Npixel_valuesrO   rr   output_hidden_stateskwargsr   c                    |j                         D ci c]  \  }}|	|| }}} | j                  |f|ddd|}t        |t              r|j                  |   }	n3|D 
cg c]  }
|j                  |
    }}
t        j                  |d      }	| j                  |	j                  d      |      }| j                  j                  | j                  j                  z  }t        j                  ||j                        |z  j                  d      j                         }t        j                   |j                  d      |      }||_        |S c c}}w c c}
w )NT)rO   r   return_dictr,   rS   r   )device)itemsr   rq   rs   r*   r!   r^   r   squeezerK   rE   rJ   	as_tensorr   prodtolistrV   pooler_output)r%   r   rO   rr   r   r   kvimage_outputsselected_image_feature	layer_idxhs_poolrN   downsample_ratiosplit_sizess                  r(   get_image_featuresz Mistral3Model.get_image_features   sS    $*<<>C41aQ]!Q$CC)))
#!%	

 
 *C0%2%@%@AU%V"Ocd)}229=dGd%*YYwB%?"334J4R4RST4UWbc,,77$++:X:XX__[1F1FGK[[aafhaippr 	 ^%;%;A%>L&4#3 D es   
EE"E	input_idsinputs_embedsrN   c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )r.   r   r,   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r!   tensorrE   image_token_idlongr   allsumr9   rY   	expand_asr/   r   numel)r%   r   r   rN   special_image_maskn_image_tokensn_image_featuress          r(   get_placeholder_maskz"Mistral3Model.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r)   attention_maskposition_idsr   	use_cacheoutput_attentionsr   cache_positionc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }|d u |d uz  rt	        d      | | j                         |      }|| j                  |||d      j                  }t        j                  |d      j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d	||||||	|
d|d	|}t!        |j"                  |j$                  |j&                  |j(                  |      S d       S )
Nz:You must specify exactly one of input_ids or inputs_embedsT)r   rr   rO   r   r   rS   )r   rN   )	r   r   r   r   r   r   r   r   r   )last_hidden_stater   r*   r   r   r   )rE   r   r   use_return_dict
ValueErrorr   r   r   r!   r^   r/   r   r.   r   masked_scatterr   r   r   r   r*   r   )r%   r   r   r   r   r   r   rr   r   r   r   r   r   rO   r   rN   r   outputss                     r(   r6   zMistral3Model.forward  s   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<YZZ 7D557	BM#!44)%9' 	 5 
 m  #YY~1=@@AUAUWdWjWjkN!%!:!:~ "; " *889K^\M%$%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r)   )NN)NNNNNNNNNNNNN)r<   r=   r>   _checkpoint_conversion_mappingr   r   r   r   r   r   r   r!   r   r@   rs   listboolr   r   r8   r   r   
LongTensorr   r   r   r6   rA   rB   s   @r(   r   r      s=    	!"2&"~ :8 n 8<,0!''! \\! "DIo4	!
 #Tk! +,! 
+	+!   
!F"))":?:K:K"]b]n]n"0  .215.204(,267;!%)-,0#'26+/=
##d*=
 ''$.=
 t+	=

 &&-=
 =
 ((4/=
 "DIo4=
 $;=
  $;=
 #Tk=
 D[=
 ((4/=
 \\D(=
 +,=
  
,	,!=
  =
r)   r   zV
    The MISTRAL3 model which consists of a vision backbone and a language model.
    c            $       ~    e Zd ZdddddZddiZdef fd	Zd
 Zd Zde	j                  fdZe	 d"dej                  dej                  deee   z  dz  dee   deez  f
d       Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dej0                  dz  dej                  dz  dej                  dz  dej0                  dz  dedz  dej                  dz  dej0                  dz  dedz  dedz  dedz  dedz  dej0                  dz  deej                  z  dej                  dz  dee   deez  f d               Z	 	 	 	 	 	 	 d$ fd!	Z xZS )% Mistral3ForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)r   z^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightrE   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFrG   )r   r   r   r   r   rL   rm   r&   
vocab_sizer   r   r{   s     r(   r   z)Mistral3ForConditionalGeneration.__init__j  sS     "6*
yy!3!3!?!?ASASA^A^ejkr)   c                 6    | j                   j                         S r}   )r   r   r:   s    r(   r   z5Mistral3ForConditionalGeneration.get_input_embeddingsp  s    zz..00r)   c                 :    | j                   j                  |       y r}   )r   r   r   s     r(   r   z5Mistral3ForConditionalGeneration.set_input_embeddingss  s    

''.r)   r   c                     | j                   S r}   )r   r:   s    r(   get_output_embeddingsz6Mistral3ForConditionalGeneration.get_output_embeddingsv  s    ||r)   Nr   rO   rr   r   c                 B     | j                   j                  d|||d|S )N)r   rO   rr   r   )r   r   )r%   r   rO   rr   r   s        r(   r   z3Mistral3ForConditionalGeneration.get_image_featuresy  s5     -tzz,, 
%#!5
 	
 	
r)   r   r   r   r   r   labelsr   r   r   r   r   logits_to_keepc                 <   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  } | j                  d||||||||	|
d||d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|4 | j                  d||| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                         S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

        >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
        >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

        >>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is the image?The image depicts two cats lying on a pink blanket."
        ```NT)r   r   r   r   r   r   r   r   r   r   r   rO   r   )r   r   r   )r   r   r   r*   r   r   r   )rE   r   r   r   r   rq   rs   slicer   loss_functionrm   r   r   r   r*   r   r   )r%   r   r   r   r   r   r   r   r   r   r   r   r   r   rO   r   r   r*   slice_indicesr   r   s                        r(   r6   z(Mistral3ForConditionalGeneration.forward  sP   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%)%+'/!5)#
 
   
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r)   c	           
      h    t        |   |f||||||d|	}
|s|	j                  dd      s||
d<   |
S )N)r   r   r   r   r   is_first_iterationr   Tr   )r   prepare_inputs_for_generationget)r%   r   r   r   r   r   r   r   r   r   model_inputsr'   s              r(   r   z>Mistral3ForConditionalGeneration.prepare_inputs_for_generation  s\     w<	
+')))1	
 	
 VZZT%B
 ,8L(r)   r}   )NNNNNNNNNNNNr   N)NNNNNNF)r<   r=   r>   r   _tied_weights_keysr   r   r   r   r   Moduler   r   r!   r   r@   rs   r   r   r   r8   r   r   r   r   r   r   r   r6   r   rA   rB   s   @r(   r   r   \  sP    #9.#@$-	&" +,VW~ 1/ryy  
 8<	
''
 \\
 "DIo4	

 +,
 
+	+
 
  .215.204(,26*.!%)-,0#'26-.+/W
##d*W
 ''$.W
 t+	W

 &&-W
 W
 ((4/W
   4'W
 $;W
  $;W
 #TkW
 D[W
 ((4/W
 ell*W
 \\D(W
  +,!W
" 
/	/#W
  W
x     r)   r   )r   r   r   ))dataclassesr   r!   r   activationsr   cache_utilsr   
generationr   integrationsr	   modeling_outputsr
   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   r   autor   configuration_mistral3r   r   r   rD   rk   r   r   r   r   r   __all__r   r)   r(   <module>r      sS  , "   !   ) 7 ` ` - & O O I  2 Y'Jbii J (J(#")) #L")) 8 
9[ 9 90 
9"9 9 9 'o ' ' 
R
+ R

R
j 
b'> b
bJ [r)   