
    i6                        d dl Z d dl mZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ dd	lmZmZ d
dlmZmZmZmZmZmZ d
dlmZ ddlmZ  ej6                  e      Z G d de      Z G d dej>                        Z  G d dej>                        Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z&g dZ'y)     N)nn   )ACT2FN)Cache)BaseModelOutputWithPooling)Unpack)auto_docstringlogging)can_return_tuplemerge_with_config_defaults   )LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModelTransformersKwargs)MistralRMSNorm   )Mistral3Configc                       e Zd Zy)Mistral3RMSNormN__name__
__module____qualname__     i/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/mistral3/modular_mistral3.pyr   r   (       r   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    configc                 "   t         |           || _        |j                  j                  }|j
                  | _        | j                  j                  j                  | _        t        j                  || j
                  dz  z  |d      | _	        y )Nr   Fbias)
super__init__r#   vision_confighidden_sizespatial_merge_size
patch_sizer   Linearmerging_layer)selfr#   r*   	__class__s      r   r(   zMistral3PatchMerger.__init__1   sr    **66"(";";++33>>YY{T5L5La5O'OQ\chir   image_featuresimage_sizesreturnc                    |D cg c]&  }|d   | j                   z  |d   | j                   z  f( }}|D cg c]
  \  }}||z   }}}|j                  d   }g }t        |j                  |            D ]  \  }	}
||	   \  }}|
j	                  |||      j                  ddd      j                  d      }t        j                  j                  j                  || j                  | j                        }|j	                  || j                  dz  z  d      j                         }|j                  |        t        j                  |d      }| j                  |      }|S c c}w c c}}w )Nr   r   r   )kernel_sizestridedim)r,   shape	enumeratesplitviewpermute	unsqueezetorchr   
functionalunfoldr+   tappendcatr.   )r/   r1   r2   
image_sizehwtokens_per_imagedpermuted_tensorimage_indeximage_tokens
image_gridgrids                r   forwardzMistral3PatchMerger.forward:   sl   cn
U_Z]doo-z!}/OP
 
 /::daAE::  $)2>3G3GHX3Y)Z 	)%K{+DAq%**1a3;;Aq!DNNqQJ88&&--(?(?H_H_ . D 99Q!8!8!!;;R@BBDD""4(	) ?:++N;)
 ;s
   +E"E')
r   r   r   __doc__r   r(   r@   TensorrP   __classcell__r0   s   @r   r"   r"   ,   s?    j~ jell  RWR^R^ r   r"   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Mistral3MultiModalProjectorr#   c                    t         |           t        |j                  j                  |j
                  j                        | _        t        |      | _	        t        |j                  t              rdnt        |j                        | _        t        j                   |j                  j                  | j                  z  |j
                  j                  |j"                        | _        t&        |j(                     | _        t        j                   |j
                  j                  |j
                  j                  |j"                        | _        y )N)epsr   r%   )r'   r(   r   r)   r*   text_configrms_norm_epsnormr"   patch_merger
isinstancevision_feature_layerintlennum_feature_layersr   r-   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)r/   r#   r0   s     r   r(   z$Mistral3MultiModalProjector.__init__S   s    #F$8$8$D$D&J\J\JiJij	/7 F77=A3vGbGbCc 	 		  ,,t/F/FF**11

 &556		**F,>,>,J,JQWQqQq
r   r1   r2   c                     | j                  |      }| j                  ||      }| j                  |      }| j                  |      }| j	                  |      }|S N)r[   r\   rc   re   rf   )r/   r1   r2   hidden_statess       r   rP   z#Mistral3MultiModalProjector.forwarde   sR    >2**>;Gn5/m4r   )	r   r   r   r   r(   r@   rR   rP   rS   rT   s   @r   rV   rV   R   s*    
~ 
$ell  r   rV   c                       e Zd Zy)Mistral3CausalLMOutputWithPastNr   r   r   r   rk   rk   n   r    r   rk   c                       e Zd Zy)Mistral3ModelOutputWithPastNr   r   r   r   rm   rm   r   r    r   rm   c                       e Zd Zy)Mistral3PreTrainedModelNr   r   r   r   ro   ro   v   r    r   ro   c            "          e Zd Zee ed      	 	 ddej                  dej                  de	e
e	   z  dz  dedz  dee   d	eez  fd
                     Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej"                  dz  dej                  dz  dej                  dz  dej"                  dz  dedz  dej                  dz  de	e
e	   z  dz  dedz  dedz  dedz  dedz  dej"                  dz  dej                  dz  dee   d	eez  fd              Zy)Mistral3ModelzWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introNpixel_valuesr2   r^   output_hidden_stateskwargsr3   c                    |j                         D ci c]  \  }}|	|| }}} | j                  |f|ddd|}t        |t              r|j                  |   }	n3|D 
cg c]  }
|j                  |
    }}
t        j                  |d      }	| j                  |	j                  d      |      }| j                  j                  | j                  j                  z  }t        j                  ||j                        |z  j                  d      j                         }t        j                   |j                  d      |      }||_        |S c c}}w c c}
w )NT)r2   rt   return_dictr5   r8   r   )device)itemsvision_towerr]   r_   ri   r@   rE   multi_modal_projectorsqueezer,   r#   r+   	as_tensorrx   prodtolistr<   pooler_output)r/   rs   r2   r^   rt   ru   kvimage_outputsselected_image_feature	layer_idxhs_poolr1   downsample_ratiosplit_sizess                  r   get_image_featuresz Mistral3Model.get_image_features{   sS    $*<<>C41aQ]!Q$CC)))
#!%	

 
 *C0%2%@%@AU%V"Ocd)}229=dGd%*YYwB%?"334J4R4RST4UWbc,,77$++:X:XX__[1F1FGK[[aafhaippr 	 ^%;%;A%>L&4#3 D es   
EE"E	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrw   cache_positionc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }|d u |d uz  rt	        d      | | j                         |      }|| j                  |||d      j                  }t        j                  |d      j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d	||||||	|
d|d	|}t!        |j"                  |j$                  |j&                  |j(                  |      S d       S )
Nz:You must specify exactly one of input_ids or inputs_embedsT)rs   r^   r2   rw   r   r8   )r   r1   )	r   r   r   r   r   r   rt   rw   r   )last_hidden_stater   ri   
attentionsimage_hidden_statesr   )r#   r   rt   use_return_dict
ValueErrorget_input_embeddingsr   r   r@   rE   torx   dtypeget_placeholder_maskmasked_scatterlanguage_modelrm   r   r   ri   r   )r/   r   rs   r   r   r   r   r^   r   r   rt   rw   r   r2   ru   r1   special_image_maskoutputss                     r   rP   zMistral3Model.forward   s   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<YZZ 7D557	BM#!44)%9' 	 5 
 m  #YY~1=@@AUAUWdWjWjkN!%!:!:~ "; " *889K^\M%$%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r   )NN)NNNNNNNNNNNNN)r   r   r   r   r   r	   r@   FloatTensorrR   r_   listboolr   r   tupler   r   
LongTensorr   rm   rP   r   r   r   rq   rq   z   s   n 8<,0!''! \\! "DIo4	!
 #Tk! +,! 
+	+!   
!F  .215.204(,267;!%)-,0#'26+/=
##d*=
 ''$.=
 t+	=

 &&-=
 =
 ((4/=
 "DIo4=
 $;=
  $;=
 #Tk=
 D[=
 ((4/=
 \\D(=
 +,=
  
,	,!=
  =
r   rq   c            "          e Zd Ze	 ddej
                  dej                  deee   z  dz  de	e
   deez  f
d       Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej
                  dz  d	ej                  dz  d
ej                  dz  dedz  dej
                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  z  dej                  dz  de	e
   deez  f dZy) Mistral3ForConditionalGenerationNrs   r2   r^   ru   r3   c                 B     | j                   j                  d|||d|S )N)rs   r2   r^   r   )modelr   )r/   rs   r2   r^   ru   s        r   r   z3Mistral3ForConditionalGeneration.get_image_features   s5     -tzz,, 
%#!5
 	
 	
r   r   r   r   r   r   labelsr   r   rt   rw   r   logits_to_keepc                 <   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  } | j                  d||||||||	|
d||d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|4 | j                  d||| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                         S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

        >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
        >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

        >>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is the image?The image depicts two cats lying on a pink blanket."
        ```NT)r   rs   r   r   r   r   r   r   rt   rw   r   r2   r   )logitsr   
vocab_size)lossr   r   ri   r   r   r   )r#   r   rt   r   r   r]   r_   slicelm_headloss_functionrY   r   rk   r   ri   r   r   )r/   r   rs   r   r   r   r   r   r   r   rt   rw   r   r   r2   ru   r   ri   slice_indicesr   r   s                        r   rP   z(Mistral3ForConditionalGeneration.forward   sP   ^ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%)%+'/!5)#
 
   
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r   rh   )NNNNNNNNNNNNr   N)r   r   r   r	   r@   r   rR   r_   r   r   r   r   r   r   r   r   r   rk   rP   r   r   r   r   r      s   
 8<	
''
 \\
 "DIo4	

 +,
 
+	+
 
  .215.204(,26*.!%)-,0#'26-.+/W
##d*W
 ''$.W
 t+	W

 &&-W
 W
 ((4/W
   4'W
 $;W
  $;W
 #TkW
 D[W
 ((4/W
 ell*W
 \\D(W
  +,!W
" 
/	/#W
r   r   )rq   ro   r   )(r@   r   activationsr   cache_utilsr   modeling_outputsr   processing_utilsr   utilsr	   r
   utils.genericr   r   llava.modeling_llavar   r   r   r   r   r   mistral.modeling_mistralr   configuration_mistral3r   
get_loggerr   loggerr   Moduler"   rV   rk   rm   ro   rq   r   __all__r   r   r   <module>r      s       !   : & , I  6 2 
		H	%	n 	#")) #L")) 8	%@ 		": 		2 	h
J h
Vg
'D g
Tr   