
    i                        d dl mZ d dlZddlmZ ddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZ d
dlmZmZmZmZmZmZmZ  ej4                  e      Zdej:                  dededej:                  fdZ  G d de      Z! G d de      Z"e G d de             Z#e G d de             Z$e G d de             Z% G d de
e#      Z& G d de	e#      Z' G d d ee#      Z(g d!Z)y)"    )CallableN   )Cache)FlashAttentionKwargs)GenericForQuestionAnswering GenericForSequenceClassificationGenericForTokenClassification)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringlogging   )MistralAttentionMistralDecoderLayerMistralForCausalLMMistralModelMistralPreTrainedModelapply_rotary_pos_embeager_attention_forwardpositions_idsbetamax_position_embeddingsreturnc           	          d|t        j                  dt        j                  | |z        z         z  z   }|j                  d      S )N   )torchlogfloor	unsqueeze)r   r   r   scalings       m/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/ministral3/modular_ministral3.py_get_llama_4_attn_scaler#      s?    $1u{{=CZ3Z'[#[\\\GR      c                       e Zd Z	 	 d
dej                  deej                  ej                  f   dej                  dz  dedz  dej                  dz  dee	   deej                  ej                  dz  f   fd	Z
y)Ministral3AttentionNhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionkwargsr   c           
      "   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|	t        || j                  j                  j                  d      | j                  j                  j                  d            j                  |	j                        z  }	|'|||d}|j                  |
|| j                  |      \  }
}t!        j"                  | j                  j$                  t&              } || |	|
||f| j(                  sdn| j*                  | j,                  t/        | j                  dd       d	|\  }} |j0                  g |d j3                         }| j5                  |      }||fS )
Nr   r   r   llama_4_scaling_beta original_max_position_embeddings)sincosr+   g        sliding_window)dropoutr!   r2   )shapehead_dimq_projview	transposek_projv_projr   r#   configrope_parametersgettodtypeupdate	layer_idxr
   get_interface_attn_implementationr   trainingattention_dropoutr!   getattrreshape
contiguouso_proj)selfr'   r(   r)   r*   r+   r,   input_shapehidden_shapequery_states
key_statesvalue_statesr1   r0   cache_kwargsattention_interfaceattn_outputattn_weightss                     r"   forwardzMinistral3Attention.forward#   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j#&=KK''++,BCKK''++,NO'
 "\
 	! &#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
!\ *k));;;;FFHkk+.L((r$   )NN)__name__
__module____qualname__r   Tensortupler   
LongTensorr   r   rT    r$   r"   r&   r&   "   s     )-26/)||/) #5<<#=>/) t+	/)
 /) ((4//) -./) 
u||U\\D00	1/)r$   r&   c                       e Zd Zy)Ministral3DecoderLayerNrU   rV   rW   r[   r$   r"   r]   r]   U       r$   r]   c                       e Zd Zy)Ministral3PreTrainedModelNr^   r[   r$   r"   ra   ra   Y       r$   ra   c                       e Zd Zy)Ministral3ModelNr^   r[   r$   r"   rd   rd   ^   rb   r$   rd   c                       e Zd Zy)Ministral3ForCausalLMNr^   r[   r$   r"   rf   rf   c   rb   r$   rf   c                       e Zd Zy) Ministral3ForTokenClassificationNr^   r[   r$   r"   rh   rh   h   r_   r$   rh   c                       e Zd Zy)#Ministral3ForSequenceClassificationNr^   r[   r$   r"   rj   rj   l   r_   r$   rj   c                       e Zd Zy)Ministral3ForQuestionAnsweringNr^   r[   r$   r"   rl   rl   p   r_   r$   rl   )rf   rl   rd   ra   rj   rh   )*collections.abcr   r   cache_utilsr   modeling_flash_attention_utilsr   modeling_layersr   r   r	   modeling_utilsr
   processing_utilsr   utilsr   r   mistral.modeling_mistralr   r   r   r   r   r   r   
get_loggerrU   loggerrX   floatintr#   r&   r]   ra   rd   rf   rh   rj   rl   __all__r[   r$   r"   <module>rz      s   $    B 
 6 & ,   
		H	%!5<< !u !_b !glgsgs !
0)* 0)f	0 	 	 6 	 	 	l 	 	 	. 	 		'DF_ 		*JLe 		%@B[ 	r$   