
    it-                     H   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-  ej\                  e/      Z0 G d de"      Z1 G d de#      Z2 G d de      Z3 G d de       Z4 G d de'      Z5 G d  d!e+      Z6 G d" d#ejn                        Z8 G d$ d%e!      Z9e G d& d'e             Z:e G d( d)e)             Z; G d* d+e(e      Z<g d,Z=y)-zPyTorch OLMoE model.    )CallableN)nn   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask)MoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_grouped_mm_availablelogging)OutputRecorder   )GemmaMLP)LlamaAttentionLlamaDecoderLayerLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)MixtralExpertsMixtralForCausalLMMixtralModel)Qwen2MoeTopKRouter   )OlmoeConfigc                         e Zd Zd fd	Z xZS )OlmoeRMSNormc                 &    t         |   ||       y N)super__init__)selfhidden_sizeeps	__class__s      c/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/olmoe/modular_olmoe.pyr'   zOlmoeRMSNorm.__init__.   s    c*    )gh㈵>)__name__
__module____qualname__r'   __classcell__r+   s   @r,   r#   r#   -   s    + +r-   r#   c                       e Zd Zy)OlmoeRotaryEmbeddingNr.   r/   r0    r-   r,   r4   r4   2       r-   r4   c                       e Zd Zy)OlmoeMLPNr5   r6   r-   r,   r9   r9   6   r7   r-   r9   c                   :    e Zd Zddededz  f fdZ	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )OlmoeAttentionNconfig	layer_idxc                     t         |   ||       t        |j                  |j                        | _        t        |j                  |j                  z  |j                  z  |j                        | _        y )Nr*   )	r&   r'   r#   r)   rms_norm_epsq_normnum_attention_headsnum_key_value_headsk_normr(   r<   r=   r+   s      r,   r'   zOlmoeAttention.__init__;   s`    +"6#5#56;N;NO"6#=#==A[A[[agatat
r-   hidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionkwargsreturnc           
         |j                   d d }g |d| j                  }| j                  | j                  |            }	| j	                  | j                  |            }
| j                  |      }| j                  j                  |	j                  | j                  j                   | j                  j                         |
j                  | j                  j                   | j                  j                         |j                  | j                  j                   | j                  j                          |	j                  | j                  dd      }	 |
j                  | j                  dd      }
 |j                  | j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                   | j                  j"                  t$              } || |	|
||f| j&                  sdn| j(                  | j*                  t-        | j                  dd       d|\  }} |j.                  g |d j1                         }| j3                  |      }||fS )	N)minmaxr    r   )sincosrJ           sliding_window)dropoutscalingrT   )shapehead_dimrA   q_projrD   k_projv_projr<   clip_qkvclamp_view	transposer   updater=   r   get_interface_attn_implementationr   trainingattention_dropoutrV   getattrreshape
contiguouso_proj)r(   rF   rG   rH   rI   rJ   rK   input_shapehidden_shapequery_states
key_statesvalue_statesrR   rQ   cache_kwargsattention_interfaceattn_outputattn_weightss                     r,   forwardzOlmoeAttention.forwardB   sa    $))#2.88b8$--8{{4;;}#=>[[]!;<
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST(|((,7AA!QG$Z__l3==aC
(|((,7AA!QG&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
!\ *k));;;;FFHkk+.L((r-   r%   )NN)r.   r/   r0   r!   intr'   torchTensortupler   
LongTensorr   r   rr   r1   r2   s   @r,   r;   r;   :   s    
{ 
sTz 
 )-262)||2) #5<<#=>2) t+	2)
 2) ((4/2) +,2) 
u||U\\D0%2E2LL	M2)r-   r;   c                       e Zd Zy)OlmoeExpertsNr5   r6   r-   r,   ry   ry   w   r7   r-   ry   c                       e Zd Zy)OlmoeTopKRouterNr5   r6   r-   r,   r{   r{   {   r7   r-   r{   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )OlmoeSparseMoeBlockc                 b    t         |           t        |      | _        t	        |      | _        y r%   )r&   r'   r{   gatery   expertsr(   r<   r+   s     r,   r'   zOlmoeSparseMoeBlock.__init__   s&    #F+	#F+r-   rF   rL   c                     |j                   \  }}}|j                  d|      }| j                  |      \  }}}| j                  |||      j	                  |||      }|S )NrN   )rW   r^   r   r   rf   )	r(   rF   
batch_sizesequence_length
hidden_dim_top_k_weightstop_k_indexfinal_hidden_statess	            r,   rr   zOlmoeSparseMoeBlock.forward   sh    2?2E2E/
OZ%**2z:(,		-(@%=+"ll=+}U]]
 #"r-   )r.   r/   r0   r'   rt   ru   rr   r1   r2   s   @r,   r}   r}      s#    ,
#U\\ #ell #r-   r}   c                   (     e Zd Zdedef fdZ xZS )OlmoeDecoderLayerr<   r=   c                 $   t         |   ||       |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y )N)r<   r=   r?   )r&   r'   r)   r;   	self_attnr}   mlpr#   r@   input_layernormpost_attention_layernormrE   s      r,   r'   zOlmoeDecoderLayer.__init__   sp    +!--'vK&v.+F,>,>FDWDWX(4V5G5GVM`M`(a%r-   )r.   r/   r0   r!   rs   r'   r1   r2   s   @r,   r   r      s    b{ bs b br-   r   c                       e Zd ZU eed<   dZdZdgZdgZdZ	dZ
 eed      eedZ e       ZdZ ej(                         d	        Zy
)OlmoePreTrainedModelr<   modelTr   rI   r   )index)router_logitsrF   
attentionsc                    t        j                  | |       t        |t              rmt	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         y t        |t              r7t	        j
                  |j                  d| j                  j                         y y )NrS   )meanstd)r   _init_weights
isinstancery   initnormal_gate_up_projr<   initializer_range	down_projr{   weight)r(   modules     r,   r   z"OlmoePreTrainedModel._init_weights   s    %%dF3fl+LL,,3DKK<Y<YZLL))9V9VW0LLSdkk6S6ST 1r-   N)r.   r/   r0   r!   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar   r{   r   r;   _can_record_outputsr   _can_compile_fullgraph_supports_attention_backendrt   no_gradr   r6   r-   r,   r   r      sz    &*#,-#4"5N'qA*$ 	 !  #'U]]_U Ur-   r   c                        e Zd Zdef fdZ	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	e
dz  d
ej                  dz  dee   defdZ xZS )
OlmoeModelr<   c           	         t         |   |       t        j                  |j                  |j
                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j
                  |j                        | _        t!        |      | _        y c c}w )Nr?   r<   )r&   r'   r   	Embedding
vocab_sizer)   padding_idxembed_tokens
ModuleListrangenum_hidden_layersr   layersr#   r@   normr4   
rotary_embrE   s      r,   r'   zOlmoeModel.__init__   s     LL):):F<N<NPTP`P`ammCHIaIaCbcivy1c
 !!3!39L9LM	.f= ds   1CN	input_idsrH   position_idsrI   inputs_embeds	use_cacherJ   rK   rL   c                 D   |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||
||||d|} | j                  |      }t        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r    )device)r<   r   rH   rJ   rI   r   )rG   rH   r   rI   r   rJ   )last_hidden_staterI   )
ValueErrorr   r<   r   get_seq_lengthrt   arangerW   r   	unsqueezer
   r   r   r   r   r   )r(   r   rH   r   rI   r   r   rJ   rK   past_seen_tokenscausal_maskrF   rG   decoder_layers                 r,   rr   zOlmoeModel.forward   s`    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 & #oom\J![[)H4;;+H+HI 
	M)	$7*) /#-	 	M
	 		-0%++
 	
r-   )NNNNNNN)r.   r/   r0   r!   r'   rt   rw   ru   r   FloatTensorboolr   r   r   rr   r1   r2   s   @r,   r   r      s    >{ > .2.204(,26!%26;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
 $;;
 ((4/;
 +,;
 
 ;
r-   r   c                   0     e Zd ZddiZ fdZ fdZ xZS )OlmoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                 f    t         |   |       t        |      | _        |j                  | _        y r%   )r&   r'   r   r   num_expertsr   s     r,   r'   zOlmoeForCausalLM.__init__  s*     '
!--r-   c                 "    t        |   di |S )u  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OlmoeForCausalLM

        >>> model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924")
        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'Hey, are you conscious? Can you talk to me?\nI’m not sure if you’re conscious of this, but I’m'
        ```
        r6   )r&   rr   )r(   super_kwargsr+   s     r,   rr   zOlmoeForCausalLM.forward  s    0 w...r-   )r.   r/   r0   _tied_weights_keysr'   rr   r1   r2   s   @r,   r   r      s    *,GH.
/ /r-   r   )r   r   r   )>__doc__collections.abcr   rt   r    r   r   cache_utilsr   r   
generationr	   masking_utilsr
   modeling_outputsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.output_capturingr   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   mixtral.modeling_mixtralr   r   r   qwen2_moe.modeling_qwen2_moer   configuration_olmoer!   
get_loggerr.   loggerr#   r4   r9   r;   ry   r{   Moduler}   r   r   r   r   __all__r6   r-   r,   <module>r      s     $   & . ) / 6 F & Y Y 4 +  X W = , 
		H	%+< +
	/ 		x 	:)^ :)z	> 		( 	#")) # b) b U? U U8 E
 E
 E
P /)?  /F Er-   