
    ꬜iG                     h   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,  ejZ                  e.      Z/ G d de&      Z0 G d de"      Z1 G d de*      Z2 G d dejf                        Z4 G d d ejj                        Z6 G d! d"ejf                        Z7 G d# d$e$      Z8 G d% d&e      Z9 G d' d(e      Z:e G d) d*e:             Z; G d+ d,e%e:e      Z<g d-Z=y).zPyTorch AFMoE model.    )CallableN)nn   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)MoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)capture_outputs   )GptOssRMSNorm)LlamaAttentionLlamaForCausalLMLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Qwen2MoeMLP   )AfmoeConfigc                       e Zd Zy)AfmoeRotaryEmbeddingN__name__
__module____qualname__     c/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/afmoe/modular_afmoe.pyr!   r!   /       r'   r!   c                       e Zd Zy)AfmoeRMSNormNr"   r&   r'   r(   r+   r+   3   r)   r'   r+   c                       e Zd Zy)AfmoeMLPNr"   r&   r'   r(   r-   r-   7   r)   r'   r-   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )AfmoeTokenChoiceRouterz
    Token-choice top-K router for MoE routing.

    This router assigns each token to the top-K experts based on sigmoid scores, matching the released checkpoints.
    c                     t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  |j
                  d      | _
        y NFbias)super__init__confignum_experts_per_toktop_knum_expertsroute_scaler   Linearhidden_sizegateselfr6   	__class__s     r(   r5   zAfmoeTokenChoiceRouter.__init__B   s^    //
!--!--IIf00&2D2D5Q	r'   hidden_statesexpert_biasc                    |j                   \  }}}|j                  d|      }t        j                  | j	                  |      j                  t        j                              }t        j                  ||z   | j                  d      \  }}|j                  d|      }|j                  dd      dz   }||z  }|| j                  z  }||fS )Nr   )kdim)rF   indexT)rF   keepdimg#B;)shapeviewtorchsigmoidr=   tofloat32topkr8   gathersumr:   )	r?   rA   rB   _
hidden_dimscoresselected_experts
top_scoresdenominators	            r(   forwardzAfmoeTokenChoiceRouter.forwardJ   s    (..1j%**2z:tyy7::5==IJ#jj+)=QRS]]q0@]A
 nnTn:UB+-
$"2"22
+++r'   )	r#   r$   r%   __doc__r5   rK   TensorrX   __classcell__r@   s   @r(   r/   r/   ;   s)    R,U\\ , ,r'   r/   c                        e Zd ZdZdef fdZdej                  dej                  dej                  dej                  fdZ xZ	S )	AfmoeExpertsz
    Container holding the routed experts.

    This mirrors the Experts pattern used across other MoE models to ease checkpoint conversion.
    r6   c                     t         |           |j                  | _        |j                  | _        t        | j                        D ](  }| j                  t        ||j                               * y )N)intermediate_size)	r4   r5   r7   r8   r9   rangeappendr-   moe_intermediate_size)r?   r6   rR   r@   s      r(   r5   zAfmoeExperts.__init___   s^    //
!--t''( 	ZAKK6;W;WXY	Zr'   rA   rU   routing_weightsreturnc                 0   |j                   \  }}}|dk(  r|j                  |d|      S |j                  d|      }|j                   d   }t        j                  |j                   d   |j
                  t        j                        j                  |      }	|j                  d      }
|j                  d      }t        j                  |
d      }|	|   }	|
|   }
||   }|j                  d|	      }t        j                  |      }t        j                  |
d      \  }}d}t        |j                         |j                               D ]'  \  }}|dk(  r||z   }||| } | |   |      }|||| |}) |j                  t        j                         |j#                  d      z  j                  |j$                        }t        j                  |      }|	j#                  d      j'                  |      }|j)                  d||       |j                  |||      S )z
        Args:
            hidden_states: (batch, seq, hidden)
            selected_experts: (batch, seq, top_k)
            routing_weights: (batch, seq, top_k)
        r   rD   )devicedtypeT)stable)return_counts)rI   	new_zerosrJ   rK   arangerg   longrepeat_interleavereshapeargsortindex_select
zeros_likeunique_consecutiveziptolistrM   rN   	unsqueezerh   	expand_asscatter_add_)r?   rA   rU   rd   
batch_sizeseq_lenrS   hidden_states_flatr8   token_indicesexpert_indicessortingdispatched_tokensexpert_outputsunique_expertscountsstart	expert_idcountendexpert_inputexpert_outputweighted_outputs
aggregatedscatter_indicess                            r(   rX   zAfmoeExperts.forwardf   s    +8*=*='
GZa< **:q*EE*//J? &&r* $$Q'0D0DEJJ


E
" 	 *11"5)11"5--t<%g.'0)'2.;;A}M))*;<!&!9!9.X\!] #N$9$9$;V]]_ M 	Iuz%-C,U37L+DOL9M(5N5%E	 +--emm<?X?XY[?\\``anatatu%%&89
'11"5??@PQ?4DEz7J??r'   )
r#   r$   r%   rY   r   r5   rK   rZ   rX   r[   r\   s   @r(   r^   r^   X   sR    Z{ Z-@"\\-@=B\\-@\a\h\h-@	-@r'   r^   c                   (     e Zd ZdZ fdZd Z xZS )AfmoeMoEz
    Mixture of Experts (MoE) module for AFMoE.

    This module implements a sparse MoE layer with both shared experts (always active) and
    routed experts (activated based on token-choice routing).
    c                 2   t         |           || _        t        |      | _        t        ||j                  |j                  z        | _        t        |      | _
        t        j                  t        j                  |j                        d      | _        y )NF)requires_grad)r4   r5   r6   r/   routerr-   rc   num_shared_expertsshared_expertsr^   expertsr   	ParameterrK   zerosr9   rB   r>   s     r(   r5   zAfmoeMoE.__init__   sp    ,V4&vv/K/KfNgNg/gh#F+<<F4F4F(GW\]r'   c                    |j                   \  }}}|j                  d|      }| j                  || j                        \  }}|j                  ||| j                  j
                        }|j                  ||| j                  j
                        }| j                  |      j                  |||      }| j                  |||      }	||	z   S )NrD   )rI   rJ   r   rB   r6   r7   r   r   )
r?   rA   ry   rz   rS   r{   rV   rU   shared_outputrouted_outputs
             r(   rX   zAfmoeMoE.forward   s    *7*=*='
GZ*//J? (,{{=$BRBR'S$
$__Z$++:Y:YZ
+00WdkkFeFef ++,>?DDZQXZde]4DjQ},,r'   )r#   r$   r%   rY   r5   rX   r[   r\   s   @r(   r   r      s    ^-r'   r   c                       e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  f   fdZ xZS )AfmoeAttentionaJ  
    Multi-headed attention module with optional sliding window and gating.

    This attention mechanism supports both full attention and sliding window attention,
    and includes Q/K normalization and gating of the output. It inherits from [`LlamaAttention`] to minimize the amount
    of custom logic we need to maintain.
    r6   	layer_idxc                    t         |   ||       |j                  |   dk(  | _        | j                  r|j                  nd | _        t        | j                  |j                        | _        t        | j                  |j                        | _	        t        j                  |j                  |j                  | j                  z  d      | _        y )Nsliding_attentionepsFr2   )r4   r5   layer_typesis_local_attentionsliding_windowr+   head_dimrms_norm_epsq_normk_normr   r;   r<   num_attention_heads	gate_projr?   r6   r   r@   s      r(   r5   zAfmoeAttention.__init__   s    + #)"4"4Y"?CV"V7;7N7Nf33TX"4==f6I6IJ"4==f6I6IJ6#5#5v7Q7QTXTaTa7ahmnr'   NrA   position_embeddingsattention_maskpast_key_valuecache_positionkwargsre   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      }	| j	                  |      j                  |      }
| j                  |      j                  |      }| j                  |      }| j                  |	      j                  dd      }	| j                  |
      j                  dd      }
|j                  dd      }| j                  r|\  }}t        |	|
||      \  }	}
|%d|i}|j                  |
|| j                  |      \  }
}t        j                  | j                   j"                  t$              } || |	|
|f|| j&                  sdn| j(                  | j*                  | j,                  d|\  }} |j                  g |d j/                         }|t1        j2                  |      z  }| j5                  |      }||fS )NrD   r   r   r   g        )r   dropoutscalingr   )rI   r   q_projrJ   k_projv_projr   r   	transposer   r   r   updater   r   get_interfacer6   _attn_implementationr   trainingattention_dropoutr   r   
contiguousrK   rL   o_proj)r?   rA   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesgate_statescossincache_kwargsattention_interfaceoutputattn_weightsattn_outputs                       r(   rX   zAfmoeAttention.forward   s    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|Dnn]3{{<0::1a@[[,66q!<
#--a3""*HC';L*VY[^'_$L*%,n=L'5'<'<ZW[WeWegs't$J(?(M(MKK,,.E)
  3	
 

 *#}}C$2H2HLL..
 
 
 
 .k.2.99;%--44kk&)L((r'   )NN)r#   r$   r%   rY   r   intr5   rK   rZ   tupler   
LongTensorr   r   rX   r[   r\   s   @r(   r   r      s    	o{ 	os 	o  (,260)||0) #5<<#=>0) t+	0)
 0) ((4/0) +,0) 
u||U\\)	*0)r'   r   c                   &    e Zd ZdZdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d	e
dz  d
edz  dej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )AfmoeDecoderLayerz
    AFMoE decoder layer with dual normalization.

    This layer applies self-attention followed by either a dense MLP or MoE block,
    with dual normalization (pre and post) around each component.
    r6   r   c                 P   t         |           |j                  | _        || _        t	        ||      | _        |j                  |   | _        t        |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        ||j                  k\  | _        | j                  rt!        |      | _        y t%        |      | _        y )N)r6   r   r   )r4   r5   r<   r   r   	self_attnr   attention_typer+   r   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernormnum_dense_layersmoe_enabledr   mlpr-   r   s      r(   r5   zAfmoeDecoderLayer.__init__  s    !--"'vK$00;  ,F,>,>FDWDWX(4V5G5GVM`M`(a% ".f.@.@fFYFY!Z".v/A/AvGZGZ"[ %(?(??'DH'DHr'   NrA   r   position_idsr   	use_cacher   r   r   re   c                    |}	| j                  |      } | j                  d|||||||d|\  }}
| j                  |      }|	|z   }|}	| j                  |      }| j	                  |      }| j                  |      }|	|z   }|S )N)rA   r   r   r   r   r   r   r&   )r   r   r   r   r   r   )r?   rA   r   r   r   r   r   r   r   residualrR   s              r(   rX   zAfmoeDecoderLayer.forward  s     ! ,,];)4>> 	
')%)) 3	
 	
q 55mD =0 !..}=///> =0r'   )NNNNNN)r#   r$   r%   rY   r   r   r5   rK   rZ   r   r   boolr   r   r   FloatTensorrX   r[   r\   s   @r(   r   r      s    ({ (s (4 /304'+!%26HL#||# t+# &&-	#
 # $;# ((4/# #5<<#=>E# +,# 
		#r'   r   c                   d     e Zd ZU dZeed<   dZdgZdgZe	e
dZg dZdZdZdZdZdZ fd	Z xZS )
AfmoePreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r6   modelr   past_key_values)rA   
attentions)r   r   r   r   r   r   normrB   Tc                     t         |   |       t        |t              r*t	        j
                  |j                  j                         yt        |t              r t	        j
                  |j                         yy)zInitialize the weightsN)
r4   _init_weights
isinstancer/   initzeros_r=   weightr   rB   )r?   moduler@   s     r(   r   z"AfmoePreTrainedModel._init_weights_  sR    f%f45KK**+)KK**+ *r'   )r#   r$   r%   rY   r   __annotations__base_model_prefix_no_split_modules_skip_keys_device_placementr   r   _can_record_outputs_keep_in_fp32_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendsupports_gradient_checkpointingr   r[   r\   s   @r(   r   r   A  sg    
 ,-#4"5*$	 N"&&*#, ,r'   r   c                       e Zd ZdZdef fdZeee	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  d	edz  d
e	j                  dz  dedz  dee   deez  fd                     Z xZS )
AfmoeModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AfmoeDecoderLayer`]

    Args:
        config: AfmoeConfig
    r6   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr   r6   F)r4   r5   pad_token_idpadding_idx
vocab_sizer   	Embeddingr<   embed_tokens
ModuleListra   num_hidden_layersr   layersr+   r   r   r!   
rotary_embgradient_checkpointing	post_initr   s      r(   r5   zAfmoeModel.__init__q  s     !.. ++LL):):F<N<NPTP`P`ammCHIaIaCbcivy1c
 !!3!39L9LM	.f=&+# ds   DN	input_idsr   inputs_embedsr   r   r   r   r   re   c                    |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              s)| j                  ||||d}t        di |t        di |d}
|}| j                  j                  r|| j                  j                  dz  z  }| j!                  ||      }| j"                  D ]  } ||f|
|j$                     |||||d	|}! | j'                  |      }t)        ||r|
      S d 
      S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )rg   )r6   r  r   r   r   )full_attentionr   g      ?)r   r   r   r   r   r   )last_hidden_stater   r&   )
ValueErrorr   r6   r   get_seq_lengthrK   rl   rI   rg   rv   r   dictr
   r   mup_enabledr<   r  r  r   r   r   )r?   r  r   r  r   r   r   r   r   past_seen_tokenscausal_mask_mappingmask_kwargsrA   r   decoder_layers                  r(   rX   zAfmoeModel.forward  s    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\  =#6#6q#99$++N
 )33A6L ?-F++!."0"0#2K #5"C{"C%F%U%U#
 & ;;"")T[[-D-Dc-IJM"oom\J![[ 
	M)	2=3O3OP).#-$7	 	M
	 		-0%+/8O
 	
>B
 	
r'   )NNNNNNN)r#   r$   r%   rY   r   r5   r   r   r   rK   r   rZ   r   r   r   r   r   r   r   rX   r[   r\   s   @r(   r   r   h  s    {   .2.22604(,26!%D
##d*D
 t+D
 ((4/	D

 &&-D
 D
 ((4/D
 $;D
 +,D
 
'	'D
    D
r'   r   c                       e Zd Zd Zy)AfmoeForCausalLMc                     t         j                  | |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r1   )
r   r5   r   r   r   r   r;   r<   lm_headr  )r?   r6   s     r(   r5   zAfmoeForCausalLM.__init__  sU    %%dF3'
 ++yy!3!3V5F5FUSr'   N)r#   r$   r%   r5   r&   r'   r(   r  r    s    r'   r  )r  r   r   )>rY   collections.abcr   rK   r    r   r   cache_utilsr   r   
generationr	   masking_utilsr
   r   modeling_layersr   modeling_outputsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   gpt_oss.modeling_gpt_ossr   llama.modeling_llamar   r   r   r   r   qwen2_moe.modeling_qwen2_moer   configuration_afmoer   
get_loggerr#   loggerr!   r+   r-   Moduler/   r   r^   r   r   r   r   r   r  __all__r&   r'   r(   <module>r+     s    $   & . ) R 9 6 F & @ @ 7 5 4  7 , 
		H	%	/ 		= 		{ 	,RYY ,:;@2== ;@|-ryy ->D)^ D)NB2 BJ$,? $,N ^
% ^
 ^
B')= r'   