
    i_                        d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3  e"jh                  e5      Z6 G d de-      Z7 G d de.      Z8 G d dejr                        Z: G d dejr                        Z; G d  d!ejr                        Z< G d" d#ejr                        Z= G d$ d%ejr                        Z> G d& d'e)      Z?e  G d( d)e,             Z@e  G d* d+e+             ZA G d, d-e@e      ZB G d. d/ee@      ZCg d0ZDy)1zPyTorch JetMoe model.    )CallableN)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask) GenericForSequenceClassification)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )LlamaDecoderLayer)MixtralModelMixtralPreTrainedModelMixtralRMSNormMixtralRotaryEmbeddingapply_rotary_pos_embeager_attention_forwardload_balancing_loss_func   )JetMoeConfigc                       e Zd Zy)JetMoeRMSNormN__name__
__module____qualname__     e/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/jetmoe/modular_jetmoe.pyr&   r&   4       r,   r&   c                       e Zd Zy)JetMoeRotaryEmbeddingNr'   r+   r,   r-   r0   r0   8   r.   r,   r0   c                   6     e Zd Zdedededdf fdZd Z xZS )JetMoeParallelExpertsnum_experts
input_sizeoutput_sizereturnNc                     t         |           t        j                  t	        j
                  |||            | _        || _        || _        || _	        y)a  
        Initialize the JetMoeParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
super__init__r   	Parametertorchemptyweightr3   r4   r5   )selfr3   r4   r5   	__class__s       r-   r9   zJetMoeParallelExperts.__init__=   sD    " 	ll5;;{K#TU&$&r,   c                     |j                  |d      }g }t        | j                        D ]7  }|j                  t	        j
                  ||   | j                  |                9 t        j                  |d      }|S )a  
        Forward pass of the JetMoeParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   dim)	splitranger3   appendFlinearr=   r;   cat)r>   inputsexpert_size
input_listoutput_listiresultss          r-   forwardzJetMoeParallelExperts.forwardT   sq     \\+1\5
t''( 	HAqxx
1t{{1~FG	H))KQ/r,   r(   r)   r*   intr9   rO   __classcell__r?   s   @r-   r2   r2   <   s)    'C 'S 's 't '.r,   r2   c                   2     e Zd Zdededef fdZd Z xZS )JetMoeTopKGatingr4   r3   top_kc                     t         |           || _        || _        || _        t        j                  ||d      | _        y)a  
        Initialize the top-k gating mechanism.

        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        FbiasN)r8   r9   r3   r4   rV   r   Linearlayer)r>   r4   r3   rV   r?   s       r-   r9   zJetMoeTopKGating.__init__j   s:     	&$
YYz;UC
r,   c                    | j                  |      j                         }|j                  | j                  d      \  }}t	        j
                  |d      j                  |      }t	        j                  |j                  d      | j                  g|j                  |j                        }|j                  d|d      }|j                         j                  d      }|j                         }|j!                         }	|	j#                  d      \  }
}|j%                  | j                  d      }|j!                         }||   }|||||fS )Nr#   rA   r   dtypedevicetrunc)rounding_mode)r[   floattopkrV   r;   softmaxtype_aszerossizer3   r^   r_   scatterlongsumtolistflattensortdiv)r>   hidden_stateslogitstop_k_logitstop_k_indicestop_k_gatesrf   gatesrJ   top_k_experts_index_sorted_expertsbatch_indexbatch_gatess                 r-   rO   zJetMoeTopKGating.forward~   s.   M*002&,kk$**!k&D#mmmLa8@@O a $"2"23;;L;LU`UgUg
 a2jjl&&q) "((* &--/"/"4"4Q"7*..tzz.Q "))+!"67#[+{FRRr,   rP   rS   s   @r-   rU   rU   i   s'    D3 DS D D(Sr,   rU   c                   .     e Zd ZdZdef fdZd Z xZS )	JetMoeMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                 @   t         |           |j                  | _        |j                  | _        t
        |j                     | _        t        j                  j                  t        j                  | j                              | _        t        |j                  | j                  | j                  dz        | _        t        |j                  | j                  | j                        | _        t#        | j                  |j                  |j$                        | _        y )Nr   r4   r3   rV   )r8   r9   hidden_sizer4   intermediate_sizer   activation_function
activationr;   r   r:   r<   rY   r2   num_local_expertsinput_linearoutput_linearrU   num_experts_per_tokrouterr>   r|   r?   s     r-   r9   zJetMoeMoE.__init__   s     ,,!33 !;!;<HH&&u{{4??'CD	1&2J2JDOO]a]m]mpq]qr263K3KTM]M]_c_n_no&00,,
r,   c                 8   |j                         \  }}}|j                  d|      }| j                  |      \  }}}}}	||   }
| j                  |
|      }|j	                  dd      }| j                  |d         |d   z  }| j                  ||      }||dddf   z  }t        j                  ||z  | j                  f|j                  |j                        }|j                  d||      }|j                  ||| j                        }|| j                  z   }|S )a  
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        r   rA   r   r#   Nr]   )rg   reshaper   r   chunkr   r   r;   rf   r4   r^   r_   	index_addviewrY   )r>   layer_inputbszlengthemb_sizerv   rx   ry   rJ   router_logitsexpert_inputsro   chunked_hidden_statesexpert_outputsrf   layer_outputs                   r-   rO   zJetMoeMoE.forward   s%    !, 0 0 2VX!))"h7BF++kBZ?;[-#K0))-E - 3 3A2 3 >(=a(@ADYZ[D\\++M;G'+ag*>>S6\4??;>CWCW`n`u`uvq+~F#((fdooF#dii/r,   )r(   r)   r*   __doc__r$   r9   rO   rR   rS   s   @r-   r{   r{      s    
| 
 r,   r{   c                   :     e Zd ZdZdef fdZd Zd Zd Z xZ	S )	JetMoeMoAz
    A Sparsely gated mixture of attention layer with pairs of query- and output-projections as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    r|   c                 h   t         |           |j                  | _        |j                  | _        |j                  |j                  z  | _        |j                  | _	        t        j                  j                  t        j                  | j
                              | _        t        | j                  | j
                  | j                        | _        t        | j                  | j                  | j
                        | _        t%        | j
                  | j                  | j                        | _        y )Nr~   )r8   r9   r   r3   r   r4   kv_channelsnum_key_value_headsr   rV   r;   r   r:   r<   rY   r2   r   r   rU   r   r   s     r-   r9   zJetMoeMoA.__init__   s    !33 ,,!--0J0JJ//
HH&&u{{4??'CD	1$2B2BDOOUYUeUef243C3CTEUEUW[WfWfg&((**
r,   c                    |j                         \  }}}|j                  d|      }| j                  |      \  }}}}}	||||f}
||   }| j                  ||      }t	        j
                  ||z  | j                  z  | j                  f|j                  |j                        }|j                  d||      }|j                  ||| j                  d      }||	|
fS )z
        Map inputs to attention experts according to routing decision and compute query projection inside each experts.
        r   r]   r   )rg   r   r   r   r;   rf   rV   r   r^   r_   r   r   )r>   r   r   r   r   rw   rx   ry   rJ   r   	topo_infor   r   rf   r   s                  r-   mapzJetMoeMoA.map   s     !, 0 0 2VX!))"h7UYU`U`alUmRk;]);[Q	 $K0**=+F 6\DJJ&(8(89AUAU^l^s^s
 q*>O#((fdjj"E]I55r,   c                    |j                         \  }}}}|j                  d|      }|\  }}}	}
||   }| j                  ||
      }||	dddf   z  }t        j                  ||z  | j
                  f|j                  |j                        }|j                  d||      }|j                  ||| j
                        }|| j                  z   }|S )zu
        Compute output projection inside each attention experts and merge the outputs of different experts.
        r   Nr]   r   )rg   r   r   r;   rf   r4   r^   r_   r   r   rY   )r>   r   r   r   r   kr   rw   rx   ry   rJ   r   r   rf   r   s                  r-   reducezJetMoeMoA.reduce  s     '2&6&6&8#VQ!))"k:FOCk; $$89++M;G (+ag*>> S6\4??;>CWCW`n`u`uvq+~F#((fdooF#dii/r,   c                     t        d      )Nz-This module doesn't support call and forward.)NotImplementedError)r>   r   s     r-   rO   zJetMoeMoA.forward  s    !"QRRr,   )
r(   r)   r*   r   r$   r9   r   r   rO   rR   rS   s   @r-   r   r      s$    
| 
$6.,Sr,   r   c                        e Zd ZdZddededz  f fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  deej                  ej                  dz  eej                     dz  f   fdZ xZS )JetMoeAttentionzH
    Multi-headed attention from 'Attention Is All You Need' paper.
    Nr|   	layer_idxc                 b   t         |           || _        || _        d| _        |-t
        j                  d| j                  j                   d       d| _	        |j                  | _        |j                  | _        |j                  |j                  z  | _        |j                  | _        |j                   | _        |j                  | _        | j$                  dz  | _        t)        |      | _        t,        j.                  j1                  |j2                  | j                  dz  d	      | _        y)
z
        Initialize the JetMoeAttention module.

        Args:
            config:
                Configuration object with model hyperparameters.
            layer_idx:
                Index of the layer in the model.
        TNzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r#   g      r   FrX   )r8   r9   r|   r   	is_causalloggerwarning_oncer?   r(   num_key_value_groupsr   rV   attention_dropoutr   r   kv_projection_sizenum_attention_heads	num_headshead_dimscalingr   expertsr;   r   rZ   r   kv_projr>   r|   r   r?   s      r-   r9   zJetMoeAttention.__init__%  s    	" !8!8 9 :, , %&!//
!'!9!9"("4"4v7Q7Q"Q#)#=#= 33**}}d* (xxv'9'94;R;RUV;V]bcr,   ro   attention_maskposition_embeddingspast_key_valuescache_positionr6   c                    |j                   d d }g |d| j                  }| j                  j                  |      \  }	}
}| j	                  |      j                  dd      \  }}|	j                  |      j                  dd      }	|j                  |      j                  dd      }|j                  |      j                  dd      }|\  }}t        |	|||      \  }	}|'|||d}|j                  ||| j                  |      \  }}t        j                  | j                  j                  t              }|j!                  d| j"                  dd      }|j!                  d| j"                  dd      } || |	|||f| j$                  sdn| j&                  | j(                  d|\  }} |j                  g || j"                  d }| j                  j+                  ||      } |j                  g |d }|||
fS )Nr   r   rA   r#   )sincosr           )dropoutr   )shaper   r   r   r   r   r   	transposer    updater   r   get_interfacer|   _attn_implementationr!   repeatrV   trainingr   r   r   )r>   ro   r   r   r   r   kwargsinput_shapehidden_shapequery_statesr   r   
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                       r-   rO   zJetMoeAttention.forwardF  s    $))#2.88b8$--8151A1A-1P.mY#'<<#>#D#DQB#D#O 
L#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
  &&q$**a;
#**1djj!Q?$7	%
  $}}C$2H2HLL	%
 	%
!\ 'k&&DDTZZDDll))+yA&k&&88R8L-77r,   N)NNNN)r(   r)   r*   r   r$   rQ   r9   r;   Tensor
LongTensorr	   tuplerO   rR   rS   s   @r-   r   r      s    d| dd
 dH /37;(,2628||28 t+28 #--4	28
 28 ((4/28 
u||U\\D0%2E2LL	M28r,   r   c                   *    e Zd Zddededz  f fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )JetMoeDecoderLayerNr|   r   c                     t         |   ||       t        |j                        | _        t        ||      | _        t        |j                        | _        t        |      | _	        | `
y r   )r8   r9   r&   r   input_layernormr   self_attentionpost_attention_layernormr{   mlp	self_attnr   s      r-   r9   zJetMoeDecoderLayer.__init__|  sX    +,V-?-?@-fi@(5f6H6H(I%V$Nr,   ro   r   position_idsr   	use_cacher   r   r   r6   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
}
|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )N)ro   r   r   r   r   r   r   r+   )r   r   r   r   )r>   ro   r   r   r   r   r   r   r   residualrv   s              r-   rO   zJetMoeDecoderLayer.forward  s     !,,];1d11 	
')%+) 3	
 	
q! !=0 !55mD/ =0r,   r   )NNNFNN)r(   r)   r*   r$   rQ   r9   r;   r   r   r	   boolr   r   r   rO   rR   rS   s   @r-   r   r   {  s    | d
  /304(,!&26HL|| t+ &&-	
  $; ((4/ #5<<#=>E +, 
r,   r   c                       e Zd ZU  eed       eed      ge eed      dZee	d<   dZ
dZd	gZd
gZdZdZdZ ej$                         d        Zy)JetMoePreTrainedModelr   )index   r#   )r   ro   
attentionsr|   modelFr   r   Tc                 ,   t        j                  | |       t        |t              r7t	        j
                  |j                  d| j                  j                         yt        |t        t        z        r t	        j                  |j                         yy)zInitialize the weights.r   )meanstdN)r   _init_weights
isinstancer2   initnormal_r=   r|   initializer_ranger   r{   zeros_rY   )r>   modules     r-   r   z#JetMoePreTrainedModel._init_weights  sa     	%%dF3f34LLSdkk6S6ST	I 56KK$ 7r,   N)r(   r)   r*   r   r   rU   r   _can_record_outputsr$   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr;   no_gradr   r+   r,   r-   r   r     s     )BNScklDmn+$_A>
 &+#-.#4"5N"U]]_% %r,   r   c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ej                  dz  dee   defd                     Z xZS )JetMoeModelr|   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        |j                  | _        t        |j                  |j                         | _        y c c}w )N)eps)r8   r9   pad_token_idpadding_idx
vocab_sizer   	Embeddingr   embed_tokens
ModuleListrD   num_hidden_layersr   layersr   r&   rms_norm_epsnormr   s      r-   r9   zJetMoeModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammDI&JbJbDcdy	2d
 %+$?$?!!&"4"4&:M:MN	 es   C*N	input_idsr   r   r   inputs_embedsr   r   r   r6   c                 D   |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||
||||d|} | j                  |      }t        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embeds)r|   r   r#   )r_   )r|   r  r   r   r   r   )r   r   r   r   r   r   )last_hidden_stater   )
ValueErrorr
   r|   r  get_seq_lengthr;   aranger   r_   	unsqueezer   
rotary_embr  r  r	  r   )r>   r
  r   r   r   r  r   r   r   past_seen_tokenscausal_maskro   r   decoder_layers                 r-   rO   zJetMoeModel.forward  s`    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 & #oom\J![[)H4;;+H+HI 
	M)	$7* /#-)	 	M
	 		-0%++
 	
r,   )NNNNNNN)r(   r)   r*   r$   r9   r   r   r   r;   r   r   r	   FloatTensorr   r   r   r   rO   rR   rS   s   @r-   r   r     s    
O| 
O   .2.204(,26!%26;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
 $;;
 ((4/;
 +,;
 
 ;
    ;
r,   r   c                   L    e Zd ZddiZ fdZee	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
ej                  dz  dedz  dej                  dz  deej                  z  dedz  defd              Z xZS )JetMoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                 p   t         |   |       t        |      | _        |j                  | _        |j
                  | _        t        j                  |j                  |j                  d      | _	        |j                  | _
        |j                  | _        |j                  | _        | j                          y )NFrX   )r8   r9   r   r   r  aux_loss_coefr   rZ   r   lm_headtie_word_embeddingsr   r3   r   	post_initr   s     r-   r9   zJetMoeForCausalLM.__init__  s      (
 ++#11yy!3!3V5F5FUS#)#=#= !33#)#=#=  	r,   Nr
  r   r   r   r  labelsr   r   logits_to_keepoutput_router_logitsr6   c                 N    | j                   d||||||||
d|}|j                  }t        |	t              rt	        |	 d       n|	}| j                  |d d |d d f         }d }|* | j                  ||fd| j                  j                  i|}d }|
rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                        z  z  }t!        ||||j"                  |j$                  |j&                  |j                        S )N)r
  r   r   r   r  r   r   r   r  )lossaux_lossrp   r   ro   r   r   r+   )r   r  r   rQ   slicer  loss_functionr|   r  r"   r   r3   r   r  tor_   r   r   ro   r   )r>   r
  r   r   r   r  r  r   r   r  r   r   outputsro   slice_indicesrp   r"  r#  s                     r-   rO   zJetMoeForCausalLM.forward  sT     +5$** 
+
)%+')!5
+
 
+
  118B>SV8W~ot4]kmA}a,?@A%4%%  ;;11 	D /%%  ((	H !**X[[-EEE(#33!//))!//
 	
r,   )
NNNNNNNNr   F)r(   r)   r*   _tied_weights_keysr9   r   r   r;   r   r   r	   r  r   rQ   r   rO   rR   rS   s   @r-   r  r    s   *,GH  .2.204(,26*.!%26-.,1;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
   4';
 $;;
 ((4/;
 ell*;
 #Tk;
 
#;
  ;
r,   r  c                       e Zd Zy)JetMoeForSequenceClassificationNr'   r+   r,   r-   r+  r+  _  s    r,   r+  )r  r   r   r+  )Er   collections.abcr   r;   r   torch.nnr   rF    r   r   activationsr   cache_utilsr	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r   llama.modeling_llamar   mixtral.modeling_mixtralr   r   r   r   r    r!   r"   configuration_jetmoer$   
get_loggerr(   r   r&   r0   Moduler2   rU   r{   r   r   r   r   r   r  r+  __all__r+   r,   r-   <module>r@     sU    $   $ & ! . ) / R F & R R 7 E 4   / 
		H	%	N 		2 	*BII *Z.Sryy .Sb7		 7tIS		 ISXX8bii X8v(* (V %2 % %2 K
, K
 K
\M
- M
` d&FH] c kr,   