
    iZ1                     p   d dl Z ddlmZmZ ddlmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZmZ d	d
lmZ d	dlmZmZmZmZ d	dlmZmZ  ej4                  e      Z G d de      Z G d de      Z G d de      Z G d de      Z  G d de      Z! G d de
      Z" G d de      Z# G d de      Z$g dZ%y)    N   )CacheDynamicCache)layer_type_validation)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPast)Unpack)TransformersKwargslogging   )LlamaConfig)LlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel)Qwen2AttentionQwen2RotaryEmbeddingc            ,            e Zd ZdZdZdZdddddd	d
ddddddg dddddddddfdededededededededed ed!ed"e	d#edz  d$ed%e	d&ed'ed(e	d)e
dz  d*ed+ee   dz  f* fd,Z xZS )-	CwmConfiga  
    Configuration for Code World Model (CWM).
    This is an inherited Llama3-compatible configuration with layer-interleaved
    sliding-window attention. Configures a `CwmModel`. Designed to yield a configuration mirroring the model in the
    [facebook/cwm](https://huggingface.co/facebook/cwm) architecture by default. Other models include:
    - [facebook/cwm-sft](https://huggingface.co/facebook/cwm-sft)
    - [facebook/cwm-pretrain](https://huggingface.co/facebook/cwm-pretrain)

    Args:
        vocab_size (`int`, *optional*, defaults to 128256):
            Vocabulary size of the CWM model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CwmModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations
        num_hidden_layers (`int`, *optional*, defaults to 64):
            Number of hidden layers in the Transformer decoder
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the Transformer decoder
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention (GQA).
            If it is not specified, will default to `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with. CWM's attention allows sequence
            lengths up to 131072 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        eos_token_id (`int` or `list[int]`, *optional*, defaults to `[128001, 128008, 128009]`):
            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
        bos_token_id (`int`, *optional*, defaults to 128000):
            The id of the *beginning-of-sequence* token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Tensor parallelism degree used during pretraining. See [this
            document](https://huggingface.co/docs/transformers/parallelism) and [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        sliding_window (`int`, *optional*, defaults to 8192):
            Sliding window attention window size.
        layer_types (`List[str]`, *optional*):
            List of layer types for each layer. Each element should be either "full_attention" or "sliding_attention".
            If not specified, will default to alternating pattern based on the provided window pattern.
    cwm    .Ai  i   i T  @   0         silui   g{Gz?gh㈵>TN)i i i	 i  Fg               
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dim
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_idtie_word_embeddingsattention_dropoutpretraining_tpmlp_biasrope_parameterssliding_windowlayer_typesc                    |	ddddddd}|%d}t        |      D cg c]  }||z  d	k(  rd
nd }}nt        ||       |rt        |      nd | _        t	        |      | _        t        |   d"i d|d|d|d|d|d|d|d|d|	d|
d|d|d|dt	        |      d|d|ddd|d|d |d!|| | `y c c}w )#Nr   g      0@g      @g      ?r   llama3)
rope_thetafactorhigh_freq_factorlow_freq_factor original_max_position_embeddings	rope_type   r   full_attentionsliding_attentionr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   eos_token_idr-   r.   attention_biasFr/   r2   r0   r1    )	ranger   intr3   listr4   super__init__rA   )selfr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r@   r-   r.   r/   r0   r1   r2   r3   r4   kwargswindow_patterni	__class__s                             _/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/cwm/modular_cwm.pyrG   zCwmConfig.__init__i   s   6 ")$'#&48%O N 01 '(.&8A&=!DWWK 
 "+/@A5Cc.1, 	
!	
#	
 0	
 0		

 !4	
 !4	
 	
 "	
 %<	
 0	
 &	
  	
 &	
 l+	
 &	
  !4!	
" !#	
$ 0%	
& ,'	
( *)	
* -	
4 Is   B>)__name__
__module____qualname____doc__
model_typedefault_thetarD   strfloatbooldictrE   rG   __classcell__rL   s   @rM   r   r   %   s[   >@ JM !!&!##%#$ '-#'"#'-"$)#&'+"(,1L L  L  	L 
 L  !L  !L  L  L  "%L  !L  L  L  DjL   !L " "#L $ !%L & 'L ( )L * +L . /L 0 #Y%1L  L     r   c                       e Zd Zy)CwmRotaryEmbeddingNrN   rO   rP   rB   rZ   rM   r\   r\          rZ   r\   c                   (     e Zd Zdedef fdZ xZS )CwmAttentionconfig	layer_idxc                    t         |   ||       t        j                  j	                  |j
                  |j                  | j                  z  d      | _        t        j                  j	                  |j
                  |j                  | j                  z  d      | _
        t        j                  j	                  |j
                  |j                  | j                  z  d      | _        y )Nra   rb   F)bias)rF   rG   torchnnLinearr!   r$   r&   q_projr%   k_projv_projrH   ra   rb   rL   s      rM   rG   zCwmAttention.__init__   s    )<hhoof&8&8&:T:TW[WdWd:dkpoqhhoof&8&8&:T:TW[WdWd:dkpoqhhoof&8&8&:T:TW[WdWd:dkpoqrZ   rN   rO   rP   r   rD   rG   rX   rY   s   @rM   r`   r`      s    ry rS r rrZ   r`   c                   (     e Zd Zdedef fdZ xZS )CwmDecoderLayerra   rb   c                 t    t         |   ||       |j                  |   | _        t	        ||      | _        y )Nrd   )rF   rG   r4   attention_typer`   	self_attnrl   s      rM   rG   zCwmDecoderLayer.__init__   s6    )<$00;%VyIrZ   rm   rY   s   @rM   ro   ro      s    Jy JS J JrZ   ro   c                       e Zd Zy)CwmPreTrainedModelNr]   rB   rZ   rM   rt   rt      r^   rZ   rt   c                       e Zd Zy)CwmModelOutputWithPastNr]   rB   rZ   rM   rv   rv      r^   rZ   rv   c                        e Zd ZeZdef fdZ	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de	dz  dej                  dz  d	ej                  dz  d
edz  dee   defdZ xZS )CwmModelra   c           	          t         |   |       t        j                  j	                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w )N)	rF   rG   rf   rg   
ModuleListrC   r#   ro   layersrl   s      rM   rG   zCwmModel.__init__   sI     hh))AFvG_G_A`aI_VY/a
as   A!N	input_idsattention_maskposition_idspast_key_valuesinputs_embedscache_positionr+   rI   returnc           
         |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|E||j	                         nd}	t        j                  |j                  d   |j                        |	z   }||j                  d      }t        |x}
t              s:| j                  |||||d}|j                         }t        d
i |t        d
i |d}
|}| j                  ||      }| j                   d | j                  j"                   D ]  } ||f|
|j$                     ||||d|}  | j'                  |      }t)        ||	      S )Nz:You must specify exactly one of input_ids or inputs_embeds)ra   r   r   )device)ra   r   r}   r   r   r~   )r>   r?   )r}   r~   r   r   position_embeddings)last_hidden_stater   rB   )
ValueErrorembed_tokensr   ra   get_seq_lengthrf   arangeshaper   	unsqueeze
isinstancerW   copyr   r   
rotary_embr{   r#   rq   normrv   )rH   r|   r}   r~   r   r   r   r+   rI   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargshidden_statesr   decoder_layers                   rM   forwardzCwmModel.forward   s    -t";<YZZ *.*;*;I*FM0*$++>O!CRC^==?de]003M<P<PQTdd  )33A6L?-F++!."0"0#2 ,K #."2"2"4 #5"C{"C%F%]I\%]#
 &"oom\J![[)H4;;+H+HI 		M)2=3O3OP) /-$7 M		 		-0%++
 	
rZ   )NNNNNNN)rN   rO   rP   r   config_classrG   rf   
LongTensorTensorr   FloatTensorrV   r
   r   rv   r   rX   rY   s   @rM   rx   rx      s    L
y 
 .2.204(,2626!%?
##d*?
 t+?
 &&-	?

 ?
 ((4/?
 ((4/?
 $;?
 +,?
 
 ?
rZ   rx   c                       e Zd Zy)CwmForCausalLMNr]   rB   rZ   rM   r   r     r^   rZ   r   )r   rt   rx   r   )&rf   cache_utilsr   r   configuration_utilsr   masking_utilsr   r   modeling_outputsr	   processing_utilsr
   utilsr   r   llama.configuration_llamar   llama.modeling_llamar   r   r   r   qwen2.modeling_qwen2r   r   
get_loggerrN   loggerr   r\   r`   ro   rt   rv   rx   r   __all__rB   rZ   rM   <module>r      s      . 8 R 7 & 0 3  H 
		H	%P  P f	- 	r> rJ' J	- 		4 	H
z H
V	% 	rZ   