
    i%                     0    d dl mZmZ  G d de      ZdgZy)   )PreTrainedConfiglayer_type_validationc            ,       
    e Zd ZdZdZdgZddddddddZdgdgfd	d
gd	gfd	gd	gfdZdZdddddddddddddg dddddddddfd e	d!e	d"e	d#e	d$e	d%e	d&e	d'e
d(e	d)ed*ed+ed,e	dz  d-e	d.ed/ed0e	d1ed2edz  d3e	d4ee
   dz  f* fd5Z xZS )6	CwmConfiga  
    Configuration for Code World Model (CWM).
    This is an inherited Llama3-compatible configuration with layer-interleaved
    sliding-window attention. Configures a `CwmModel`. Designed to yield a configuration mirroring the model in the
    [facebook/cwm](https://huggingface.co/facebook/cwm) architecture by default. Other models include:
    - [facebook/cwm-sft](https://huggingface.co/facebook/cwm-sft)
    - [facebook/cwm-pretrain](https://huggingface.co/facebook/cwm-pretrain)

    Args:
        vocab_size (`int`, *optional*, defaults to 128256):
            Vocabulary size of the CWM model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CwmModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations
        num_hidden_layers (`int`, *optional*, defaults to 64):
            Number of hidden layers in the Transformer decoder
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the Transformer decoder
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention (GQA).
            If it is not specified, will default to `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with. CWM's attention allows sequence
            lengths up to 131072 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        eos_token_id (`int` or `list[int]`, *optional*, defaults to `[128001, 128008, 128009]`):
            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
        bos_token_id (`int`, *optional*, defaults to 128000):
            The id of the *beginning-of-sequence* token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Tensor parallelism degree used during pretraining. See [this
            document](https://huggingface.co/docs/transformers/parallelism) and [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        sliding_window (`int`, *optional*, defaults to 8192):
            Sliding window attention window size.
        layer_types (`List[str]`, *optional*):
            List of layer types for each layer. Each element should be either "full_attention" or "sliding_attention".
            If not specified, will default to alternating pattern based on the provided window pattern.
    cwmpast_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnorm    .Ai  i   i T  @   0         silui   g{Gz?gh㈵>TN)i i i	 i  Fg               
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dim
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_idtie_word_embeddingsattention_dropoutpretraining_tpmlp_biasrope_parameterssliding_windowlayer_typesc                 H   |	ddddddd}|%d}t        |      D cg c]  }||z  d	k(  rd
nd }}nt        ||       |rt        |      nd | _        t	        |      | _        || _        |	| _        || _        || _	        || _
        || _        ||}|| _        || _        |
| _        || _        || _        || _        || _        || _        ||n| j                  | j                  z  | _        || _        || _        || _        || _        || _        t5        | l  di | y c c}w )Nr   g      0@g      @g      ?r   llama3)
rope_thetafactorhigh_freq_factorlow_freq_factor original_max_position_embeddings	rope_type       full_attentionsliding_attention )ranger   intr-   listr.   r   r"   r   r   r   r   r   r!   r#   r$   r*   r%   r)   r+   r    r,   r(   r&   r'   eos_token_idsuper__init__)selfr   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r?   r'   r(   r)   r*   r+   r,   r-   r.   kwargswindow_patterni	__class__s                             e/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/cwm/configuration_cwm.pyrA   zCwmConfig.__init__m   sl   6 ")$'#&48%O N 01 '(.&8A&=!DWWK 
 "+/@A5Cc.1,$'>$&!2!2#6  &"5#6 $!2(,"!2 $,$8d>N>NRVRjRj>j.#6 ((("6"Gs   D)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_plandefault_thetar=   strfloatbooldictr>   rA   __classcell__)rF   s   @rG   r   r      s   >@ J#4"5 &/%.%.%."+ )"+ &(9:#%568IJ!"_$56
  M !!&!##%#$ '-#'"#'-"$)#&'+"(,1K#K# K# 	K#
 K# !K# !K# K# K# "%K# !K# K# K# DjK#  !K#" "#K#$ !%K#& 'K#( )K#* +K#. /K#0 #Y%1K# K#    r   N)configuration_utilsr   r   r   __all__r;   rV   rG   <module>rY      s$   , K_#  _#D -rV   