
    묜i8                        d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZ ddlmZ  ej@                  e!      Z" G d de	      Z# G d de      Z$ G d de      Z% G d de      Z& G d de      Z' G d de      Z( G d de      Z) G d de      Z* G d d e      Z+ G d! d"e      Z,g d#Z-y)$    )CallableN)nn   )ACT2CLS)Cache)PreTrainedConfig)RopeParameters)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging   )
LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)NemotronMLPc            &       R    e Zd ZdZdZdgZdZdddddddZdgd	gfd
dgd
gfd
gd
gfdZdddddddddddddddddddddd dd!fd"e	dz  d#e	dz  d$e	dz  d%e	dz  d&e	dz  d'e	dz  d(e
dz  d)e	dz  d*edz  d+edz  d,edz  d-e	dz  d.e	dz  d/e	dz  d0edz  d1edz  d2edz  d3edz  f$ fd4Z xZS )5ApertusConfiga  
    This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Apertus-8B.
    e.g. [swiss-ai/Apertus-8B](https://huggingface.co/swiss-ai/Apertus-8B)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the Apertus model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ApertusModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"xielu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 65536):
            The maximum sequence length that this model might ever be used with. Apertus supports up to 65536 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 3):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import ApertusModel, ApertusConfig

    >>> # Initializing a Apertus-8B style configuration
    >>> configuration = ApertusConfig()

    >>> # Initializing a model from the Apertus-8B style configuration
    >>> model = ApertusModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```apertuspast_key_valuesg    `fAcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   i   i 8      Nxielui   g{Gz?gh㈵>Tr      r   Fllama3g       @i    g      ?g      @)	rope_type
rope_thetafactor original_max_position_embeddingslow_freq_factorhigh_freq_factor        
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingsrope_parametersattention_biasattention_dropoutc                 (   || _         || _        || _        || _        || _        || _        ||}|| _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        t%        | L  di | y )N )r2   r9   r3   r4   r5   r6   r7   r8   r:   r;   r<   rB   rC   rA   r@   r=   r>   r?   super__init__)selfr2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   kwargs	__class__s                       g/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/apertus/modular_apertus.pyrG   zApertusConfig.__init__   s    : %'>$&!2!2#6  &"5#6 $!2(",!2.#6 ((("6"    )__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencedefault_thetabase_model_tp_planbase_model_pp_planintstrfloatboolr	   rG   __classcell__rJ   s   @rK   r   r   -   s   EN J#4"5M%.%.%.%. )"+ &(9:#%568IJ!"_$56 "("&(-(**,*.!(.3*.%)!%#$#$#$+0!$04" #2
 ',*-55#$J5# 4Z5# :	5#
 :5# !4Z5# !4Z5# $J5# "%t5# !4<5# dl5# $;5# Dj5# Dj5# Dj5#  "D[!5#" ($.#5#2 t35#4 !4<55# 5#rL   r   c                        e Zd Z fdZ xZS )
ApertusMLPc                 D   t         |   |       t        j                  | j                  | j
                  d      | _        t        j                  | j
                  | j                  d      | _        |j                  dk(  rt        d   |j                        | _        y y )NF)biasr(   )dtype)rF   rG   r   Linearr3   r4   up_proj	down_projr8   r   r`   act_fn)rH   configrJ   s     rK   rG   zApertusMLP.__init__   sz     yy!1!143I3IPUV4#9#94;K;KRWX'!'*>DK (rL   )rM   rN   rO   rG   rZ   r[   s   @rK   r]   r]      s    ? ?rL   r]   c                       e Zd Zy)ApertusRMSNormNrM   rN   rO   rE   rL   rK   rg   rg          rL   rg   c                       e Zd Zy)ApertusRotaryEmbeddingNrh   rE   rL   rK   rk   rk      ri   rL   rk   c                       e Zd Zddededz  f fdZ	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  f   fdZ xZS )ApertusAttentionNre   	layer_idxc                     t         |   ||       t        | j                  |j                        | _        t        | j                  |j                        | _        y N)rF   rG   rg   head_dimr;   q_normk_normrH   re   rn   rJ   s      rK   rG   zApertusAttention.__init__   sB    +$T]]F4G4GH$T]]F4G4GHrL   r"   position_embeddingsr#   r   cache_positionrI   returnc                 \   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  sdn| j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )Nr)   r   )sincosrv   r1   )dropoutscaling)shaperq   q_projview	transposek_projv_projrr   rs   r   updatern   r
   get_interfacere   _attn_implementationr   trainingrC   r}   reshape
contiguouso_proj)rH   r"   ru   r#   r   rv   rI   input_shapehidden_shapequery_states
key_statesvalue_statesr{   rz   cache_kwargsattention_interfaceattn_outputattn_weightss                     rK   forwardzApertusAttention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((rL   rp   )NN)rM   rN   rO   r   rV   rG   torchTensortupler   
LongTensorr   r   r   rZ   r[   s   @rK   rm   rm      s    I} It I )-26*)||*) #5<<#=>*) t+	*)
 *) ((4/*) +,*) 
u||U\\)	**)rL   rm   c                   (    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  deej                  ej                  f   dz  dee   deej                     fdZ xZS )ApertusDecoderLayerre   rn   c                     t         |   ||       t        |j                  |j                        | _        t        |j                  |j                        | _        | `| `y )N)eps)	rF   rG   rg   r3   r;   attention_layernormfeedforward_layernorminput_layernormpost_attention_layernormrt   s      rK   rG   zApertusDecoderLayer.__init__  sT    +#1&2D2D&J]J]#^ %3F4F4FFL_L_%`" )rL   Nr"   r#   position_idsr   r<   rv   ru   rI   rw   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )N)r"   r#   r   r   r<   rv   ru   rE   )r   	self_attnr   mlp)rH   r"   r#   r   r   r<   rv   ru   rI   residual_s              rK   r   zApertusDecoderLayer.forward  s     !00?)4>> 	
')%+) 3	
 	
q !=0 !22=A/ =0rL   )NNNFNN)rM   rN   rO   r   rV   rG   r   r   r   r   rY   r   r   r   r   rZ   r[   s   @rK   r   r     s    *} * * /304(,!&26HL|| t+ &&-	
  $; ((4/ #5<<#=>E +, 
u||	rL   r   c                       e Zd Zy)ApertusPreTrainedModelNrh   rE   rL   rK   r   r   ,  ri   rL   r   c                       e Zd Zy)ApertusModelNrh   rE   rL   rK   r   r   0  ri   rL   r   c                        e Zd Z fdZ xZS )ApertusForCausalLMc                 "    t        |   di |S )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ApertusForCausalLM

        >>> model = ApertusForCausalLM.from_pretrained("swiss-ai/Apertus-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("swiss-ai/Apertus-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```rE   )rF   r   )rH   super_kwargsrJ   s     rK   r   zApertusForCausalLM.forward5  s    . w...rL   )rM   rN   rO   r   rZ   r[   s   @rK   r   r   4  s    / /rL   r   c                       e Zd Zy)ApertusForTokenClassificationNrh   rE   rL   rK   r   r   O  ri   rL   r   )r   r   r   r   r   ).collections.abcr   r   r   activationsr   cache_utilsr   configuration_utilsr   modeling_rope_utilsr	   modeling_utilsr
   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   r   r   r   r   r   nemotron.modeling_nemotronr   
get_loggerrM   loggerr   r]   rg   rk   rm   r   r   r   r   r   __all__rE   rL   rK   <module>r      s    %   "   3 1 5 & 0   5 
		H	%N#$ N#b? ?	\ 		1 	0)~ 0)f'+ 'T	1 		: 	/) /6	$? 	rL   