
    i,                     "   d dl Z d dl mZ ddlmZmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ d	d
lmZmZmZmZmZ ddlmZ  ej.                  e      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Zg dZy)    N)nn   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargslogging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                   4     e Zd ZdZddededz  f fdZ xZS )GraniteAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                 H    t         |   ||       |j                  | _        y N)super__init__attention_multiplierscalingselfr   r   	__class__s      g/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/granite/modular_granite.pyr   zGraniteAttention.__init__(   s    +22    r   )__name__
__module____qualname____doc__r   intr   __classcell__r!   s   @r"   r   r   %   s"    G3} 3t 3 3r#   r   c                   f    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
e
dz  dej                  dz  deej                  ej                  f   dz  deej                  eej                  ej                  f   dz  f   fdZ xZS )GraniteDecoderLayerr   r   c                 l    t         |   ||       |j                  | _        t        ||      | _        y )N)r   r   )r   r   residual_multiplierr   	self_attnr   s      r"   r   zGraniteDecoderLayer.__init__.   s.    +#)#=#= )9Mr#   Nhidden_statesattention_maskposition_idspast_key_valuesoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    |}
| j                  |      } | j                  d||||||||d|	\  }}|
|| j                  z  z   }|}
| j                  |      }| j	                  |      }|
|| j                  z  z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r0   r1   r2   r3   r4   r5   r6   r7    )input_layernormr/   r.   post_attention_layernormmlp)r    r0   r1   r2   r3   r4   r5   r6   r7   kwargsresidualself_attn_weightsoutputss                r"   forwardzGraniteDecoderLayer.forward3   s    D !,,]; ,:4>> 
,
')%+/) 3
,
 
,
(( !=43K3K#KK !55mD/ =43K3K#KK ")++Gr#   )NNNFFNN)r$   r%   r&   r   r(   r   torchTensor
LongTensorr   booltupleFloatTensorrB   r)   r*   s   @r"   r,   r,   -   s    N} N N /304(,).!&26HL?||? t+? &&-	?
 ?  $;? $;? ((4/? #5<<#=>E? 
u  %(9(95;L;L(L"MPT"TT	U?r#   r,   c                       e Zd Zy)GranitePreTrainedModelN)r$   r%   r&   r:   r#   r"   rJ   rJ   u   s    r#   rJ   c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	e
dz  d
e
dz  de
dz  dej                  dz  dee   defdZ xZS )GraniteModelr   c           	          t         |   |       |j                  | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w r   )	r   r   embedding_multiplierr   
ModuleListrangenum_hidden_layersr,   layersr   s      r"   r   zGraniteModel.__init__z   sR     $*$?$?!mmEJ6KcKcEde	 3e
es   A(N	input_idsr1   r2   r3   inputs_embedsr5   r4   output_hidden_statesr6   r>   r8   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|| j                  z  }|r|t        | j                         }|	F||j                         nd}t        j                  |||j                  d   z   |j                         }	||	j#                  d      }t%        | j                   |||	||      }|}| j'                  ||	      }|rd
nd }|rd
nd }| j(                  d | j                   j*                   D ],  }|r||fz  } ||f||||||	|d|
}|d   }|s$||d   fz  }. | j-                  |      }|r||fz  }t/        ||r|nd ||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)r   r   r   )device)r   rT   r1   r6   r3   r2   )r2   r:   )r1   r2   r3   r4   r5   r6   r7   )last_hidden_stater3   r0   
attentions)r   r4   rU   r5   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensrN   r   get_seq_lengthrC   arangeshaperW   	unsqueezer   
rotary_embrR   rQ   normr   )r    rS   r1   r2   r3   rT   r5   r4   rU   r6   r>   past_seen_tokenscausal_maskr0   r7   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r"   rB   zGraniteModel.forward   sN    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%(A(AA0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 &"oom,oW #7BD0d![[)H4;;+H+HI 	6M#!m%55!)
*) /"3#-$7
 
M *!,M =#3"55'	6* 		-0  -!11&+/8Od+%	
 	
r#   )	NNNNNNNNN)r$   r%   r&   r   r   rC   rE   rD   r   rH   rF   r
   r   r   rB   r)   r*   s   @r"   rL   rL   y   s    
} 
 .2.204(,26!%)-,026]
##d*]
 t+]
 &&-	]

 ]
 ((4/]
 $;]
  $;]
 #Tk]
 ((4/]
 +,]
 
!]
r#   rL   c                   4   e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej
                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dedz  d	edz  d
edz  dej                  dz  de	ej
                  z  de
e   defdZy)GraniteForCausalLMNrS   r1   r2   r3   rT   labelsr5   r4   rU   r6   logits_to_keepr>   r8   c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d||||||||	|
d	|}|j                  }t        |t              rt        | d       n|}| j                  |d d |d d f         }|| j                   j                  z  }d }|* | j                  d||| j                   j                  d|}t        |||j                  |j                  |j                        S )N)	rS   r1   r2   r3   rT   r5   r4   rU   r6   )logitsrn   
vocab_size)lossrq   r3   r0   rY   r:   )r   r4   rU   modelrX   
isinstancer(   slicelm_headlogits_scalingloss_functionrr   r	   r3   r0   rY   )r    rS   r1   r2   r3   rT   rn   r5   r4   rU   r6   ro   r>   rA   r0   slice_indicesrq   rs   s                     r"   rB   zGraniteForCausalLM.forward   s.    2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A$++444%4%%pVFt{{OeOepiopD%#33!//))
 	
r#   )NNNNNNNNNNr   )r$   r%   r&   rC   rE   rD   r   rH   rF   r(   r
   r   r	   rB   r:   r#   r"   rm   rm      s	    .2.204(,26*.!%)-,026-.2
##d*2
 t+2
 &&-	2

 2
 ((4/2
   4'2
 $;2
  $;2
 #Tk2
 ((4/2
 ell*2
 +,2
 
 2
r#   rm   )rm   rL   rJ   ) rC   r   cache_utilsr   r   masking_utilsr   modeling_outputsr   r	   processing_utilsr
   utilsr   r   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr$   r]   r   r,   rJ   rL   rm   __all__r:   r#   r"   <module>r      s       . / O & 0  1 
		H	%3~ 3E+ EP	1 	e
: e
P3
) 3
l Kr#   