
import math

import torch
from torch import nn

from ... import initialization as init
from ...cache_utils import Cache, StaticCache
from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from ..gemma.modeling_gemma import GemmaForCausalLM
from ..llama.modeling_llama import (
    LlamaDecoderLayer,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaModel,
    LlamaPreTrainedModel,
    LlamaRotaryEmbedding,
    apply_rotary_pos_emb,
    repeat_kv,
)
from ..mistral.modeling_mistral import MistralMLP
from .configuration_diffllama import DiffLlamaConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut"
_CONFIG_FOR_DOC = "DiffLlamaConfig"


class DiffLlamaMLP(MistralMLP):
    pass


def lambda_init_fn(layer_idx):
    return 0.8 - 0.6 * math.exp(-0.3 * layer_idx)


class DiffLlamaRotaryEmbedding(LlamaRotaryEmbedding):
    pass


class DiffLlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DiffLlamaConfig, layer_idx: int | None = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.is_causal = True

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)

        self.lambda_init = lambda_init_fn(layer_idx)
        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool = False,
        cache_position: torch.LongTensor | None = None,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        bsz, target_len, _ = hidden_states.size()
        q_len = target_len

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Fold the two halves of the heads into a single value tensor of width 2 * head_dim.
        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn_output = torch.matmul(attn_weights, value_states)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, target_len, -1)

        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class DiffLlamaFlashAttention2(DiffLlamaAttention):
    """
    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool = False,
        cache_position: torch.LongTensor | None = None,
        **kwargs,
    ) -> tuple[torch.Tensor, None]:
        if isinstance(past_key_values, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at "
                "https://github.com/huggingface/transformers"
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Flash attention expects the layout (batch, seq_len, num_heads, head_dim).
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # If the inputs were silently upcast to float32 (e.g. by layer norms kept in fp32), cast them back
        # to the expected compute dtype before calling flash attention.
        input_dtype = query_states.dtype
        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_dtype(device_type)
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact "
                f"you have upcasted embedding or layer norm layers in float32. We will cast back the input in "
                f"{target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
        value_states1 = value_states1.repeat(1, 1, 2, 1)
        value_states2 = value_states2.repeat(1, 1, 2, 1)

        attn_output1 = _flash_attention_forward(
            query_states, key_states, value_states1, attention_mask, q_len,
            position_ids=position_ids, dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal,
        )
        attn_output2 = _flash_attention_forward(
            query_states, key_states, value_states2, attention_mask, q_len,
            position_ids=position_ids, dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal,
        )

        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        return attn_output, None


class DiffLlamaSdpaAttention(DiffLlamaAttention):
    """
    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool = False,
        cache_position: torch.LongTensor | None = None,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        is_causal = causal_mask is None and q_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, None


DIFFLLAMA_ATTENTION_CLASSES = {
    "eager": DiffLlamaAttention,
    "flash_attention_2": DiffLlamaFlashAttention2,
    "sdpa": DiffLlamaSdpaAttention,
}


class DiffLlamaDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: DiffLlamaConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)


class DiffLlamaPreTrainedModel(LlamaPreTrainedModel):
    _supports_flex_attn = False
    _supports_attention_backend = False

    @torch.no_grad()
    def _init_weights(self, module):
        PreTrainedModel._init_weights(self, module)
        if isinstance(module, DiffLlamaAttention):
            init.normal_(module.lambda_q1, 0, self.config.lambda_std_dev)
            init.normal_(module.lambda_k1, 0, self.config.lambda_std_dev)
            init.normal_(module.lambda_q2, 0, self.config.lambda_std_dev)
            init.normal_(module.lambda_k2, 0, self.config.lambda_std_dev)


class DiffLlamaModel(LlamaModel):
    pass


class DiffLlamaForCausalLM(GemmaForCausalLM):
    pass


class DiffLlamaForSequenceClassification(LlamaForSequenceClassification):
    pass


class DiffLlamaForQuestionAnswering(LlamaForQuestionAnswering):
    pass


class DiffLlamaForTokenClassification(LlamaForTokenClassification):
    pass


__all__ = [
    "DiffLlamaPreTrainedModel",
    "DiffLlamaModel",
    "DiffLlamaForCausalLM",
    "DiffLlamaForSequenceClassification",
    "DiffLlamaForQuestionAnswering",
    "DiffLlamaForTokenClassification",
]
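

# A minimal usage sketch (illustration only, not part of the upstream module). It assumes the
# `_CHECKPOINT_FOR_DOC` checkpoint above is reachable on the Hub and that the diffllama model type
# is registered with the Auto classes; it then runs a short greedy generation.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT_FOR_DOC)
    model = AutoModelForCausalLM.from_pretrained(_CHECKPOINT_FOR_DOC)

    inputs = tokenizer("Differential attention subtracts two attention maps", return_tensors="pt")
    output_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))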