
    i1                        d dl mZ d dlmZ d dlZd dlmZ ddlmZm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZmZm Z m!Z! ddl"m#Z#  ejH                  e%      Z&dZ'dZ( G d de      Z) G d de      Z* G d de      Z+ G d de      Z, G d de      Z- G d de      Z. G d d e      Z/ G d! d"e      Z0g d#Z1y)$    )Callable)OptionalN   )CacheDynamicCache)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging   )CLIPMLP)LlamaAttentionLlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )	PhiConfigzmicrosoft/phi-1r   c                   R    e Zd Ze	 	 	 d	dedz  ded   dedz  dedef   fd       Z	y)
PhiRotaryEmbeddingNconfigdeviceztorch.deviceseq_lenreturnztorch.Tensorc                 n   | j                   d   }| j                   j                  dd      }t        | dd      xs | j                  | j                  z  }t        ||z        }d}d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   r   )dtype)r   r%   )rope_parametersgetgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r   r   r   baser#   r$   dimattention_factorinv_freqs	            _/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/phi/modular_phi.pycompute_default_rope_parametersz2PhiRotaryEmbedding.compute_default_rope_parameters%   s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(223 U\\!S!5;;?BB&X]XcXcBdgjjk
 )))    )NNN)
__name__
__module____qualname__staticmethodr   r   r+   tupler0   r6    r7   r5   r   r   $   sY    #'+/"*D *(* t* 
~u$	%	* *r7   r   c                       e Zd Zdedef fdZ	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
eej                  ej                  dz  f   fdZ xZS )PhiAttentionr   	layer_idxc                    t         |   ||       t        j                  |j                  |j
                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _	        t        j                  |j                  |j                  | j                  z  d      | _
        t        j                  |j
                  | j                  z  |j                  d      | _        | `t        | j                  |j                  d   z        | _        |j                   | _        | j                   r}t        j"                  |j                  |j
                  z  |j$                  d      | _        t        j"                  |j                  |j
                  z  |j$                  d      | _        y y )NTbiasr#   )epselementwise_affine)super__init__nnLinearr)   r*   r$   q_projnum_key_value_headsk_projv_projdenseo_projr+   r&   rotary_ndimsqk_layernorm	LayerNormlayer_norm_epsq_layernormk_layernormselfr   r@   	__class__s      r5   rG   zPhiAttention.__init__G   sh   +ii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^eijYYv99DMMI6K]K]dhi
K0F0FG^0_ _`"//!||""f&@&@@fF[F[pt D  "||""f&@&@@fF[F[pt D	 r7   Nhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionr    c                 p   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  r"| j                  |	      }	| j                  |
      }
|\  }}|	dd | j                  f   |	d| j                  d f   }}|
dd | j                  f   |
d| j                  d f   }}t        ||||      \  }}t        j                  ||fd      }	t        j                  ||fd      }
|'|||d}|j                  |
|| j                  |      \  }
}t!        j"                  | j$                  j&                  t(              } || |	|
||f| j*                  sdn| j,                  | j.                  d|\  }} |j0                  g |d j3                         }| j5                  |      }||fS )	Nr   r   .)r2   )sincosr]   g        )dropoutscaling)shaper$   rJ   view	transposerL   rM   rQ   rT   rU   rP   r   r,   catupdater@   r   get_interfacer   _attn_implementationr   trainingattention_dropoutrc   reshape
contiguousrN   )rW   rY   rZ   r[   r\   r]   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesra   r`   	query_rot
query_passkey_rotkey_passcache_kwargsattention_interfaceattn_outputattn_weightss                         r5   forwardzPhiAttention.forwardX   se    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST++L9L))*5J&S 1 1 1112d//112 	
 s/d////0sD--//0 
 2)Wc3O	7 yy)Z!8bAYY2;
&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHjj-L((r7   )NN)r8   r9   r:   r   r+   rG   r,   Tensorr<   r   
LongTensorr}   __classcell__rX   s   @r5   r?   r?   F   s    y S , )-26;)||;) #5<<#=>;) t+	;)
 ;) ((4/;) 
u||U\\D00	1;)r7   r?   c                       e Zd Zy)PhiMLPNr8   r9   r:   r=   r7   r5   r   r          r7   r   c                   f    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
e
dz  dej                  dz  deej                  ej                  f   dz  deej                  eej                  ej                  f   dz  f   fdZ xZS )PhiDecoderLayerr   r@   c                    t         |           t        ||      | _        t	        |      | _        t        j                  |j                  |j                        | _
        t        j                  |j                        | _        y )N)r@   rD   )rF   rG   r?   	self_attnr   mlprH   rR   r)   rS   input_layernormDropoutresid_pdropresid_dropoutrV   s      r5   rG   zPhiDecoderLayer.__init__   s]    %f	B&>!||F,>,>FDYDYZZZ(:(:;r7   NrY   r[   position_idsr\   output_attentions	use_cacher]   rZ   r    c	                     |}
| j                  |      } | j                  d||||||||d|	\  }}| j                  |      }| j                  | j                  |            }||z   |
z   }|f}|r||fz  }|S )N)rY   r[   r   r\   r   r   r]   rZ   r=   )r   r   r   r   )rW   rY   r[   r   r\   r   r   r]   rZ   ro   residualattn_outputsself_attn_weightsfeed_forward_hidden_statesoutputss                  r5   r}   zPhiDecoderLayer.forward   s     !,,]; +9$.. 
+
')%+/) 3
+
 
+
'' )),7%)%7%78O%P"$'AAHL ")++Gr7   )NNNFFNN)r8   r9   r:   r   r+   rG   r,   r~   r   r   boolr<   FloatTensorr}   r   r   s   @r5   r   r      s    <y <S < /304(,).!&26HL%||% t+% &&-	%
 %  $;% $;% ((4/% #5<<#=>E% 
u  %(9(95;L;L(L"MPT"TT	U%r7   r   c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	e
dz  d
e
dz  de
dz  dej                  dz  dee   defdZ xZS )PhiModelr   c           	      d   t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        | `y c c}w )Nr   )rF   rG   rH   
ModuleListrangenum_hidden_layersr   layersr   
embd_pdropembed_dropoutrR   r)   rS   final_layernormnormrV   s      r5   rG   zPhiModel.__init__   s     mmAFvG_G_A`aI_VY/a
  ZZ(9(9:!||F,>,>FDYDYZI	 bs   B-N	input_idsr[   r   r\   inputs_embedsr   r   output_hidden_statesr]   ro   r    c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r|t        | j                         }|	F||j                         nd}t        j                  |||j                  d   z   |j                        }	||	j!                  d      }t#        | j                   |||	||      }| j%                  |      }|}| j'                  ||	      }|rd
nd }|rd
nd }| j(                  d | j                   j*                   D ],  }|r||fz  } ||f||||||	|d|
}|d   }|s$||d   fz  }. | j-                  |      }|r||fz  }t/        ||r|nd ||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)r   r   r   )r   )r   r   r[   r]   r\   r   )r   r=   )r[   r   r\   r   r   r]   rZ   )last_hidden_stater\   rY   
attentions)r   r   r   r   
ValueErrorgradient_checkpointingrk   loggerwarning_onceembed_tokensr   get_seq_lengthr,   r-   rd   r   	unsqueezer   r   
rotary_embr   r   r   r
   )rW   r   r[   r   r\   r   r   r   r   r]   ro   past_seen_tokenscausal_maskrY   rZ   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r5   r}   zPhiModel.forward   sP    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 **=9%"oom,oW #7BD0d![[)H4;;+H+HI 	6M#!m%55!)
*) /"3#-$7
 
M *!,M =#3"55'	6* ,,];  -!11&+/8Od+%	
 	
r7   )	NNNNNNNNN)r8   r9   r:   r   rG   r,   r   r~   r   r   r   r   r   r
   r}   r   r   s   @r5   r   r      s    y  .2.204(,26!%)-,026\
##d*\
 t+\
 &&-	\

 \
 ((4/\
 $;\
  $;\
 #Tk\
 ((4/\
 +,\
 
!\
r7   r   c                        e Zd Z fdZ xZS )PhiForCausalLMc                     t         |   |       t        j                  |j                  |j
                  d      | _        y )NTrB   )rF   rG   rH   rI   r)   
vocab_sizelm_head)rW   r   rX   s     r5   rG   zPhiForCausalLM.__init__4  s0     yy!3!3V5F5FTRr7   )r8   r9   r:   rG   r   r   s   @r5   r   r   3  s    S Sr7   r   c                       e Zd Zy)PhiForSequenceClassificationNr   r=   r7   r5   r   r   9  r   r7   r   c                       e Zd Zy)PhiForTokenClassificationNr   r=   r7   r5   r   r   =  r   r7   r   )PhiPreTrainedModelr   r   r   r   )2collections.abcr   typingr   r,   torch.nnrH   cache_utilsr   r   masking_utilsr   modeling_layersr	   modeling_outputsr
   modeling_utilsr   processing_utilsr   utilsr   r   clip.modeling_clipr   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_phir   
get_loggerr8   r   _CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   r?   r   r   r   r   r   r   __all__r=   r7   r5   <module>r      s    $    . / 9 6 & 0 (	 	 	 ) 
		H	%' *- *DM)> M)`	W 	-0 -`f
z f
RS% S	#A 		 ; 	r7   