
    謜i*                        d dl Z d dlZd dlmZ d dlmZ d dlZej                  j                  j                  ej                  j                  j                  ej                  j                  j                  ej                  j                  j                  ej                  j                  j                  ej                  j                  j                  ej                  j                  j                  ej                  j                  j                   ej                  j                  j"                  ej                  j                  j$                  ej                  j                  j&                  ej                  j                  j(                  ej                  j                  j*                  ej                  j                  j,                  dZ	 d)dej0                  dededej4                  dz  d	ej0                  f
d
Z		 d)dej0                  dededej4                  dz  d	ej0                  f
dZ
dej0                  ded	ej0                  fdZdej0                  d	ej0                  fdZdej0                  d	ej0                  fdZdej0                  d	ej0                  fdZd*dej0                  ded	ej0                  fdZd+dej0                  dedej4                  dz  d	ej0                  fdZd+dej0                  dedej4                  dz  d	ej0                  fdZ	 	 	 	 d,dej0                  dedededej4                  dz  d	ej0                  fdZ	 	 	 	 d,dej0                  dedededej4                  dz  d	ej0                  fdZ	 	 	 	 	 d-dej0                  dededededej4                  dz  d	ej0                  fdZ	 	 d.dej0                  dedej4                  dz  d	ej0                  fdZ	 d/dej0                  dededej4                  dz  d	ej0                  f
dZdej0                  d ej0                  d	ej0                  fd!Zd0d"Zd# Zd$ Z d%Z!ed&        Z"ed'        Z#ed(        Z$y)1    N)defaultdict)contextmanager)uniform_normal_	constant_ones_zeros_eye_dirac_xavier_uniform_xavier_normal_kaiming_uniform_kaiming_normal_trunc_normal_orthogonal_sparse_tensorab	generatorreturnc                 D    t        | dd      st        d   | |||      S | S )N_is_hf_initializedFr   )r   r   r   getattrTORCH_INIT_FUNCTIONS)r   r   r   r   s       W/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/initialization.pyr   r   *   s,     6/7#J/!qIVVM    meanstdc                 D    t        | dd      st        d   | |||      S | S )Nr   Fr   )r   r    r   r   )r   r   r    r   s       r   r   r   2   s-     6/7#I.vDcU^__Mr   valc                 @    t        | dd      st        d   | |      S | S )Nr   Fr   )r"   r   )r   r"   s     r   r   r   :   s&    6/7#K0SAAMr   c                 <    t        | dd      st        d   |       S | S )Nr   Fr   r   r   s    r   r   r   @   s$    6/7#G,V44Mr   c                 <    t        | dd      st        d   |       S | S )Nr   Fr	   r   r%   s    r   r	   r	   F   s$    6/7#H-f55Mr   c                 <    t        | dd      st        d   |       S | S )Nr   Fr
   r   r%   s    r   r
   r
   L   s$    6/7#F+F33Mr   groupsc                 @    t        | dd      st        d   | |      S | S )Nr   Fr   )r(   r   )r   r(   s     r   r   r   R   s&    6/7#H-fVDDMr   gainc                 B    t        | dd      st        d   | ||      S | S )Nr   Fr   r*   r   r   r   r*   r   s      r   r   r   X   s*    6/7#$56vDT]^^Mr   c                 B    t        | dd      st        d   | ||      S | S )Nr   Fr   r,   r   r-   s      r   r   r   ^   s*    6/7#$45f4S\]]Mr   modenonlinearityc                 F    t        | dd      st        d   | ||||      S | S )Nr   Fr   r   r/   r0   r   r   r   r   r/   r0   r   s        r   r   r   d   s6     6/7#$67ad
 	
 Mr   c                 F    t        | dd      st        d   | ||||      S | S )Nr   Fr   r2   r   r3   s        r   r   r   r   s6     6/7#$56ad
 	
 Mr   c                 H    t        | dd      st        d   | |||||      S | S )Nr   Fr   )r   r    r   r   r   r   )r   r   r    r   r   r   s         r   r   r      s3     6/7#O4V$CSTXYenooMr   c                 B    t        | dd      st        d   | ||      S | S )Nr   Fr   r,   r   r-   s      r   r   r      s+    
 6/7#M26PYZZMr   sparsityc                 D    t        | dd      st        d   | |||      S | S )Nr   Fr   )r7   r    r   r   )r   r7   r    r   s       r   r   r      s-     6/7#I.vc]fggMr   otherc                     t        | dd      s/t        j                         5  | j                  |      cd d d        S | S # 1 sw Y   | S xY w)Nr   F)r   torchno_gradcopy_)r   r9   s     r   r=   r=      sC    6/7]]_ 	'<<&	' 	'M	'Ms	   ?A	c                    t         j                  j                  j                  |       \  }}|dk(  r|}n|dk(  r|}n|dk(  r||z   dz  }dz  }|dk(  r$t	        | t        j                  |      dz         y |d	k(  r!t        | t        j                  |             y |d
k(  r't        j                  d|z        }t        | | |       y t        d|       )Nfan_infan_outfan_avg         ?truncated_normalg۶%?)r    normaluniform   zinvalid distribution )
r;   nninit_calculate_fan_in_and_fan_outr   mathsqrtr   r   
ValueError)r   r/   distributionr?   r@   denomvariancebounds           r   _variance_scalingrR      s    hhmmAA&IOFGx				'!Q&U{H))f$))H"58K"KL		!DIIh/0		"		!h,'%'0?@@r   c                 <    t        | dd      st        | dd       | S )Nr   Fr?   rD   r/   rN   r   rR   r%   s    r   lecun_normal_rV      s!    6/7&x>PQMr   c                 <    t        | dd      st        | dd       | S )Nr   Fr?   rE   rT   rU   r%   s    r   default_flax_embed_init_rX      s     6/7&xhGMr   )
ztorch.nn.initztorch.nn.modules.activationztorch.nn.modules.transformerztorch.nn.modules.linearztorch.nn.modules.lossztorch.nn.modules.batchnormztorch.nn.modules.convztorch.nn.modules.normalizationztorch.nn.modules.rnnztorch.nn.modules.sparsec            	   #   4  K   t        t              } 	 t        D ]x  }|t        j                  v st        j                  |   }t
        j                         D ]9  }t        ||      st        ||      | |   |<   t        ||t               |          ; z d | j                         D ]*  \  }}|j                         D ]  \  }}t        |||        , y# | j                         D ]*  \  }}|j                         D ]  \  }}t        |||        , w xY ww)a  
    Guard the `torch.nn.init` primitive functions to behave exactly like the functions in this file, i.e. be
    protected against the `_is_hf_initialized` flag to avoid re-init if the param was already loaded.

    Usually, all models are using the init from `transformers` which are already guarded, but just to make extra sure
    and for remote code, we also use this context manager.
    N)r   dictTORCH_MODULES_TO_PATCHsysmodulesr   keyshasattrr   setattrglobalsitems)	originalsmodule_namemodule	func_name	functionsfuncs         r   guard_torch_init_functionsri      s     D!I11 	IKckk)[1!5!:!:!< IIvy17>vy7Q	&))4	79Y3GHI	I 	 "+!2 	1FI#,??#4 1	4	401	1!2 	1FI#,??#4 1	4	401	1s'   DC 6C &2C >D?DDc            	   #   r  K   ddl m}  d }t        t              }	 t        D ]m  }|t
        j                  v st
        j                  |   }t        j                         D ].  }t        ||      st        ||      ||   |<   t        |||       0 o | j                  }|| _        d |j                         D ]*  \  }}|j                         D ]  \  }}t        |||        , || _        y# |j                         D ]*  \  }}|j                         D ]  \  }}t        |||        , | _        w xY ww)ac  
    Disable weight initialization both at the torch-level, and at the transformers-level (`init_weights`).
    This is used to speed-up initializing an empty model with deepspeed, as we do not initialize the model on meta device
    with deepspeed, but we still don't need to run expensive weight initializations as we are loading params afterwards.
       PreTrainedModelc                       y N argskwargss     r   
empty_funcz#no_init_weights.<locals>.empty_func       r   N)modeling_utilsrm   r   rZ   r[   r\   r]   r   r^   r_   r   r`   init_weightsrb   )	rm   rt   rc   rd   re   rf   original_init_weightsrg   rh   s	            r   no_init_weightsry      sB     0 D!I=1 	?Kckk)[1!5!:!:!< ?Ivy17>vy7Q	&))4	:>?	? !0 < <'1$ "+!2 	1FI#,??#4 1	4	401	1 (=$	 "+!2 	1FI#,??#4 1	4	401	1 (=$s)   D7C. 6C. /:C. )AD7.AD44D7c               #   t   K   ddl m}  d }	 | j                  }|| _        d || _        y# | _        w xY ww)a  
    Disable weight tying during loading with `from_pretrained`. This is needed as we want to have access to ALL
    weights in the state_dict during `from_pretrained`, and otherwise tying them would remove them from it, as it's
    called in `post_init` when instantiating.
    rk   rl   c                       y ro   rp   rq   s     r   rt   z"no_tie_weights.<locals>.empty_func$  ru   r   N)rv   rm   tie_weights)rm   rt   original_tie_weightss      r   no_tie_weightsr~     s?      0;.::&0# ';#&:#s   
8, 8	58)        rC   N)rk   )rC   N)r   r?   
leaky_reluN)r   rC   g       g       @N)rk   N)g{Gz?N)r?   rE   )%rK   r\   collectionsr   
contextlibr   r;   rH   rI   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   Tensorfloat	Generatorintstrr=   rR   rV   rX   r[   ri   ry   r~   rp   r   r   <module>r      s    
 # %  &&xx}}$$((XX]]  hhmm""HHMMhhmm""xx}}44hhmm2266xx}}44XX]]0088==,,xx}}$$ & _cLL"-2EJ__W[E[
\\ dhLL %27JO//\`J`
\\ell  5<< %,, 5<< 5<< ELL  %,, 5<<  U\\ ELL  Z^H^ jojvjv 5<< u uY]G] iniuiu  $(,LL  	
 % \\  $(,LL  	
 % \\  (,
LL


 

 	

 
 %
 \\
 (,LL
 % \\	 cgLL$)05IN[_I_
\\%,, u||  A,  1 14 != !=H ; ;r   