
"""
Optimizer utilities for the Trainer class.
"""

from __future__ import annotations

import importlib.metadata
import logging
from collections.abc import Callable
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

import torch
from packaging import version
from torch import nn

from .optimization import Adafactor
from .trainer_pt_utils import LayerWiseDummyOptimizer
from .trainer_utils import check_target_module_exists
from .training_args import OptimizerNames, ParallelMode
from .utils import (
    is_apollo_torch_available,
    is_bitsandbytes_available,
    is_galore_torch_available,
    is_grokadamw_available,
    is_lomo_available,
    is_schedulefree_available,
    is_torch_optimi_available,
    is_torchao_available,
    strtobool,
)


if TYPE_CHECKING:
    from .modeling_utils import PreTrainedModel
    from .training_args import TrainingArguments


logger = logging.getLogger(__name__)


@dataclass
class OptimizerContext:
    """Context object passed to all optimizer handlers."""

    args: TrainingArguments
    model: PreTrainedModel | None
    optimizer_kwargs: dict[str, Any]
    adam_kwargs: dict[str, Any]
    optim_args: dict[str, str]


def _parse_optim_args(optim_args_str: str | None) -> dict[str, str]:
    """Parse optimizer arguments from a comma-separated string."""
    if not optim_args_str:
        return {}
    optim_args = {}
    for mapping in optim_args_str.replace(" ", "").split(","):
        key, value = mapping.split("=")
        optim_args[key] = value
    return optim_args


OptimizerHandler = Callable[[OptimizerContext], tuple[Any, dict[str, Any]]]


def is_optimizer_factory(optimizer_cls_or_factory: Any) -> bool:
    """
    Check if the returned value from a handler is a factory rather than an Optimizer class.

    Factory callables are used for complex optimizers like Muon or Dion that need to:
    - Split parameters between multiple internal optimizers
    - Handle complex sharding logic
    - Access the full model structure for parameter grouping

    Args:
        optimizer_cls_or_factory: The first element returned by an optimizer handler.

    Returns:
        `bool`: True if it's not an Optimizer class (i.e., likely a factory), False if it's an Optimizer class.
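
    Example (illustrative):

        >>> is_optimizer_factory(torch.optim.AdamW)
        False
        >>> is_optimizer_factory(lambda model: torch.optim.AdamW(model.parameters()))
        True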
    """
    if isinstance(optimizer_cls_or_factory, type) and issubclass(optimizer_cls_or_factory, torch.optim.Optimizer):
        return False
    return True


def _setup_low_rank_optimizer(
    args: TrainingArguments,
    model: PreTrainedModel | None,
    optimizer_name: str,
    optimizer_mapping: dict,
    optim_kwargs: dict,
    optimizer_kwargs: dict[str, Any],
    is_layerwise_supported: bool = True,
) -> tuple[Any, dict[str, Any]]:
    """
    Helper function to set up low-rank optimizers like GaLore and Apollo.

    These optimizers apply low-rank projections to specific target modules (typically linear layers).
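
    Both `_get_galore_optimizer` and `_get_apollo_optimizer` below delegate here: they supply a
    name-to-class `optimizer_mapping` plus projection kwargs, and this helper splits parameters
    into target/non-target groups and, for `*_layerwise` variants, installs one optimizer per
    parameter behind a `LayerWiseDummyOptimizer`.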
    """
    is_layerwise = optimizer_name.lower().endswith("layerwise")
    if is_layerwise and args.parallel_mode == ParallelMode.DISTRIBUTED and is_layerwise_supported:
        raise NotImplementedError(f"Layer-wise {optimizer_name} does not support DDP at this time")

    optimizer_cls = optimizer_mapping[optimizer_name]

    if args.optim_target_modules is None:
        raise ValueError(f"You need to define `optim_target_modules` to use {optimizer_name} optimizers")
    if not isinstance(args.optim_target_modules, (list, str)):
        raise TypeError(
            f"`optim_target_modules` must be a list of strings, a regex string, or 'all-linear'. Got: {args.optim_target_modules}"
        )
    if model is None:
        raise ValueError(f"You need to pass a model to initialize {optimizer_name} optimizer.")

    all_linear = (
        isinstance(args.optim_target_modules, str)
        and args.optim_target_modules.replace("_", "-") == "all-linear"
    )

    target_params_names = []
    for module_name, module in model.named_modules():
        target_module_exists, is_regex = check_target_module_exists(
            args.optim_target_modules, module_name, return_is_regex=True
        )
        if not isinstance(module, nn.Linear):
            # Warn when a non-linear module matched an explicit (non-regex) pattern.
            if target_module_exists and not is_regex:
                logger.warning(f"{module_name} matched but ignored. {optimizer_name} only supports linear layers.")
            continue
        if target_module_exists or all_linear:
            target_params_names.append(module_name + ".weight")

    if len(target_params_names) == 0:
        raise ValueError(f"No target modules found for {optimizer_name} ({args.optim_target_modules}).")

    target_params = [p for n, p in model.named_parameters() if n in target_params_names]
    non_target_params = [p for n, p in model.named_parameters() if n not in target_params_names]
    param_groups = [{"params": non_target_params}, {"params": target_params, **optim_kwargs}]

    if is_layerwise:
        if args.gradient_accumulation_steps != 1:
            raise ValueError(f"Layerwise {optimizer_name} does not support gradient accumulation!")

        optimizer_dict = {}
        for param in non_target_params:
            optimizer_dict[param] = optimizer_cls([{"params": [param]}], **optimizer_kwargs)
        for param in target_params:
            optimizer_dict[param] = optimizer_cls([{"params": [param], **optim_kwargs}], **optimizer_kwargs)

        def optimizer_hook(param):
            if param.grad is not None:
                optimizer_dict[param].step()
                optimizer_dict[param].zero_grad()

        for param in model.parameters():
            if param.requires_grad:
                param.register_post_accumulate_grad_hook(optimizer_hook)

        optimizer_cls = LayerWiseDummyOptimizer
        optimizer_kwargs.update({"optimizer_dict": optimizer_dict})

    optimizer_kwargs.update({"params": param_groups})
    return optimizer_cls, optimizer_kwargs
 %%' 	IE""88H	I 0!1> BCX|45***? Y`s   :J9J9!J?.J?c                b    | j                   j                  ddd       t        | j                   fS )zGet Adafactor optimizer.Fscale_parameterrelative_step)r    rc   r   ctxs    r+   _get_adafactorr|      s,    EE RSc****r*   c                    ddl m} | j                  j                  | j                         | j
                  j                  t        j                  k(  r| j                  j                  ddi       || j                  fS )z/Get PyTorch AdamW optimizer (regular or fused).r   AdamWfusedT)	torch.optimr   r    rc   r!   r   r=   r   ADAMW_TORCH_FUSEDr{   r   s     r+   _get_adamw_torchr      s\    !0
xx~~999##WdO4#&&&&r*   c                    	 ddl m} | j                  j                  | j                         || j                  fS # t
        $ r t        d      w xY w)z'Get Torch XLA syncfree AdamW optimizer.r   r~   z7Trainer failed to import syncfree AdamW from torch_xla.)torch_xla.amp.syncfreer   r    rc   r!   ImportErrorrT   r   s     r+   _get_adamw_torch_xlar      sQ    T0##COO4c**** TRSST	   8; Ac                    	 ddl m} | j                  j                  | j                         || j                  fS # t
        $ r t        d      w xY w)zGet NPU Fused AdamW optimizer.r   )NpuFusedAdamWz3Trainer failed to import FusedAdamW from torch_npu.)torch_npu.optimr   r    rc   r!   r   rT   )r{   r   s     r+   _get_adamw_torch_npu_fusedr      sQ    P1##COO4c2222 PNOOPr   c                    	 ddl m} | j                  j                  | j                         || j                  fS # t
        $ r t        d      w xY w)zGet Apex Fused Adam optimizer.r   )	FusedAdamzFTrainer tried to instantiate apex FusedAdam but apex is not installed!)apex.optimizersr   r    rc   r!   r   rT   )r{   r   s     r+   _get_adamw_apex_fusedr      sQ    c-##COO4#.... cabbcr   c                   t               st        d      ddlm}m}m} | j                  j                  }d|v }d|v rdnd}d}| j                  }d	|v r|}nd
|v r2|}d| j                  j                  | j                  j                  fi}ngd|v r|}| j                  }nSd|v rNddlm}	 |	}t        | j                  j                  d| j                  j                              t        | j                  j                  d| j                  j                              t        | j                  j                  dd            ft        | j                  j                  dd            t        | j                  j                  d| j                  j                              d}d| j                  v rt!        | j                  d         |d<   d| j                  v rt!        | j                  d         |d<   d|i}
d|vr||
d<   | j"                  j%                  |       | j"                  j%                  |
       || j"                  fS )z;Get bitsandbytes optimizer (AdamW, Lion, RMSprop variants).ziYou need to install `bitsandbytes` in order to use bitsandbytes optimizers: `pip install -U bitsandbytes`r   )r   LionRMSproppaged8bit       Nadamlionbetasrmspropademamix)AdEMAMixbeta1beta2beta3gH.?alphag      @eps)r   r   r   t_alphat_beta3
optim_bitsis_paged)r   r   bitsandbytes.optimr   r   r   r   r=   r!   
adam_beta1
adam_beta2r#   r   floatgetadam_epsilonintr    rc   )r{   r   r   r   
optim_namer   r   ri   additional_optim_kwargsr   
bnb_kwargss              r+   _get_bitsandbytes_optimizerr      s   $&w
 	
 87J*$H
*JM!oo	:	#*SXX-@-@#((BUBU,V"W	j	 "%..	z	!/  cnn((#((2E2EFGcnn((#((2E2EFGcnn((&9:
 3>>--gs;<++E3883H3HIJ#
 &14S^^I5N1O#I.&14S^^I5N1O#I.
+J
"!)
: 78
+#....r*   c                   	 ddl m} | j                  j                  | j                         | j                  j                  t        | j                  j                  dd            t        t        | j                  j                  dd            t        t        | j                  j                  dd            t        t        | j                  j                  dd	            d
       || j                  fS # t        $ r t        d      w xY w)z!Get AnyPrecision AdamW optimizer.r   )AnyPrecisionAdamWuse_kahan_summationFalsemomentum_dtypefloat32variance_dtypecompensation_buffer_dtypebfloat16)r   r   r   r   z4Please install https://github.com/pytorch/torchdistx)torchdistx.optimizersr   r    rc   r!   r   r#   r   getattrr<   r   rT   )r{   r   s     r+   _get_adamw_anyprecisionr   &  s    Q;##COO4##'01C1CDY[b1c'd")%1C1CDTV_1`"a")%1C1CDTV_1`"a-43>>--.I:V.			
 !#"6"666 QOPPQs   C3C6 6Dc                N    t         j                  j                  | j                  fS )zGet SGD optimizer.)r<   r=   SGDr    rz   s    r+   _get_sgdr   ;  s    ;;??C0000r*   c                N    t         j                  j                  | j                  fS )zGet Adagrad optimizer.)r<   r=   Adagradr    rz   s    r+   _get_adagradr   @      ;; 4 444r*   c                N    t         j                  j                  | j                  fS )zGet RMSprop optimizer.)r<   r=   r   r    rz   s    r+   _get_rmspropr   E  r   r*   c                    t               st        d      ddlm}m}m} t        j                  |t        j                  |t        j                  |t        j                  |t        j                  |t        j                  |i}t        | j                  j                  dd            t        | j                  j                  dd            t!        | j                  j                  dd	            | j                  j                  d
d      d}t#        | j$                  | j&                  | j$                  j(                  ||| j*                        \  }}| j$                  j(                  t        j                  k(  r|j-                  ddd       ||fS )zGet GaLore optimizer.zYou need to install `galore_torch` in order to use GaLore optimizers. Install it with `pip install git+https://github.com/jiaweizzhao/GaLore`r   )GaLoreAdafactorGaLoreAdamWGaLoreAdamW8bitrank   update_proj_gap   scaleg      ?	proj_typestd)r   r   r   r   Frw   )r   r   galore_torchr   r   r   r   GALORE_ADAMWGALORE_ADAMW_8BITGALORE_ADAFACTORGALORE_ADAMW_LAYERWISEGALORE_ADAMW_8BIT_LAYERWISEGALORE_ADAFACTOR_LAYERWISEr   r#   popr   ru   r   r   r=   r    rc   )r{   r   r   r   re   galore_optim_kwargsri   r    s           r+   _get_galore_optimizerr   J  sJ   $&V
 	
 KJ 	##[((/''--{22O11? CNN&&vs34s~~112CSIJs~~))'489^^''U;	 '@#))SXX^^->@SUXUiUi'#M# xx~~888EE RS***r*   c           
        t               st        d      ddlm} t        j
                  |t        j                  |i}t        | j                  j                  dd            | j                  j                  dd      | j                  j                  dd	      t        | j                  j                  d
d            t        | j                  j                  dd            | j                  j                  dd      d}|j                  | j                         t        | j                  | j                  | j                  j                   ||| j"                        S )zGet Apollo optimizer.zYou need to install `apollo_torch` in order to use APOLLO optimizers. Install it with `pip install git+https://github.com/zhuhanqing/APOLLO`r   )APOLLOAdamWr   r   projrandom
scale_typechannelr   r   r         ?r   r   )r   r   r   r   r   r   )r   r   apollo_torchr   r   APOLLO_ADAMWAPOLLO_ADAMW_LAYERWISEr   r#   r   r   rc   r!   ru   r   r   r=   r    )r{   r   re   apollo_optim_kwargss       r+   _get_apollo_optimizerr   k  s   $&U
 	
 ) 	##[--{ CNN&&vs34""684nn((yAs~~112CSIJs~~))'378^^''U; s/$#))SXX^^->@SUXUiUi r*   c                   t               st        d      | j                  t        d      ddlm}m} d| j                  j                  v r|n|}| j                  j                  d| j                  i       || j                  fS )zGet LOMO optimizer.zjYou need to install `lomo_optim` in order to use LOMO optimizers. Install it with `pip install lomo-optim`zMYou need to pass a `model` in order to correctly initialize a LOMO optimizer.r   )AdaLomoLomoadar   )r   r   r   rT   
lomo_optimr   r   r   r=   r    rc   )r{   r   r   ri   s       r+   _get_lomo_optimizerr     sy    7
 	

 yyhii($6GDM#)) 45#....r*   c                   t               st        d      ddlm} | j                  j                  t        | j                  j                  dd            t        | j                  j                  dd            t        | j                  j                  dd	            t        | j                  j                  d
d	            t        | j                  j                  dd            d       || j                  fS )zGet GrokAdamW optimizer.z5Please install grokadamw with `pip install grokadamw`r   )	GrokAdamW
alpha_initg\(\?lamb       @gammag?grokking_signal_decay_rategradient_clippingr   )r   r   r   r   r   )	r   rT   	grokadamwr   r    rc   r   r#   r   )r{   r   s     r+   _get_grokadamwr     s    !#PQQ# 2 2< FG#..,,VS9:3>>--gs;<*/0B0BC_ad0e*f!&s~~'9'9:Ms'S!T	
 c****r*   c           	        t               rHt        j                  t        j                  j                  d            t        j                  d      k  rt        d      t        j                  t        j                  j                  d            t        j                  d      k  rt        d      t        j                  t        j                  j                  d            t        j                  d      k\  r	dd	lm}m} ndd	l	m}m} | j                  j                  t        j                  k(  r|}n|}| j                  j                  | j                   j#                  d
d      t%        | j                   j#                  dd            d       | j                  j                  | j&                         || j                  fS )z%Get TorchAO 4-bit or 8-bit optimizer.torchaoz0.4.0zYou need to have `torchao>=0.4.0` in order to use torch 4-bit optimizers. Install it with `pip install torchao` or follow the instructions here: https://github.com/pytorch/aor<   z2.4zYou need to have `torch>2.4` in order to use torch 4-bit optimizers. Install it with `pip install --upgrade torch` it is available on pipy. Otherwise, you need to install torch nightly.z0.11.0r   )	AdamW4bit	AdamW8bit
block_size   bf16_stochastic_roundr   )r   r   )r   r   parse	importlibmetadatar   torchao.optimr   r   torchao.prototype.low_bit_optimr   r=   r   ADAMW_TORCH_4BITr    rc   r#   r   r   r!   )r{   r   r   ri   s       r+   _get_torchao_optimizerr    sP   !W]]93E3E3M3Mi3X%Y\c\i\ijq\r%r,
 	

 }}Y''//89W]]5=QQ<
 	
 }}Y''//	:;w}}X?VV66H
xx~~888!!..,,\3?%.s~~/A/ABY[b/c%d	
 0#....r*   c           	     0   t               st        d      ddlm}m} i }d}| j
                  j                  t        j                  k(  r-t        d      st        d      ddlm	} |}| j                  }d}nk| j
                  j                  t        j                  k(  r|}| j                  }n5| j
                  j                  t        j                  k(  r|}nt        d	      | j
                  j                  |d
<   |r| j
                  j                  |d<   |j!                  t#        | j$                  j'                  dd            t#        | j$                  j'                  dd            d       | j(                  j!                  |       || j(                  fS )zGet ScheduleFree optimizer.zwYou need to install `schedulefree` in order to use schedulefree optimizers. Install it with `pip install schedulefree.`r   )AdamWScheduleFreeSGDScheduleFreeTz1.4.0zYou need to install `schedulefree>=1.4.0` in order to use RAdamScheduleFree optimizer. Install it with `pip install schedulefree.`)RAdamScheduleFreeFzInvalid schedulefree optimizerweight_decaywarmup_stepsweight_lr_powerr   rg        )r  r  )r   r   schedulefreer  r	  r   r=   r   SCHEDULE_FREE_RADAMr
  r!   SCHEDULE_FREE_ADAMWSCHEDULE_FREE_SGDrT   r  r  rc   r   r#   r   r    )r{   r  r	  r   require_warmupr
  ri   s          r+   _get_schedule_free_optimizerr    s[   $&:
 	
 @ N
xx~~;;;(1>  	3)"%//	>==	=)"%//	>;;	;'9::.1hh.C.CN+25((2G2G/""$S^^%7%78I3%OPs~~))#s34	
  78#....r*   c                   t               st        d      ddlm} | j                  j                  dd      }|t        |      }| j                  j                  dd      }|t        |      }| j                  j                  | j                  d<   t        | j                  j                  dd	            ||d
}| j                  j                  | j                         | j                  j                  |       || j                  fS )z,Get StableAdamW optimizer from torch-optimi.zwYou need to install `torch-optimi` in order to use stable_adamw optimizers. Install it with `pip install torch-optimi`.r   )StableAdamWmax_lrN	kahan_sumr  decouple_lrF)r  r  r  )r   r   optimir  r#   r   r   boolr   r  r!   r    rc   )r{   r  r  r  stable_adamw_kwargss        r+   _get_stable_adamwr    s    $&:
 	
 #^^$/Fv"";5IO	&)hh&;&;COON#CNN..}eDE 0 34,,,,r*   zdict[str, OptimizerHandler]_OPTIMIZER_HANDLERS)r3   z
str | Nonereturnr"   )r?   r   r  r  )T)r   r   r   r   rd   rV   re   r   rf   r   r    r   rg   r  r  tuple[Any, dict[str, Any]])r{   r   r  r   )ur'   
__future__r   importlib.metadatar  loggingcollections.abcr   dataclassesr   typingr   r   r<   	packagingr   r	   optimizationr   trainer_pt_utilsr   trainer_utilsr   training_argsr   r   utilsr   r   r   r   r   r   r   r   r   modeling_utilsr   r   	getLoggerr$   rZ   r   r7   tupledictrV   OptimizerHandlerr@   ru   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  	ADAMW_BNB
ADAMW_8BITPAGED_ADAMWPAGED_ADAMW_8BITADEMAMIXADEMAMIX_8BITPAGED_ADEMAMIXPAGED_ADEMAMIX_8BITLION	LION_8BIT
PAGED_LIONPAGED_LION_8BITRMSPROP_BNBRMSPROP_8BITRMSPROP_32BIT_BITSANDBYTES_OPTIMIZERSr   r   r   r   r   r   _GALORE_OPTIMIZERSr   r   _APOLLO_OPTIMIZERSr  ADAMW_TORCH_8BIT_TORCHAO_OPTIMIZERSr  r  r  _SCHEDULE_FREE_OPTIMIZERS	ADAFACTORADAMW_TORCHr   ADAMW_TORCH_XLAADAMW_TORCH_NPU_FUSEDADAMW_APEX_FUSEDADAMW_ANYPRECISIONr   ADAGRADRMSPROP	GROKADAMWSTABLE_ADAMWLOMOADALOMOfromkeysr  r(   r)   r*   r+   <module>rT     s   #   $ ! %    # 5 5 7
 
 
 /0			8	$    -.c4S>6I0JJK 8 $(V+
V+V+ V+ &	V+
 !V+ %V+ !V+  V+|+'TPc//dQ*1
5
5
+B:/$+& /F)/X-D ##  !!&&""   & $$##))..--  ))  ####  &&&&$$  n 0$$&6""$8((*D##%:%%'>LLn!2,/4 mm,.IJ4  mm&(=>!4" mm&(=>#4$ mm')?@%4& mm-/KL'4 0 r*   