
    謜i,                       U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dl mZ d dlmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZm Z m!Z! d dl"m#Z# d dl$Z$d dl%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z/ d dl$m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 ddl6m7Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZA ddlBmCZC ddlDmEZE ddlFmGZGmHZH ddlImJZJmKZKmLZLmMZM ddlNmOZOmPZPmQZQmRZRmSZSmTZTmUZU ddlVmWZW ddlXmYZY dd lZm[Z[ dd!l\m]Z] dd"l^m_Z_ dd#l`maZa dd$lbmcZc dd%ldmeZe dd&lfmgZg dd'lhmiZimjZjmkZkmlZlmmZmmnZnmoZo dd(lpmqZq dd)lrmsZsmtZt dd*lumvZv dd+lwmxZx dd,lymzZz dd-l{m|Z| dd.l}m~Z~ dd/lmZ dd0lmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ dd1lmZmZ dd2lmZmZmZ dd3lmZmZmZ dd4lmZmZ dd5lmZmZ dd6lmZ  e       rd d7lmZ d d8lmZ e$j                  je                         Z e       r7d dlmc m$Z d d9lmZ  e*jt                  e       e*jt                  d:      k\  Znd;Z ejx                  e      Zej~                  j                  d<d=      j                         Zej~                  j                  d>d=      j                         Z e d?d@A      Zd;ad;adBdCdDZ edEF       G dG dH             ZdI ZedJ        ZedK        ZeddLe$j                  dMedz  fdN       ZdO ZdP Ze$j                  e$j                  e$j                  e$j                  e$j                  e$j                  e$j                  e$j                  e$j                  e$j                  e$j                  e$j                  e$j                  e$j                  e$j                  dQZ	 ddReej                  z  dSee$j                  z  dTedUeee$j`                  f   fdVZdWe$j`                  dUefdXZdYe1j                  dUee   fdZZd[eee      d\eee$j`                  f   dUeeee      ee   f   fd]Zd[eee      d\eee$j`                  f   dUeeee      ee   f   fd^Zd\eee$j`                  f   d_d@dUeee$j`                  f   fd`Zd_d@daedWe$j`                  fdbZddceddedz  dUefdeZ	 	 ddfeej                  z  dz  ddedz  dgedz  dhedz  diedz  djedkedz  dledz  dUeee   dz  edz  f   fdmZ	 ddLee$j                  z  ez  dz  dnee   dz  doe:dpedz  d\edz  dTedqezdz  dUee:e$j                  f   fdrZ G ds dte      Z G du dv      Z G dw dx      Z G dy d@e1j                  eeeeJ      Z eej                        e_        ej                  j                  ;ej                  j                  j                  d_dzd{|      ej                  _        dd_e1j                  d}edUe1j                  fd~Zdeez  e$j                  z  dUefdZ	 dd_ededqezdz  fdZd_ededqezdz  fdZ G d de      Z e       Zee d<    G d de      Zy)    N)abstractmethod)defaultdict)CallableIterator)contextmanager)	dataclassfield)Enum)partialwraps)cycle)Thread)OptionalTypeVarget_type_hints)
is_zipfile)create_repois_offline_mode"split_torch_state_dict_into_shards)version)	safe_open)	save_file)Tensornn)constraints)
checkpoint   )initialization)PreTrainedConfig)get_model_conversion_mapping)WeightConverterWeightRenaming$convert_and_load_state_dict_in_modelrevert_weight_conversion)DistributedConfig)custom_object_save)CompileConfigGenerationConfig)PeftAdapterMixindeepspeed_configis_deepspeed_zero3_enabledis_fsdp_enabled)_get_device_mapaccelerate_disk_offloadaccelerate_dispatchcheck_and_set_device_mapexpand_device_map
get_deviceload_offloaded_parameter)!_load_state_dict_into_zero3_model)eager_paged_attention_forward)flash_attention_forward)paged_attention_forward)flex_attention_forward)	is_kernel)maybe_load_adapters)sdpa_attention_forward)sdpa_attention_paged_forward)ALL_PARALLEL_STYLES_get_parameter_tp_plandistribute_modelgather_state_dict_for_saveinitialize_tensor_parallelismshard_and_distribute_moduleverify_tp_plan)LOSS_MAPPING)lazy_import_flash_attention!lazy_import_paged_flash_attention)ROPE_INIT_FUNCTIONS)id_tensor_storage)HfQuantizer)get_hf_quantizer)get_module_from_name)auto_conversion)ADAPTER_SAFE_WEIGHTS_NAMEDUMMY_INPUTSSAFE_WEIGHTS_INDEX_NAMESAFE_WEIGHTS_NAMEWEIGHTS_INDEX_NAMEWEIGHTS_NAMEContextManagersKernelConfigPushToHubMixincached_filecheck_torch_load_is_safe	copy_funchas_fileis_accelerate_availableis_bitsandbytes_availableis_env_variable_trueis_flash_attn_2_availableis_flash_attn_3_availableis_grouped_mm_availableis_kernels_availableis_torch_flex_attn_availableis_torch_mlu_availableis_torch_npu_availableis_torch_xpu_availablelogging)GeneralInterfaceis_flash_attention_requested)DownloadKwargscreate_and_tag_model_cardget_checkpoint_shard_files)#is_huggingface_hub_greater_or_equalis_sagemaker_mp_enabled
is_tracing)LoadStateDictInfolog_state_dict_report)_CAN_RECORD_REGISTRYOutputRecorder)QuantizationMethod)add_hook_to_module)extract_model_from_parallel)__version__z1.10FXLA_USE_BF160XLA_DOWNCAST_BF16SpecificPreTrainedModelTypePreTrainedModel)boundzkernels-community/flash-attn2z"kernels-community/vllm-flash-attn3)flash_attention_2flash_attention_3T)frozenc                   d   e Zd ZU dZdZedz  ed<    ee      Z	edz  ed<   dZ
edz  ed<   dZeed<   dZedz  ed	<   dZedz  ed
<   dZedz  ed<   dZeed<   dZej$                  dz  ed<    ee      Zeed<   dZedz  ed<   dZed   ed<   dZeed<   dZeeez     dz  ed<   edefd       Zy)LoadStateDictConfigze
    Config for loading weights. This allows bundling arguments that are just
    passed around.
    Npretrained_model_name_or_path)default_factorydownload_kwargsuse_safetensorsFignore_mismatched_sizessharded_metadata
device_mapdisk_offload_folderoffload_buffersdtype
dtype_planhf_quantizerz(torch.distributed.device_mesh.DeviceMeshdevice_meshTweights_onlyweight_mappingreturnc                     | j                   d uS N)r   selfs    W/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/modeling_utils.pyis_quantizedz LoadStateDictConfig.is_quantized   s      ,,    ) __name__
__module____qualname____doc__r   str__annotations__r	   rh   r   r   boolr   r   dictr   r   r   r   torchr   r   rI   r   r   r   r   listr!   r"   propertyr    r   r   r   r      s    
 15!3:4-2>-RO^d*R#'OTD['$)T)$(dTk("Jt"&*t*!OT! $E5;;$T2J2'+L+$+HLKDELL$DHND>9:TAH-d - -r   r   c                      t         j                  j                         xrL t         j                  j                         xr, t	        t
        j                  j                  dd            dk(  S )N
LOCAL_RANKz-1r   )r   distributedis_availableis_initializedintosenvirongetr   r   r   is_local_dist_rank_0r      sQ    &&( 	9,,.	9

|T23q8r   c               #   ,   K   da 	 d  da y # da w xY wwNTF)_is_quantizedr   r   r   set_quantized_stater      s      M    c               #   ,   K   da 	 d  da y # da w xY wwr   )_is_ds_init_calledr   r   r   set_zero3_stater      s"      #"Ur   r   model_class_namec              #   *  K   | j                   s&|| d|  d}t        |      d|  d}t        |      t        j                         }	 t        j                  |        d t        j                  |       y# t        j                  |       w xY ww)z
    Locally change the torch default dtype to `dtype`, and restore the old one upon exiting the context.
    If `model_class_name` is provided, it's used to provide a more helpful error message if `dtype` is not valid.
    Nz% cannot be instantiated under `dtype=z$` as it's not a floating-point dtypezCannot set `z7` as torch's default as it's not a floating-point dtype)is_floating_point
ValueErrorr   get_default_dtypeset_default_dtype)r   r   error_messageoriginal_dtypes       r   local_torch_dtyper      s      ""'#$$I%Ptu 
 '' +5'1hiM'',,.N0&//s   AB
A9 #B9BBc                      t        j                  g       j                  } t        j                         }| |k(  r|t        j                  d      k7  r|S y| S )z
    Test if a device context manager is currently in use, or if it is not the case, check if the default device
    is not "cpu". This is used to infer the correct device to load the model on, in case `device_map` is not provided.
    cpuN)r   tensordeviceget_default_device)device_in_contextdefault_devices     r   *get_torch_context_manager_or_global_devicer      sO    
 R(//--/NN*U\\%00!!r   c                     | j                         D ]   }|j                         s|j                  c S  t        |       dk(  rt        j
                  S t        t        | j                                     j                  S )zt
    Returns the first found floating dtype in `state_dict` if there is one, otherwise returns the first dtype.
    r   )valuesr   r   lenr   float32nextiter)
state_dictts     r   get_state_dict_dtyper     sh        77N
 :!}}Z&&()*000r   )BOOLU8I8I16U16F16BF16I32U32F32F64I64U64F8_E4M3F8_E5M2checkpoint_filemap_locationr   r   c                 X   | j                  d      rt        | d      5 }i }|j                         D ]  }|dk(  rk|j                  |      }|j	                         }|t
        v r
t
        |   }nt        d|       t        j                  |j                         |d      ||<   s|j                  |      j                  |      ||<    |cddd       S |r
t                i }	t        | t              r|dk7  rt        |       rdd	i}	t        j                   | f||d
|	S # 1 sw Y   TxY w)zg
    Reads a `safetensor` or a `.bin` checkpoint file. We load the checkpoint on "cpu" by default.
    .safetensorspt)	frameworkmetaz)Cannot load safetensors of unknown dtype )sizer   r   NmmapTr   r   )endswithr   keys	get_slice	get_dtypestr_to_torch_dtyper   r   empty	get_shape
get_tensortorW   
isinstancer   r   load)
r   r   r   fr   k_slicek_dtyper   
extra_argss
             r   load_state_dictr   &  s.    /$7 	1JVVX 
E6)[[^F$..0G"44 27 ;(+TU\T])^__$)KKV5E5E5Gu]c$dJqM$%LLO$6$6|$DJqM
E 	 	   "J/3'LF,BzRaGbd^
::ojL|j_ijj/	 	s   B,D  D)r   c                     | j                         r5| j                  d      d   j                         | j                         z   }|S | j                         }|S )N)nelementviewdata_ptrelement_size)r   stops     r   _end_ptrr  H  sO    {{2r"++-0C0C0EE K  Kr   modulec           	          g }| j                         D ]L  \  }}t        |di       xs i }|j                  |j                         D cg c]  }|r| d| n| c}       N |S c c}w )N_tied_weights_keys.)named_modulesgetattrextendr   )r  tied_weight_keysname	submoduletiedr   s         r   _get_tied_weight_keysr  Q  sx    "$!//1 Uiy"6;Artyy{ S!$D61#A!= STU  !Ts   A$
tensorsr   c                 0   g }| D ]  }t        |      dk  r|j                  |       #g }|D ]2  }||   }|j                  |j                         t        |      |f       4 |j	                          |d   \  }}}	|j                  |	h       |dd  D ]4  \  }
}}|
|k\  r|j                  |h       n|d   j                  |       |}6  g }g }|D ]A  } t        |       dk(  r |j                  | j                                1|j                  |        C ||fS )N   r   r   r   )r   appendr   r  sortaddpop)r  r   filtered_tensorssharedareasr
  r   _	last_stop	last_namestartr   disjoint_tensorsshared_tensorss                 r   _find_disjointr  Y  sA    v;?##F+ 	FD%FLL&//+Xf-=tDE	F 	

"'(9i,!&qr 	E4	! ''/ $((.I	& N# +w<1##GKKM2!!'*	+
 +++r   c                 ^   g }g }| D ]  }t        |      dk  rt        j                  t              }|D ]A  }||   }|j                  |j                         t        |      f}||   j                  |       C t        |      dk(  r|j                  |       |j                  |        ||fS )Nr  r   )	r   collectionsr   setr   r   r  r  r  )	r  r   r  	identicalr  r  r
  r   areas	            r   _find_identicalr$  x  s    NI *v;?'', 	"D%FMM6??#4hv6FGD$KOOD!	" u:?V$!!&)* 9$$r   modelc                    t        j                  t              }| j                         D ]  \  }t	        |t
        j                        s|t        |         j                         >|j                  j                  dk(  r/|j                        }|t        |         j                         |t        |         j                          |j                         D ci c]  \  }}t        |      dkD  s|| }}}t        t        |            }g }t               }	|h|j!                         D ]U  }d}
t#        |      D ]C  t%        fd|D              }|s| v s|
dz  }
|
t        |      k  s3|	j'                         E W t)        |j!                         |       \  }}|D ]  |    j+                         | <    t-        ||       \  }}|D ]N  }|j/                  |	      }|D ]  | =  |j1                  |	      }t        |      dkD  s>|j                  |       P |r|j3                  |       t        |      dkD  rt5        d| d| d      | S c c}}w )aY  
    Remove all tied weights from the given `state_dict`, making sure to keep only the main weight that `model`
    will expect when reloading (even if we know tie weights symmetrically, it's better to keep the intended one).
    This is because `safetensors` does not allow tensor aliasing - so we're going to remove aliases before saving.
    r   r   r   c              3   J   K   | ]  }t        j                  |        y wr   research).0patr
  s     r   	<genexpr>z6remove_tied_weights_from_state_dict.<locals>.<genexpr>  s     %fsbiiT&:%fs    #z8The weights trying to be saved contained shared tensors z\ which are not properly defined. We found all the potential target tied weights keys to be: zo.
This can also just mean that the module's tied weight keys are wrong vs the actual tied weights in the model.)r   r   r   itemsr   r   r   idr  r   typeget_parameterrH   r   r!  r  r   sortedanyr  r  cloner$  intersection
differencer  RuntimeError)r   r%  ptrsr   ptrnamesshared_ptrsall_potential_tied_weights_keyserror_namesto_delete_namesfoundmatches_patternshared_namesdisjoint_namesidentical_namesinamesknownunknownr
  s                     @r   #remove_tied_weights_from_state_dictrG    s    ""4(D"((* 9f&%,,/ F##D)]]6) ((.FF##D) "6*+22489" 15

O*#uE
Q3:OKO '**?*F&G#KeO '2 '') 	2EEu 2"%%fFe%f"f"tz'9QJEs5z)'++D12	2 $2+2D2D2F
#S L.  4%d+113
44 %4L*$M!L/! (##O4 	!D4 	!##O4w<!w'( <(
;!F{m TJJiIj k||
 	
 c Ps   #I:I
param_namec                     t        | |      \  }}||j                  v r?t        |t        j                        s%t        j                  ||j                               }t        |||       y)zUCast a single parameter or buffer `param_name` into the `model`, with value `tensor`.)requires_gradN)rK   _parametersr   r   	Parameterr   setattr)r%  rH  r   parent
param_types        r   _load_parameter_into_modelrP    sU    -eZ@FJV'''
62<<0PfF4L4L4NO FJ'r   weights_namevariantc                 H    || j                  dd      \  }}| d| d| } | S )Nr  r   )rsplit)rQ  rR  pathr
  s       r   _add_variantrV    s:    !((a0
dq	4&1r   r   	gguf_filer   
user_agentis_remote_codetransformers_explicit_filenamer   c                    |xs
 t               }|j                  d      }|j                  dd      }	|j                  d      }
|j                  dd      }|j                  d      }|j                  d      xs d}|j                  d	d
      }|j                  d      }|5|j                  d      s$|j                  d      s|dk7  rt        d|       d}| I|Ft	        |       } t
        j                  j                  |       }|r|4t
        j                  j                  | ||      }|j                  d      }n|dur}t
        j                  j                  t
        j                  j                  | |t        t        |                  r1t
        j                  j                  | |t        t        |            }n)|durt
        j                  j                  t
        j                  j                  | |t        t        |                  r3t
        j                  j                  | |t        t        |            }d}n|s}t
        j                  j                  t
        j                  j                  | |t        t        |                  r1t
        j                  j                  | |t        t        |            }n'|st
        j                  j                  t
        j                  j                  | |t        t        |                  r3t
        j                  j                  | |t        t        |            }d}n|r t        dt        t        |       d|  d      t        dt        t        |       dt        t        |       d|  d      t
        j                  j                  t
        j                  j                  ||             r| }d}n||}|j                  d      }n%|durt        t        |      }nt        t        |      }||
|||d}|	||dd|d|}t!                xr t#        d       xr
 | xr |d
k(  }	 t%        | |fi |}||t        t        |      k(  rt%        | t        t        |      fi |}|d}nm|rN|dk(  r|rt'        | fi |\  }}}||d<   |Mt        |  dt        t        |       dt        t        |       d      t        t        |      }t%        | |fi |}|2|t        t        |      k(  rt%        | t        t        |      fi |}|d}|R|rt        nt        }|t        t        fv rt)        | |fi |s|rt+        t&        | fddi|d      j-                          ne|3t)        | t        fi |r"t        |  dt        t        |       d | d!      t        |  dt        t        |       dt        t        |       d      |rt0        j3                  d%        |}n[t0        j3                  d% d&        n?|r=t
        j                  j                  |      r|}n||	|
|||||dd|d'}t%        | |fi |}d}|rt5        | ||	|
||||||(      \  }}||fS | gnd}||fS # t        $ r  t.        $ r)}t        d"|  d#|  d$t        t        |       d      |d}~ww xY w))zGet all the checkpoint filenames based on `pretrained_model_name_or_path`, and optional metadata if the
    checkpoints are sharded.
    This function will download the data if necessary.
    	cache_dirforce_downloadFproxieslocal_files_onlytokenrevisionmain	subfolder commit_hashNr   z.safetensors.index.jsonzadapter_model.binzThe transformers file in the config seems to be incorrect: it is neither a safetensors file (*.safetensors) nor a safetensors index file (*.safetensors.index.json): TzError no file named z found in directory r  z, or z, found in directory )ra  r^  r`  r\  r_  )r]  rX  rc   _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries_commit_hashDISABLE_SAFETENSORS_CONVERSIONz& does not appear to have a file named z or zX and thus cannot be loaded with `safetensors`. Please do not set `use_safetensors=True`.ignore_errors_during_conversionzThread-auto_conversion)targetargskwargsr
  z) but there is a file without the variant z;. Use `variant=None` to load this model from those weights.zCan't load the model for 'z'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'z=' is the correct path to a directory containing a file named zloading weights file z from cache at )r\  r]  r^  r_  r`  rX  ra  rc  rf  rg  rh  )	r\  r]  r^  r_  r`  rX  ra  rc  rh  )rh   r   r   r   r   r   rU  isdirjoinisfilerV  rP   rO   rR   rQ   OSErrorr   r\   rV   rL   rY   r   r  	Exceptionloggerinforj   )r   rR  rW  r   rX  rY  rZ  r   r\  r]  r^  r_  r`  ra  rc  re  
is_shardedis_localarchive_filefilenamehas_file_kwargscached_file_kwargscan_auto_convertresolved_archive_filesafe_weights_nameer   checkpoint_filess                               r   _get_resolved_checkpoint_filesr    s    &9)9O##K0I$(()95AN!!),G&**+=uE(E"":.8&H##K4I!%%m4K%1-66~FOmOvOv%P
 .1DD `568  J$0Y5F(+,I(J%77==!>?-9!ww||,I9Vtu;DDE^_
 -"''..:I|TegnGop3  "ww||19lK\^e>f  !-"''..:I|TkmtGuv3  "ww||19lKbdk>l  "
$:I|T`biGjk*  "ww||19l<Y`>a  %:I|TfhoGpq*  "ww||19lK]_f>g  "
 *<8I7+S*T U56a9 
 *<8I7+S*TTYZfgsu|Z}Y~ ++H*IL  WW^^BGGLL4QRS8LH .99;DDE^_
 -'(97C'g> %"&$4O #1(&499> +" "" $%% $,-MNN$ '&$ O Y )44QS[(r_q(r% )0XN_ahAi5i,75$%<gF- --)
 -8%)
(#v-2BJY =KASKG18Z :B*:608")#@"A B$$01BG$L#MTR^_vx  SA  RB Bz!z#  $0g#F0;981GY1-
 )0Xl\cAd5d,75$%7A- --)
 -8%)
 )4CM(?Sd% \3E$FF ()FHY m]l m,#2"?!A$Eu#cPb#c!9	
  %'
 *x5|0GV0 &<= >  ,\7 CD E  'y(ce  &<= >  ,\7 CDDVgipIqHrrsu $ KK/~>?$0!KK/zI^H_`a	77>>)$$-!
 '"0"$4($&499> +" %00My$o\n$o! -G)!)-!$.
**  --- 7T6_12ei---{    01N0O P99V8W X::F|U\:]9^^_a
 s   FZ [	 $[[	r  configr   r   c                    |du}| t        | t              r| dk(  rt        |d      r2|j                  &|j                  } t        j                  d|  d       nx|r
d|v r|d   } n*|t        |      } nt        |d   d|      }t        |      } t        j                  d	       n,t        t        |       rt        t        |       } nt        d
      t        | t              rt        t        |       n| } nBt        | t        t        j                  f      s"t        d|        t        j                         } ||j                  |        t        | t              r`| j                  dt        j                               }t        |t              rt        t        |      n|}t        j                  d| d       n| }||_        |j                   D ]  }	t        ||	      x}
||
_         ||fS )a  Find the correct `dtype` to use based on provided arguments. Also update the `config` based on the
    inferred dtype. We do the following:
    1. If dtype is "auto", we try to read the config, else auto-detect dtype from the loaded state_dict, by checking
    its first weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype
    2. Else, use the dtype provided as a dict or str
    Nautor   zWill use dtype=z$ as defined in model's config objectr   r   r   z{Since the `dtype` attribute can't be found in model's config object, will use dtype={dtype} as derived from model's weightsze`dtype` provided as a `str` can only be `'auto'`, or a string representation of a valid `torch.dtype`z`dtype` can be one of: `torch.dtype`, `'auto'`, a string of a valid `torch.dtype` or a `dict` with valid `dtype` for each sub-config in composite configs, but received rd  zUsing different dtypes per module is deprecated and will be removed in future versions Setting different dtypes per backbone model might cause device errors downstream, therefore setting the dtype=z for all modules.)r   r   hasattrr   rs  rt  r   r   r   r  r   r   r   update_dtyper   warning_oncesub_configs)r   r  r  r   r   r   r   ru  
main_dtypesub_config_key
sub_configs              r   
_get_dtyper    s    "-JeS!67+0H"LLEKK/%8\ ]^!g1A&A 0 9#/ 4Z @%4,Q/fS_&
 !5Z @KKQ &u- { 
 .8s-CGE5)EED%++#67JJOR  '')!!%( %YYr5#:#:#<=
3=j#3NWUJ/T^
!!+,=?	
 
 FL ,, *!&.99JF)J* :r   c                       e Zd ZdZdZy)PipelineParallelr   r   N)r   r   r   inputsoutputsr   r   r   r  r  F  s    FGr   r  c            	           e Zd ZdZedej                  fd       Zedej                  fd       ZdedefdZ	e
d        Z	 dd	ed
eedf   dej                  dz  defdZddededefdZy)ModuleUtilsMixinzH
    A few utilities for `torch.nn.Modules`, to be used as a mixin.
    r   c                 B    t        d | j                         D              S )z
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        c              3   4   K   | ]  }|j                     y wr   r   r+  params     r   r-  z*ModuleUtilsMixin.device.<locals>.<genexpr>V  s     @UELL@s   r   
parametersr   s    r   r   zModuleUtilsMixin.deviceP  s     @doo.?@@@r   c                 B    t        d | j                         D              S )zw
        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
        c              3   V   K   | ]!  }|j                         s|j                   # y wr   )r   r   r  s     r   r-  z)ModuleUtilsMixin.dtype.<locals>.<genexpr>]  s     \E%BYBYB[EKK\s   ))r  r   s    r   r   zModuleUtilsMixin.dtypeX  s    
 \T__->\\\r   encoder_attention_maskc                     |j                         dk(  r|dddddddf   }|j                         dk(  r|ddddddf   }j                  | j                        }d|z
  t        j                  | j                        j
                  z  }|S )z
        Invert an attention mask (e.g., switches 0. and 1.).

        Args:
            encoder_attention_mask (`torch.Tensor`): An attention mask.

        Returns:
            `torch.Tensor`: The inverted attention mask.
           Nr  r         ?)dimr   r   r   finfomin)r   r  encoder_extended_attention_masks      r   invert_attention_maskz&ModuleUtilsMixin.invert_attention_mask_  s     "%%'1,.DQaQR].S+!%%'1,.DQdTUEU.V+ +J*L*LSWS]S]*L*^'+.1P+PTYT_T_`d`j`jTkToTo*o'..r   c                    |j                   }| \  }}t        j                  ||      }|d d d d f   j                  ||d      |d d d d f   k  }|j	                  |j
                        }|j                  d   |j                  d   k  r[|j                  d   |j                  d   z
  }t        j                  t        j                  |||f||j
                        |gd      }|d d d d d d d f   |d d d d d d f   z  }|S )Nr  r   r   r   r   axis)	r   r   arangerepeatr   r   shapecatones)	input_shapeattention_maskr   
batch_size
seq_lengthseq_idscausal_maskprefix_seq_lenextended_attention_masks	            r   *create_extended_attention_mask_for_decoderz;ModuleUtilsMixin.create_extended_attention_mask_for_decoderu  s   &&!,
J,,z&9dD!m,33J
ANRYZ^`acgZgRhh!nn^%9%9:Q."6"6q"99+11!4{7H7H7KKN))JJ
JGPV^i^o^op K #.aq!m"<~aQUW[]^N^?_"_&&r   Nr  r  .r   c                    || j                   }|j                         dk(  r|dddddddf   }nk|j                         dk(  r<t        | j                  dd      rt        j                  ||      }n*|ddddddf   }nt        d| d|j                   d      |j                  |      }d	|z
  t        j                  |      j                  z  }|S )
a  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        Nr  r  
is_decoderz!Wrong shape for input_ids (shape z) or attention_mask (shape )r  r  )r   r  r  r  r  r  r   r  r   r   r  r  )r   r  r  r   r  s        r   get_extended_attention_maskz,ModuleUtilsMixin.get_extended_attention_mask  s    $ =JJE 1$&4Qa]&C#!Q& t{{L$7*:*e*e+' +9D$9I*J'3K=@[\j\p\p[qqrs  #:"<"<5"<"I#&)@#@EKKPUDVDZDZ"Z&&r   only_trainableexclude_embeddingsc                 >   |r@| j                         D cg c]%  \  }}t        |t        j                        s!| d' }}}t	        | dd      }|rddl}d}| j                         D ]  \  }}	|r|v r|	j                  s|r|r|t        |	j                  j                        r\t        |	d      r|	j                         }
n%t        |	d      r|	j                  j                  }
nd}
||	j                         d	z  |
z  z  }||	j                         z  } |S c c}}w )
a  
        Get number of (optionally, trainable or non-embeddings) parameters in the module.

        Args:
            only_trainable (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of trainable parameters

            exclude_embeddings (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of non-embeddings parameters

        Returns:
            `int`: The number of parameters.
        z.weightis_loaded_in_4bitFr   Nr   quant_storager   r  )r  r   r   	Embeddingr  bitsandbytesnamed_parametersrJ  
Params4bitr  r   r  itemsizenumel)r   r  r  r
  module_typeembedding_param_namesr  bnbtotal_paramsr  	num_bytess              r   num_parameterszModuleUtilsMixin.num_parameters  s!    :>:L:L:N%%6T;R\]hjljvjvRw4& %! % $D*=uE&002 	2KD%!d.C&C"". %E366;L;L)Mun5$)$6$6$8	 8$)$7$7$@$@	$%	 EKKMA$5	$AAL EKKM1L	2" 5%s
   "DDr   FF)r   r   r   r   r   r   r   r   r   r  staticmethodr  tupler   r  r   r  r   r   r   r  r  K  s     A A A ]u{{ ] ]/F /v /, ' '2 %)	/'/' 38_/' {{T!	/'
 
/'b*T *t *`c *r   r  c                   `    e Zd ZdZdZdej                  fdZdej                  fdZd Z	d Z
y	)
EmbeddingAccessMixinz
    Base utilities to regroup getters and setters for embeddings.
    Introduces the `input_layer_embed` attribute, which indicates
    where the input embeddings come from and where they
    should be set.
    embed_tokensr   c                    t        | dd      }t        | |d      x}|S t        | d      r,t        | j                  |      rt        | j                  |      S t        | d      r,t        | j                  |      rt        | j                  |      S t        | d      r"| j                  }||| ur|j                         S t        d| j                  j                   d      )	z
        Returns the model's input embeddings.

        Returns:
            `nn.Module`: A torch module mapping vocabulary to hidden states.
        _input_embed_layerr  N
embeddingsr%  
base_modelu.   `get_input_embeddings` not auto‑handled for "; please override in the subclass.)	r  r  r  r%  r  get_input_embeddingsNotImplementedError	__class__r   )r   r
  default_embeddingr  s       r   r  z)EmbeddingAccessMixin.get_input_embeddings  s     t1>B ")tT!::G$$4&74??D+I4??D114!gdjj$&?4::t,,4&J%*D*@!6688!<T^^=T=T<UUwx
 	
r   valuec                    t        | dd      }t        | |      rt        | ||       yt        | d      r.t        | j                  |      rt        | j                  ||       yt        | d      r.t        | j                  |      rt        | j                  ||       yt        | d      r*| j
                  | ur| j
                  j                  |       yt        d| j                  j                   d      )	a  Fallback setter that handles **~70%** of models in the code-base.

        Order of attempts:
        1. `self.<_input_embed_layer>` (direct attribute)
        2. `self.embeddings.<_input_embed_layer>` (nested embeddings for vision/audio models)
        3. `self.model.<_input_embed_layer>` (encoder/decoder models)
        4. delegate to the *base model* if one exists
        5. otherwise raise `NotImplementedError` so subclasses still can (and
            should) override for exotic layouts.
        r  r  r  r%  r  u.   `set_input_embeddings` not auto‑handled for r  N)
r  r  rM  r  r%  r  set_input_embeddingsr  r  r   )r   r  r
  s      r   r  z)EmbeddingAccessMixin.set_input_embeddings  s     t1>B4D$&T<(WT__d-KDOOT51T7#

D(ADJJe,T<(T__D-HOO007%@AXAX@YY{| r   c                 t    t        | d      sy 	 | j                          | j                  S # t        $ r Y y w xY w)Nlm_head)r  r  r  r  r   s    r   get_output_embeddingsz*EmbeddingAccessMixin.get_output_embeddings.  sB    tY'	 %%' || # 		s   + 	77c                 ,    t        | d      r|| _        yy)ze
        Sets the model's output embedding, defaulting to setting new_embeddings to lm_head.
        r  N)r  r  )r   new_embeddingss     r   set_output_embeddingsz*EmbeddingAccessMixin.set_output_embeddings9  s     4#)DL $r   N)r   r   r   r   r  r   Moduler  r  r  r  r   r   r   r  r    s9     (
bii 
:")) <	*r   r  c                       e Zd ZU dZdZee   dz  ed<   dZdZ	e
ed<   dZeed<   dZee
   dz  ed<   d	Ze
ed
<   dZe
ee
   z  ed<   dZee
   ee
   z  dz  ed<   dZe
ee
   z  dz  ed<   dZee
   ee
   z  dz  ed<   dZee
   ee
   z  dz  ed<   dZee
e
f   ed<   i Zee
e
f   ed<   dZee
   dz  ed<   dZee
   dz  ed<   dZee
   dz  ed<   dZeed<   dZeed<   dZeed<   dZee
e
f   ed<   dZ dZ!ee
e"f   dz  ed<   dZ#eed<   dZ$eed<   dZ%eed<   dZ&edz  ed<   e'e(jR                  jT                  dee
e+f   fd               Z,e'dee
e(jZ                  f   fd!       Z. fd"Z/d#ef fd$Z0d% Z1e'dee
e
f   fd&       Z2e'dee
e3e
e
f   f   fd'       Z4e2jj                  d(ee
e
f   dz  fd)       Z2e4jj                  d(ee
e3e
e
f   f   fd*       Z4dd+Z6d, Z7d-ee
   e
z  ddfd.Z8e9d/        Z:e'de;jx                  fd0       Z=e9defd1       Z>dd2edefd3Z?dd2edefd4Z@dd2edefd5ZAdefd6ZBdd2edefd7ZC	 dd8e
dz  d2ede
fd9ZDd:e
dz  de
fd;ZEdd<e
dz  d2ede
fd=ZFd>e
dz  de
fd?ZGe9defd@       ZHe9defdA       ZId8e
ez  fdBZJd:e
ez  fdCZKdD ZLdE ZMddFe
dz  fdGZNddFe
dz  fdHZOdI ZPdJ ZQ e(j                         dK        ZSdL ZT e(j                          eUj                         dM               ZWddNedefdOZXddQee
   dz  dRefdSZYdT ZZ	 	 	 ddUe[dz  dVe[dz  dWede;j                  fdXZ]ddYZ^	 	 	 ddZe;j                  dUe[dz  dVe[dz  dWede;j                  f
d[Z_	 	 	 dd\e;j                  dUe[dz  d]edWede;j                  f
d^Zad_ Zb	 dd]efd`Zcda Zddb Zedce[fddZfde;j                  e3e;j                     z  fdeZgdf ZhddgZidPejfdhediekfdjZldk Zme'defdl       Zn	 	 	 	 	 	 	 	 ddme
eoj                  z  dnedoedz  dpedqe[e
z  dre
dz  dse
ez  dz  dteduefdvZq eresj                         fdw       ZtddxZu ere(jv                  jx                  j                         fdy       Zv ere(jv                  jx                  j                         fdz       Zw fd{Zx fd|Zye9d}e(j                  d~edefd       Z{d}e(j                  defdZ|dde}dz  fdZ~e9dddddddddPd	dee   de
eoj                  z  dz  d#ee
z  eoj                  z  dz  de
eoj                  z  dz  dedededse
ez  dz  de
dedz  dedefd       Zedd doedz  dee
   dz  dede3eef   f
d       Zedededefd       ZddZe9dd       Zd Ze'd        Ze'd        Ze'd        Ze'd        Zejj                  d        ZddZe'defd       Zejj                  deddfd       Zdedz  dekfdZe9d        ZdQee
   ddfdZdQee
   dedz  dddedz  ddf
dZd~eddfdZdeddfdZd Zde
fdZ	 ddededee3e
e(jZ                  f      fdZddef fdZd Z xZS )rz   a  
    Base class for all models.

    [`PreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading,
    downloading and saving models as well as a few methods common to all models to:

        - resize the input embeddings

    Class attributes (overridden by derived classes):

        - **config_class** ([`PreTrainedConfig`]) -- A subclass of [`PreTrainedConfig`] to use as configuration class
          for this model architecture.
        - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
          classes of the same architecture adding modules on top of the base model.
        - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
          models, `pixel_values` for vision models and `input_values` for speech models).
        - **can_record_outputs** (dict):
    Nconfig_classrd  base_model_prefixF_is_stateful
model_tags	input_idsmain_input_nametextinput_modalities_no_split_modules_skip_keys_device_placement_keep_in_fp32_modules_keep_in_fp32_modules_strictr  _checkpoint_conversion_mapping_keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpected_keys_to_ignore_on_save_supports_sdpa_supports_flash_attn_supports_flex_attn_tp_plan_pp_plansupports_gradient_checkpointing_can_compile_fullgraph_supports_attention_backend_can_record_outputsr   c                 "    | j                   xs i S )a  
         Maps output names (e.g., "attentions", "hidden_states")
         to either:
             - A module class (e.g., `LlamaDecoderLayer`), using default index conventions:
                 * index=0 for "hidden_states"
                 * index=1 for "attentions"
             - Or an `OutputRecorder(...)` with `target_class`, optional `index`, and `layer_name`.

         Examples:
             These two are equivalent:

         ```python
             _can_record_outputs = {
                 "attentions": LlamaAttention,
                 "hidden_states": LlamaDecoderLayer
             }

             _can_record_outputs = {
                 "attentions": OutputRecorder(LlamaAttention, index=1),
                 "hidden_states": OutputRecorder(LlamaDecoderLayer, index=0)
             }
        ```

         This means you can record outputs from the same class, by specifying a layer name. Before
         collecting outputs, we check that they come from this layer.

         If you have cross attention that come from `LlamaAttention` and self attention that also
         come from `LlamaAttention` but from `self_attn` you can do this:

         ```python
         class LlamaModel(PreTrainedModel):
             _can_record_outputs = {
                 "attentions": OutputRecorder(LlamaAttention, index=1, layer-name="self_attn"),
                 "cross_attentions": OutputRecorder(LlamaAttention, index=1, layer_name="cross_attn")
             }

        ```
        )r  r   s    r   can_record_outputsz"PreTrainedModel.can_record_outputs  s    R ''-2-r   c                 8    dt        j                  t              iS )z^
        `dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
        r  )r   r   rN   r   s    r   dummy_inputszPreTrainedModel.dummy_inputs  s    
 U\\,788r   c                 R   t        |   di | | j                  j                  di       j                  dd       }| j                  j                  dd       }t	        |       j                  dd       }| j
                  }||| _        y ||| _        y ||| _        y ||| _        y y )Nr   r  r  r   )super__init_subclass____dict__r   r   r  )clsrm  child_annotationchild_attributefull_annotationfull_attributer  s         r   r  z!PreTrainedModel.__init_subclass__  s    !+F+ <<++,=rBFFxQUV,,**>4@ )-11(DA)) &.C)/C'-C(.C )r   r  c                 V   t         |           t        |t              s:t	        d| j
                  j                   d| j
                  j                   d      || _        |j                  | _        | j                  | j                  j                  d      | j                  _        | j                  | j                  j                        | j                  _        | j                         rt!        j"                  |      | _        | j
                  j                  }|t&        vrYddj)                  t&               d}t+        j,                  || j
                  j                        }t/        |      d	kD  r|d	   }nd }|| _        | j2                  t4        t7        | j
                        <   y )
NzParameter config in `zt(config)` should be an instance of class `PreTrainedConfig`. To create a model from a pretrained model use `model = z(.from_pretrained(PRETRAINED_MODEL_NAME)`Tis_init_check(|r  r   )r  __init__r   r   	TypeErrorr  r   r  name_or_path%_check_and_adjust_attn_implementation_attn_implementation_attn_implementation_internal(_check_and_adjust_experts_implementation_experts_implementation _experts_implementation_internalcan_generater(   from_model_configgeneration_configrD   ro  r)  findallr   	loss_typer  rp   r   )r   r  r  rm  r  loss_groupsr  s         r   r  zPreTrainedModel.__init__  si   &"23'(?(?'@ A NN3344\^ 
 "// 594^4^KK,,D 5_ 5
1
 8<7d7dKK//8
4 %5%G%G%OD" NN++	L(chh|45Q7K

;0G0GHI9~!%aL	 	"484L4LS01r   c                 Z   i i i c| _         | _        | _        | j                  | u r| j                  j
                  $| j                  j
                  j                         ni | _        | j                  j                  $| j                  j                  j                         ni | _         | j                  j                  $| j                  j                  j                         ni | _        | j                  d      | _
        t        | j                  xs g       | _        t        | j                  xs g       | _        t        | j                  xs g       | _        | j                         D ]  \  }}t!        |dd      x}rP| j                  j#                  |j                         j%                         D ci c]  \  }}| d| | c}}       t!        |dd      x}rP| j                   j#                  |j                         j%                         D ci c]  \  }}| d| | c}}       t!        |dd      x}rP| j                  j#                  |j                         j%                         D ci c]  \  }}| d| | c}}       t!        |dd      x}rU| j                  j#                  |j                         j%                         D ci c]  \  }}| d| | d|  c}}       t!        |d	d      x}r| j                  j#                  |       t!        |d
d      x}r| j                  j#                  |       t!        |dd      x}	s| j                  j#                  |	       
 | j'                          | j)                          yc c}}w c c}}w c c}}w c c}}w )a  
        A method executed at the end of each Transformer model initialization, to execute code that needs the model's
        modules properly initialized (such as weight initialization).
        It is also used to obtain all correct static properties (parallelism plans, tied_weights_keys, _keep_in_fp32_modules, etc)
        correctly in the case of composite models (that is, the top level model should know about those properties from its children).
        NFall_submodels_ep_planr  r  r  all_tied_weights_keysr  r  r  )r  r#  r  r  r  base_model_pp_plancopybase_model_tp_planbase_model_ep_planget_expanded_tied_weights_keysr$  r!  r  r  r  named_childrenr  updater.  init_weights._backward_compatibility_gradient_checkpointing)
r   r
  r  planr   v	tied_keys	keep_fp32keep_fp32_strictno_splits
             r   	post_initzPreTrainedModel.post_init  s2    79"b3t}dm??d"EI[[EcEcEoDKK::??AuwDMEI[[EcEcEoDKK::??AuwDMEI[[EcEcEoDKK::??AuwDM%)%H%HW\%H%]"%()C)C)Ir%J",/0Q0Q0WUW,X)!$T%;%;%Ar!B !//1 	8LD&vz488t8$$499;CTCTCV%W41aasmQ&6%WXvz488t8$$499;CTCTCV%W41aasmQ&6%WXvz488t8$$499;CTCTCV%W41aasmQ&6%WX#F,CTJJyJ**11\e\j\j\l\r\r\t2uTXTUWXdV1QC=TF!A3-3O2uv#F,CTJJyJ**11)<#*63QSW#XXX11889IJ"6+>EExE&&--h7%	8* 	;;=' &X%W%W 3vs   5NN3N!N'c                     t        | j                  d      r,| j                  j                  j                  r| j                  S | j
                  S )z:
        The full tp plan for the model's modules
        distributed_config)r  r  r6  enable_expert_parallelr#  r  r   s    r   tp_planzPreTrainedModel.tp_plan3  s9    
 4;; 45$++:X:X:o:o== }}r   c                     | j                   S r   r  r   s    r   pp_planzPreTrainedModel.pp_plan<  s    }}r   r.  c                    |i | _         y t        |t              st        d      |j	                         D ]<  \  }}|t
        vst        d| d| dt        t        j                                       | j                         D cg c]  \  }}|	 }}}|j                         D ]S  }|j                  dd      }d}|D ]  }	t        j                  ||	      sd} n |r;t        j                  d	| d
       U || _         y c c}}w )Nz&Can only set a dictionary as `tp_plan`z#Unsupported tensor parallel style 'z' for layer 'z'. Supported styles are *z\d+FTzLayer pattern 'z' does not match any parameters in the model. This rule may not be applied during tensor parallelization, or may lead to dimension mismatches)r  r   r   r   r.  r=   r   r   r  replacer)  matchwarningswarn)
r   r.  layer_patternparallel_styler
  r  model_param_namesregex_patternpattern_matchedrH  s
             r   r8  zPreTrainedModel.tp_plan@  s1   <DM$%EFF .2ZZ\ 	)M>%88 9.9IWdVe f,,01D1I1I1K,L+MO 	 261F1F1HIgdATII!YY[ 	M)11#v>M#O/ 
88M:6&*O #%m_ 5d d	 ! Js   Dc                     || _         y r   r:  )r   r.  s     r   r;  zPreTrainedModel.pp_pland  s	    r   c                 \    t        | dd      }|t        d      |j                  | |      S )z
        Potentially dequantize the model in case it has been quantized by a quantization method that support
        dequantization.
        r   Nz?You need to first quantize your model in order to dequantize itr  )r  r   
dequantize)r   r   r   s      r   rI  zPreTrainedModel.dequantizeh  s:    
 t^T:^__&&t5&99r   c                     | j                   r?t        | j                  dd      r'| j                          t	        | j                  d       y y y )Ngradient_checkpointingF)r  r  r  gradient_checkpointing_enabledelattrr   s    r   r-  z>PreTrainedModel._backward_compatibility_gradient_checkpointingt  s@    //GDKKIach4i..0DKK!9: 5j/r   tagsc                     t        |t              r|g}| j                  g | _        |D ],  }|| j                  vs| j                  j                  |       . y)a\  
        Add custom tags into the model that gets pushed to the Hugging Face Hub. Will
        not overwrite existing tags in the model.

        Args:
            tags (`Union[list[str], str]`):
                The desired tags to inject in the model

        Examples:

        ```python
        from transformers import AutoModel

        model = AutoModel.from_pretrained("google-bert/bert-base-cased")

        model.add_model_tags(["custom", "custom-bert"])

        # Push the model to your namespace with the name "my-custom-bert".
        model.push_to_hub("my-custom-bert")
        ```
        N)r   r   r  r  )r   rN  tags      r   add_model_tagszPreTrainedModel.add_model_tagsz  sS    , dC 6D??" DO 	,C$//)&&s+	,r   c                    |j                  d|j                        }|j                  dd      x}(t        j                  d       ||j                  k7  r|n|}t	        |t
              rt        t        |      }d|v r|j                  d      |_        d|v r|j                  d      |_	        g }|%|j                  t        || j                               t               rbt        s\t        sVt        j!                  d       ddl}|j%                  |j&                  j)                  t+               	      t-               g       t/        |      5   | |fi |}ddd       |S # 1 sw Y   S xY w)
z
        All context managers that the model should be initialized under go here.

        Args:
            dtype (`torch.dtype`, *optional*):
                Override the default `dtype` and load the model under this dtype.
        r   torch_dtypeNz1`torch_dtype` is deprecated! Use `dtype` instead!attn_implementationexperts_implementation@Detected DeepSpeed ZeRO-3: activating zero.init() for this modelr   config_dict_or_path)r  r   rs  r  r   r   r  r   r  r  r  r   r   r+   r   r   rt  	deepspeedr  zeroInitr*   r   rS   )r  r  rm  r   rS  init_contextsrY  r%  s           r   _from_configzPreTrainedModel._from_config  sC    

7FLL1!::mT::KG ST"fll2EEeS!E5)E !F**0**5J*KF' $v--3ZZ8P-QF*  !25#,,!GH%'FXKKZ[   ).."5"5JZJ\"5"]_n_p!qr ]+ 	*)&)E	* 	* s   
E!!E+c                 0    t        | | j                  |       S )z@
        `torch.nn.Module`: The main body of the model.
        )r  r  r   s    r   r  zPreTrainedModel.base_model  s    
 tT33T::r   c                    dt        | j                        v ry| j                  D ]/  }t        |d      sdt        |      vs|j                         s/ y t        | d      r"t        j                  | j                   d       y)a  
        Returns whether this model can generate sequences with `.generate()` from the `GenerationMixin`.

        Under the hood, on classes where this function returns True, some generation-specific changes are triggered:
        for instance, the model instance will have a populated `generation_config` attribute.

        Returns:
            `bool`: Whether this model can generate sequences with `.generate()`.
        GenerationMixinTr  rz   prepare_inputs_for_generationu6   has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.F)r   	__bases__r  r  rs  warningr   )r  bases     r   r  zPreTrainedModel.can_generate  s     CMM 22MM 	D40 D	1d6G6G6I		 378NN<<. 	!  	  r   r  c                 <   | j                   j                  }| j                  sFt        | dd      s9t	        | j
                  j                   d| j                   j                   d      t               sd}d}t               rt        j                  d       yt               r!t        j                  d	t        d
    d       yt        j                  j!                  d      t#        | d|       t%        j&                  t        j(                  j%                  d            }t*        j$                  j,                  rg|t%        j&                  d      k  rt#        | d| d|       t*        j,                  j/                         st	        | d      t#        | d|       t*        j$                  j0                  r;|t%        j&                  d      k  rt#        | d| d|       t#        | d|       |t        j3                  d       nT|R|t*        j4                  t*        j6                  fvr0t        j3                  d| j
                  j                   d| d       |st9        | j;                         D ch c]  }|j<                   c}      }t?        |      dk(  rq|d   j@                  dk(  r_t*        j,                  j/                         rt        j3                  d       ytC               rt        j3                  d       yt	        d      yc c}w )aN  
        Check the availability of Flash Attention 2 for a given model.

        Args:
            is_init_check (`bool`, *optional*):
                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
                fully instantiated. This is needed as we also check the devices of the weights, which are only available
                later after __init__. This allows to raise proper exceptions early before instantiating the full models
                if we know that the model does not support the requested attention.
        _supports_flash_attn_2Fz does not support Flash Attention 2.0 yet. Please request to add support where the model is hosted, on its model hub page: https://huggingface.co/k/discussions/new or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/newzVFlashAttention2 has been toggled on, but it cannot be used due to the following error:zPlease refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.z+Detect using FlashAttention2 on Ascend NPU.Tz*Detect using FlashAttention2 (via kernel `r|   z
`) on XPU.
flash_attnz3 the package flash_attn seems to be not installed. z2.1.0zY you need flash_attn package version to be greater or equal than 2.1.0. Detected version z. z\ Flash Attention 2 is not available on CPU. Please make sure torch can access a CUDA device.z% Flash Attention 2 is not available. z2.0.4zY you need flash_attn package version to be greater or equal than 2.0.4. Detected version zuYou are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviourzaFlash Attention 2 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in  is a;  . You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", dtype=torch.float16)`r   r   r   zYou are attempting to use Flash Attention 2 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.zYou are attempting to use Flash Attention 2 with a model not initialized on MLU. Make sure to move the model to MLU after initializing it on CPU with `model.to('mlu')`.a+  You are attempting to use Flash Attention 2 with a model not initialized on GPU and with no GPU available. This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map or initialising the model on CPU and then moving it to GPU.)"r  r   r  r  r   r  r   _name_or_pathr]   rc   rs  rt  rd   FLASH_ATTN_KERNEL_FALLBACK	importlibutil	find_specImportErrorr   parsemetadatar   cudar   hipr  float16bfloat16r   r  r   r   r0  rb   )r   r  r   prefaceinstall_messageflash_attention_versionr  param_devicess           r   _flash_attn_2_can_dispatchz*PreTrainedModel._flash_attn_2_can_dispatch  sT    !! ))WT;SUZ-[>>**+ ,WW[WbWbWpWpVq rnn  )*nG pO &'IJ%'@A[\oAp@qq{| ~~''5=!WI-`ap`q"rss +2--	8J8J8R8RS_8`*a'==%%.w1GG)&i  (A  BY  AZ  Z\  ]l  \m  n  #ZZ446(&i  (D  E  *WI5Z[jZk*lmm]]&&.w1GG)&i  (A  BY  AZ  Z\  ]l  \m  n  *WI5Z[jZk*lmm= H 50O#O((,(?(?'@UG LBB  DOO<M!N5%,,!NOM=!Q&=+;+@+@E+I::**,''Q"  ,-''P  %V  ) "Os   ?Lc                 h   | j                   j                  }| j                  s9t        | j                  j
                   d| j                   j                   d      t               sd}t        j                  j                  d      t        | d      t        j                  j                         rdt        j                  j                         \  }}|dk  r0t        | dt        j                  j                          d| d	      t        | d
      t        | d      |t         j#                  d       nT|R|t        j$                  t        j&                  fvr0t         j#                  d| j                  j
                   d| d       t)        | j                   dd      st)        | j                   dd      rt        d      t+        | j                   d      r<| j                   j,                  dkD  r#t        d| j                   j,                   d      |st/        | j1                         D ch c]  }|j2                   c}      }t5        |      dk(  rQ|d   j6                  dk(  r?t        j                  j                         rt         j#                  d       yt        d      yc c}w )aN  
        Check the availability of Flash Attention 3 for a given model.

        Args:
            is_init_check (`bool`, *optional*):
                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
                fully instantiated. This is needed as we also check the devices of the weights, which are only available
                later after __init__. This allows to raise proper exceptions early before instantiating the full models
                if we know that the model does not support the requested attention.
        z does not support Flash Attention 3 yet. Please request to add support where the model is hosted, on its model hub page: https://huggingface.co/rg  zVFlashAttention3 has been toggled on, but it cannot be used due to the following error:flash_attn_3z4 the package flash_attn_3 seems to be not installed.	   zA Flash Attention 3 requires compute capability >= 9.0, but found z with compute capability z.0.z$ Flash Attention 3 is not available.z\ Flash Attention 3 is not available on CPU. Please make sure torch can access a CUDA device.zuYou are attempting to use Flash Attention 3 without specifying a torch dtype. This might lead to unexpected behaviourzaFlash Attention 3 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in ri  a?  . You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="flash_attention_3", dtype=torch.float16)`alibiF	use_alibizNModel is configured to use ALiBi, which is not supported by Flash Attention 3.attention_dropoutr   zModel has attention_dropout=z., which is not supported by Flash Attention 3.r   r   zYou are attempting to use Flash Attention 3 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.a+  You are attempting to use Flash Attention 3 with a model not initialized on GPU and with no GPU available. This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map or initialising the model on CPU and then moving it to GPU.T)r  r   r  r   r  r   rj  r^   rl  rm  rn  ro  r   rr  r   get_device_capabilityrs  r  rt  ru  r  r  r  r   r  r   r   r0  )r   r  r   rv  majorr  r  ry  s           r   _flash_attn_3_can_dispatchz*PreTrainedModel._flash_attn_3_can_dispatchR  s    !!((>>**+ ,WW[WbWbWpWpVq rnn  )*nG~~''7?!WI-a"bcczz&&( ::;;=q19$")#dejeoeo  fF  fF  fH  eI  Ib  ch  bi  il  m  &	1U&VWW i{|  = H 50O#O((,(?(?'@UG LFF 4;;/74;;UZ3[mnn 4;; 349V9VYZ9Z.t{{/L/L.MM{| 
  DOO<M!N5%,,!NOM=!Q&=+;+@+@E+I::**,''Q  %V   "Os   5J/c                    | j                   s"t        | j                  j                   d      t        j
                  j                  t        j                  j                         dkD  rwt        j                  t        j                        t        j                  d      k  r>t        j                  d       t        j                  j                  j                  d       y)aA  
        Check the availability of SDPA for a given model.

        Args:
            is_init_check (`bool`, *optional*):
                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
                fully instantiated. This is needed as we also check the devices of the weights, which are only available
                later after __init__. This allows to raise proper exceptions early before instantiating the full models
                if we know that the model does not support the requested attention.
        a   does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet. Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`r   z2.4.1zUsing the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.FT)r  r   r  r   r   r   rs  rr  device_countrp  ru   rs  r  backendsenable_flash_sdpr   r  s     r   _sdpa_can_dispatchz"PreTrainedModel._sdpa_can_dispatch  s     "">>**+ ,O O  MM)

'')A-e//07==3II y NN007r   c                     | j                         s"t        | j                  j                   d      t	               st        d      y)zI
        Check the availability of Grouped MM for a given model.
        z1 does not support setting experts implementation.zYPyTorch Grouped MM requirements in Transformers are not met. Please install torch>=2.9.0.T)_can_set_experts_implementationr   r  r   r_   ro  r   s    r   _grouped_mm_can_dispatchz(PreTrainedModel._grouped_mm_can_dispatch  sI    
 335 7 788ijkk&(k 
 r   c                     | j                   s"t        | j                  j                   d      t	               st        d      y)aK  
        Check the availability of Flex Attention for a given model.

        Args:
            is_init_check (`bool`, *optional*):
                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
                fully instantiated. This is needed as we also check the devices of the weights, which are only available
                later after __init__. This allows to raise proper exceptions early before instantiating the full models
                if we know that the model does not support the requested attention.
        a   does not support an attention implementation through torch's flex_attention. Please request the support for this architecture: https://github.com/huggingface/transformers/issues/34809. If you believe this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`z]PyTorch Flex Attention requirements in Transformers are not met. Please install torch>=2.5.0.T)r  r   r  r   ra   ro  r  s     r   _flex_attn_can_dispatchz'PreTrainedModel._flex_attn_can_dispatch  sS     ''>>**+ ,t t  ,-o 
 r   rT  c                    |}|duxr |j                  d      }|duxr* |j                  d      dk(  xs |j                  d      dk(  }|rs| j                  rgt               s]t	               sSt               rIt               s?t        |j                  d         }t               r|j                  d      dk(  rd}|rd| }t        |      r7	 |rt        |       nt        |       |rt        j                  d| d       |S | j%                  ||      }t'        |	      rt        |       |S # t        $ r<}|r3|j                  d      r| j!                          || j#                          |d}~ww xY w)
a  
        Check that the `attn_implementation` exists and is supported by the models, and try to get the kernel from hub if
        it matches hf kernels pattern.

        Args:
            attn_implementation (`str` or `None`):
                The attention implementation to check for existence/validity.
            is_init_check (`bool`, *optional*):
                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
                fully instantiated. This is needed as we also check the devices of the weights, which are only available
                later after __init__. This allows to raise proper exceptions early before instantiating the full models
                if we know that the model does not support the requested attention.

        Returns:
            `str`: The final attention implementation to use, including potential fallbacks from sdpa to eager, or from
            None to sdpa (to potentially eager).
        Nzpaged|r|   r}   Fz/You do not have `flash_attn` installed, using `z%` from the `kernels` library instead!2)"requested_attention_implementation)
startswithremoveprefixr  r]   r^   r`   rc   rk  rd   r9   rF   rE   rs  r  rr  r   rz  r  get_correct_attn_implementationrg   )r   rT  r  applicable_attn_implementationis_pagedrequested_original_flash_attnr~  s          r   r  z5PreTrainedModel._check_and_adjust_attn_implementation  s   ( *=&&d2_7J7U7UV^7_ )<4(G )
,,X6:MM Q"//9=PP 	&
 *)).04M4O$&*,-GH[HhHhiqHr-s*%',?,L,LX,VZm,m 16-39:X9Y1Z.3456TU/0NO 1''IJhIi j> >. .- .2-Q-Q..*
 ,Omn+,JK--'  	0*33C8779
  779 	s   4D, ,	E157E,,E1rU  c                 (    | j                  |      }|S )a>  
        Check that the `experts_implementation` exists and is supported by the models.

        Args:
            experts_implementation (`str` or `None`):
                The experts implementation to check for existence/validity.
        Returns:
            `str`: The final experts implementation to use.
        )"get_correct_experts_implementation)r   rU  !applicable_experts_implementations      r   r  z8PreTrainedModel._check_and_adjust_experts_implementation9  s     -1,S,STj,k)00r   requested_attentionc                    |dn|}|dgt         j                         z   vrTd| d}| j                  st        | dd      r|dz  }| j                  r|dz  }| j
                  r|d	z  }t        |d
z         d|v r| j                  |       |S d|v r| j                  |       |S d|v r| j                  |       |S d|v r	 | j                  |       |S |S # t        t        f$ r}|d|v r|d}Y d }~|S d }~ww xY w)Nsdpaeager Specified `attn_implementation="zc"` is not supported. The only possible arguments are `attn_implementation="eager"`, `"paged|eager"`rf  Fz, `"attn_implementation=flash_attention_3"`, `"attn_implementation=flash_attention_2"`, `"attn_implementation=paged|flash_attention_2"`zB, `"attn_implementation=sdpa"`, `"attn_implementation=paged|sdpa"`z(, `"attn_implementation=flex_attention"`r  r|   r}   flex_attention)ALL_ATTENTION_FUNCTIONS
valid_keysr  r  r  r  r   rz  r  r  r  ro  )r   r  r  applicable_attentionmessager~  s         r   r  z/PreTrainedModel.get_correct_attn_implementationF  si   )<)DvJ]y3J3U3U3W'WW23G2H IA A 
 ((GD:RTY,Z  e  e""__''EEWs]++ "66++M: $# !$88++M: $# !55((7 $# ++/''6 $### , /&2vAT7TG'.$##/s   ?C C8#
C33C8requested_expertsc                     |dn|}|dvrd| d}t        |      |dk(  r	 | j                          |S |S # t         t        f$ r}|dk(  r|d}Y d }~|S d }~ww xY w)N
grouped_mm)r  r  
batched_mmz#Specified `experts_implementation="z"` is not supported. The only possible arguments are `experts_implementation="eager"`, `"experts_implementation=grouped_mm"` and `"experts_implementation=batched_mm"`.r  )r   r  ro  )r   r  applicable_expertsr  r~  s        r   r  z2PreTrainedModel.get_correct_experts_implementationh  s    ->-F\L]%JJ56H5I JE E  W%% ----/ "!!! , -$4G%,"!!-s   6 A	AAc                    t         j                  | j                     }t        |d      sy|j                  }t        |dd      5 }|j                         }ddd       t        j                  d      r
d|v xr d	|v S y
# 1 sw Y   *xY w)zDetect whether the class supports setting its attention implementation dynamically. It is an ugly check based on
        opening the file, but avoids maintaining yet another property flag.
        __file__Frutf-8encodingNzclass \w+Attention\(nn.Module\)eager_attention_forwardz&ALL_ATTENTION_FUNCTIONS.get_interface(T)	sysmodulesr   r  r  openreadr)  r*  r  class_module
class_filer   codes        r   _can_set_attn_implementationz,PreTrainedModel._can_set_attn_implementation|  s    
 {{3>>2|Z0!**
*cG4 	668D	 997>,4i9aei9ii 	 	s   A??Bc                     t         j                  | j                     }t        |d      sy|j                  }t        |dd      5 }|j                         }ddd       d|v S # 1 sw Y   dv S xY w)zDetect whether the class supports setting its experts implementation dynamically. It is an ugly check based on
        opening the file, but avoids maintaining yet another property flag.
        r  Fr  r  r  Nz@use_experts_implementation)r  r  r   r  r  r  r  r  s        r   r  z/PreTrainedModel._can_set_experts_implementation  sq    
 {{3>>2|Z0!**
*cG4 	668D	 -44	 -44s   A""A.c                    t        |t              s|n%|j                  d| j                  j                        }|| j                  j                  k7  ra| j                         s-t        j                  | j                  j                   d       n$| j                  |d      }|| j                  _        | j                         D ]E  }|| us	t        |t              s|j                  j                  | j                  j                  k7  sHt        |j                  d      r_|j                         s-t        j                  |j                  j                   d       n|}t        |t              re| j                  j                  D ]L  }t!        | j                  |      |j                  u s&|j                  ||j                  j                        } n |j#                  |      }||j                  _        d|j                  _        H | j                  j                  D ]  }t!        | j                  |      x}t        |t              s|n|j                  ||j                        }t        |d      s|||j                  k7  rm|dgt&        j)                         z   vr/t+        d	| d
| dt-        t&        j)                                      ||_        t        j                  d| d| d       t        |d      s|` y)a  
        Set the requested `attn_implementation` for this model.

        Args:
            attn_implementation (`str` or `dict`):
                The attention implementation to set for this model. It can be either a `str`, in which case it will be
                dispatched to all submodels if relevant, or a `dict` where keys are the sub_configs name, in which case each
                submodel will dispatch the corresponding value.
        rd  z does not support setting its attention implementation dynamically, because it does not follow the functional approach based on AttentionInterface (see https://huggingface.co/docs/transformers/en/attention_interface)Fr  _attn_was_changedTNr  r  z"` is not supported for zd. The only possible arguments are "eager" (manual attention implementation)or one of the following: z8We set the attention implementation for the sub-config `z` to `z` without finding the associated sub-model. For this reason we could not check if the model supports it. You may encounter undefined behavior.)r   r   r   r  r  r  rs  rc  r  r   r  r  r  rz   r  r  r  r  r  r  r  r   r   )r   rT  requested_implementationr  sub_implementationsubconfig_key	subconfigs          r   set_attn_implementationz'PreTrainedModel.set_attn_implementation  s    148  $((T[[-M-MN 	! $t{{'G'GG446~~../ 0\ \ ,0+U+U,E ,V ,( =U9  "	:I %y/:$$..$++2G2GG	 0 02EF !==?NN$..778 9` ` *B&!"5t<-1[[-D-D &M&t{{MBiFVFVV5H5L5L$193C3C3X3X6" 2 !&& *3)R)RSe)f&EWI$$B 6:	  2E"	:J "[[44 	8M$T[[-@@	M &&94@ -,00	@^@^_ #  	+>?*i.L.LL)'=T=_=_=a1aa(>?Q>RRjkxjy z88<=T=_=_=a8b7ce 
 ?QI;NNRS`Raaghzg{ |@ @ y*=>%79	8r   c                    t        |t              s|n%|j                  d| j                  j                        }|| j                  j                  k7  r"| j                  |      }|| j                  _        | j                         D ]  }|| ust        |t              s|j                  j                  | j                  j                  k7  sG|}t        |t              re| j                  j                  D ]L  }t        | j                  |      |j                  u s&|j                  ||j                  j                        } n |j                  |      }||j                  _         y)a  
        Set the requested `experts_implementation` for this model.

        Args:
            experts_implementation (`str` or `dict`):
                The experts implementation to set for this model. It can be either a `str`, in which case it will be
                dispatched to all submodels if relevant, or a `dict` where keys are the sub_configs name, in which case each
                submodel will dispatch the corresponding value.
        rd  N)r   r   r   r  r  r  r  r  rz   r  r  r  r  )r   rU  r  r  r  r  s         r   set_experts_implementationz*PreTrainedModel.set_experts_implementation  s>    4d; #'++B0S0ST 	! $t{{'J'JJ'+'T'TUm'n$;SDKK8  	WI %y/:$$..$++2G2GG &>"4d;)-)@)@ ""4;;>)BRBRR1G1K1K -y/?/?/W/W2. "" &/%Q%QRd%e"DV	  A)	Wr   c                    d }g }t               }d}| j                         D ]  }t        |t              rt	        |d      s 	 |j                         }|t	        |d      s@t        |      }||v rP|j                  |       |j                  |j                  |             d} || _        |r
|d   | _        |s-t        j                  | j                  j                    d       yy# t        $ r Y w xY w)	z
        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
        the model weights fixed.
        c                 &    |j                  d       y NT)requires_grad_)r  inputoutputs      r   make_inputs_require_gradszMPreTrainedModel.enable_input_require_grads.<locals>.make_inputs_require_grads3  s    !!$'r   Fr  Nregister_forward_hookTr   a   does not expose input embeddings. Gradients cannot flow back to the token embeddings when using adapters or gradient checkpointing. Override `get_input_embeddings` to fully support those features, or set `_input_embed_layer` to the attribute name that holds the embeddings.)r!  r  r   rz   r  r  r  r/  r  r  r  _require_grads_hooks_require_grads_hookrs  r  r  r   )r   r  hooksseen_modulesfound_embeddingsr  input_embeddingsembedding_ids           r   enable_input_require_gradsz*PreTrainedModel.enable_input_require_grads-  s   	( u lln 	$Fv7GFLb<c#)#>#>#@   'w7GI`/a./L|+\*LL)??@YZ[#%	$( %*!',QxD$>>**+ ,w w  % ' s   C**	C65C6c                 ~    t        | dd      }|sy|D ]  }|j                           g | _        t        | d      r| `yy)z4
        Removes the `_require_grads_hook`.
        r  Nr  )r  remover  r  r  )r   r  hooks      r   disable_input_require_gradsz+PreTrainedModel.disable_input_require_gradsY  sQ     4d; 	DKKM	 %'!4./( 0r   modalityc                 :   |dv rg d}n|dk(  rg d}n|ddg}nt        d|       |D ]  }t        | |      st        | |      c S  | j                  | urCt        | j                  d      r-| j                  j	                  |	      }|| j                  k7  r|S | S )
ai  
        Best-effort lookup of the *encoder* module. If provided with `modality` argument,
        it looks for a modality-specific encoder in multimodal models (e.g. "image_encoder")
        By default the function returns model's text encoder if any, and otherwise returns `self`.

        Possible `modality` values are "image", "video" and "audio".
        imagevideovision_towervisualvision_modelvision_encoderimage_toweraudio)audio_toweraudio_encoderspeech_encodertext_encoderencoderHUnnrecognized modality, has to be "image", "video" or "audio" but found get_encoderr  )r   r  r  r  r  )r   r  possible_module_namesr
  base_encoders        r   r  zPreTrainedModel.get_encoderh  s     ))$o! $V!%3Y$?!ghpgqrss) 	+DtT"tT**	+ ??$&74??M+R??666IL t.## r   c                 *   |dv rg d}|dk(  rddg}n|ddg}nt        d	|       |D ]  }t        | |      st        | ||        y | j                  | ur<t        | j                  d
      r| j                  j	                  ||       y|| _        yy)zS
        Symmetric setter. Mirrors the lookup logic used in `get_encoder`.
        r  r  r  r  r  Nr  r  r  set_encoderr  )r   r  rM  r  r  r%  )r   r  r  r  r
  s        r   r  zPreTrainedModel.set_encoder  s     ))$o!w%2O$D!%3Y$?!ghpgqrss) 	DtT"dG,	
 ??$&t6++Gh+G$
	 'r   c                     g d}|D ]  }t        | |      st        | |      c S  | j                  | ur0t        | j                  d      r| j                  j                         S | S )a  
        Best-effort lookup of the *decoder* module.

        Order of attempts (covers ~85 % of current usages):

        1. `self.decoder/self.language_model/self.text_model`
        2. `self.base_model`                  (many wrappers store the decoder here)
        3. `self.base_model.get_decoder()`    (nested wrappers)
        4. fallback: raise for the few exotic models that need a bespoke rule
        )language_model
text_modeldecodertext_decoderget_decoder)r  r  r  r  )r   r  r
  s      r   r  zPreTrainedModel.get_decoder  sf     !\) 	+DtT"tT**	+ ??$&74??M+R??..00 r   c                     g d}|D ]  }t        | |      st        | ||        y | j                  | ur:t        | j                  d      r| j                  j                  |       y|| _        yy)zS
        Symmetric setter. Mirrors the lookup logic used in `get_decoder`.
        )r  r  r  Nset_decoder)r  rM  r  r  r%  )r   r  r  r
  s       r   r  zPreTrainedModel.set_decoder  sm    
 !L) 	DtT"dG,	
 ??$&t6++G4$
	 'r   c           	      $   t        | j                  d      r| j                  j                  xs d}nt        | j                  d      r| j                  j                  }nRt        | j                  d      r| j                  j                  }n%t        | j                  j                         dd      }t        |t        j                  t        j                  t        j                  t        j                  t        j                  t        j                  f      r\t        |dd      "t        j                   |j"                  d|       |j$                   t        j&                  |j$                         yyt        |t        j(                        rtt        j                   |j"                  d|       |j*                  Et        |j"                  d	d
      s-t        j&                  |j"                  |j*                            yyyt        |t        j,                        r|j/                          yt        |t        j0                  t        j2                  t        j4                  t        j6                  f      s0d|j8                  j:                  v sd|j8                  j:                  v rt        |dd      t        j<                  |j"                         t        |dd      t        j&                  |j$                         t        |dd      ^t        j&                  |j>                         t        j<                  |j@                         t        j&                  |jB                         yyd|j8                  j:                  v rt        |d      r|jD                  dk7  rtF        |jD                     n|jH                  } ||j                        \  }}t        jJ                  |jL                  |       t        jJ                  |jN                  |       yyy)ad  
        Initialize the weights. This is quite general on purpose, in the spirit of what we usually do. For more complex
        initialization scheme, it should be overridden by the derived `PreTrainedModel` class. In case a model adds an explicit
        `nn.Parameter`, this method should also be overridden in order to initialize it correctly.
        initializer_rangeg{Gz?init_stdinitializer_factorweightNg        meanstd_is_hf_initializedF	LayerNormRMSNormbiasrunning_meanRotaryEmbeddingoriginal_inv_freqdefault)(r  r  r  r  r  r  get_text_configr   r   LinearConv1dConv2dConv3dConvTranspose1dConvTranspose2dinitnormal_r  r  zeros_r  padding_idxMultiheadAttention_reset_parameters	GroupNormBatchNorm1dBatchNorm2dBatchNorm3dr  r   ones_r  running_varnum_batches_tracked	rope_typerG   compute_default_rope_parameterscopy_inv_freqr  )r   r  r  rope_fnbuffer_valuer  s         r   _init_weightszPreTrainedModel._init_weights  s    4;; 34++//74CT[[*-++&&CT[["67++00C $++5579LdSCfryy"))RYY		2K]K]_a_q_qrsvx.:V]]#>{{&FKK( '-LLSc:!!-gfmmMach6iFMM&*<*<=> 7j- 5 56$$& vbnnbnnbnn]^f..777F,,555 vx.:

6==)vvt,8FKK(v~t4@F//0

6--.F667 A
 &"2"2";";;Pc@d ##y0 $F$4$45;; 
 &fmm4OL!JJv5JJv//> Ae;r   c                     t        |dd      ryt        |dd      x}.t        |dd      r!t        |j                               sd|_        y| j	                  |       d|_        y)zM
        Initialize the weights if they are not already initialized.
        r   FNr  T)r  r   named_buffersr   r"  )r   r  r  s      r   _initialize_weightsz#PreTrainedModel._initialize_weights	  si     6/7 vx66VC 4e<--/0(,F%6"$(!r   c                     t        t        j                  j                  d      s"d }|t        j                  j                  _        | j	                  | j
                         y)a  
        This is equivalent to calling `self.apply(self._initialize_weights)`, but correctly handles composite models.
        This function dynamically dispatches the correct `init_weights` function to the modules as we advance in the
        module graph along the recursion. It can handle an arbitrary number of sub-models. Without it, every composite
        model would have to recurse a second time on all sub-models explicitly in the outer-most `_init_weights`, which
        is extremely error prone and inefficient.
        smart_applyc                     | j                         D ]?  }t        |t              r|j                  |j                         /|j                  |       A  ||        | S r   )childrenr   rz   r'  r%  )r   fnr  s      r   r'  z7PreTrainedModel.initialize_weights.<locals>.smart_apply#	  sQ    "mmo /F!&/:**6+E+EF**2./ 4r   N)r  r   r   r  r'  r%  )r   r'  s     r   initialize_weightsz"PreTrainedModel.initialize_weights	  sC     uxx6 +6EHHOO' 	112r   r"  c                    |ri }| j                  d      D ]k  \  }}t        |t              s|j                  d      }|dk7  r-|j	                         D ci c]  \  }}| d| | d|  }}}|j                  |       m |S | j                  }t        | j                  dd      }	|	si S |i S t        j                  d      t        fd|j                         |j                         z  D              r|j                         S i }| j                  d      D 
ch c]  \  }}
|	 c}
}| j!                  d      D 
ch c]  \  }}
|	 c}
}z  }|j	                         D ]  \  d	z   d	z   t#        t%        fd
|            }t#        t%        fd|            }t'        |      dkD  r(t'        |      dkD  rt'        |      t'        |      z  dk7  rt)        d d d| d|       t+        |t-        |            D ]%  \  }}||j                         v r	||   ||<   !|||<   '  |S c c}}w c c}
}w c c}
}w )a
  
        Return the expanded tied weight keys (in case they contain modules or regex patterns) for only the current
        model, or recursively for all submodels if `all_submodels=True` (i.e. it will re-check the config values for all
        submodels).

        For almost all models, we only require to tie the embeddings, so the model has an internal property
        `_tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}`. In this case, the mapping is already
        "expanded", i.e. it already contains full parameters, and this function will simply return a copy of the property.
        For more complex patterns, e.g. for `DFineForObjectDetection`, we have the following attribute
        ```
        _tied_weights_keys = {
            r"bbox_embed.(?![0])\d+": "bbox_embed.0",
            r"class_embed.(?![0])\d+": "class_embed.0",
            "model.decoder.class_embed": "class_embed",
            "model.decoder.bbox_embed": "bbox_embed",
        }
        ```
        In this case, the function looks up all the model's parameters and buffers, and matches all the params,
        returning the following:
        ```
        {
            'bbox_embed.1.layers.0.bias': 'bbox_embed.0.layers.0.bias',
            'bbox_embed.1.layers.0.weight': 'bbox_embed.0.layers.0.weight',
            'bbox_embed.1.layers.1.bias': 'bbox_embed.0.layers.1.bias',
            'bbox_embed.1.layers.1.weight': 'bbox_embed.0.layers.1.weight',
            'bbox_embed.1.layers.2.bias': 'bbox_embed.0.layers.2.bias',
            'bbox_embed.1.layers.2.weight': 'bbox_embed.0.layers.2.weight',
            'bbox_embed.2.layers.0.bias': 'bbox_embed.0.layers.0.bias',
            'bbox_embed.2.layers.0.weight': 'bbox_embed.0.layers.0.weight',
            ...
            'class_embed.1.bias': 'class_embed.0.bias',
            'class_embed.1.weight': 'class_embed.0.weight',
            'class_embed.2.bias': 'class_embed.0.bias',
            'class_embed.2.weight': 'class_embed.0.weight',
            ...
            'model.decoder.class_embed.0.bias': 'class_embed.0.bias',
            'model.decoder.class_embed.0.weight': 'class_embed.0.weight',
            'model.decoder.class_embed.1.bias': 'class_embed.0.bias',
            'model.decoder.class_embed.1.weight': 'class_embed.0.weight',
            ...
            'model.decoder.bbox_embed.0.layers.0.bias': 'bbox_embed.0.layers.0.bias',
            'model.decoder.bbox_embed.0.layers.0.weight': 'bbox_embed.0.layers.0.weight',
            'model.decoder.bbox_embed.0.layers.1.bias': 'bbox_embed.0.layers.1.bias',
            'model.decoder.bbox_embed.0.layers.1.weight': 'bbox_embed.0.layers.1.weight',
            ...
        }
        ```
        i.e. all the parameters matching the regex and modules patterns in `_tied_weights_keys`
        F)remove_duplicater!  rd  r  tie_word_embeddingsz ^[A-Za-z0-9_\.]+(weight)|(bias)$c              3   @   K   | ]  }j                  |        y wr   )r?  )r+  r   common_case_regexs     r   r-  zAPreTrainedModel.get_expanded_tied_weights_keys.<locals>.<genexpr>	  s     _a &&q)_s   ^c                 0    t        j                  |       S r   r(  )xsource_names    r   <lambda>z@PreTrainedModel.get_expanded_tied_weights_keys.<locals>.<lambda>	      BIIk14M r   c                 0    t        j                  |       S r   r(  )r3  target_names    r   r5  z@PreTrainedModel.get_expanded_tied_weights_keys.<locals>.<lambda>	  r6  r   r   zAThere is an issue with your definition of `tie_weights_keys` for :z. We found z to tie into )r  r   rz   r)  r.  r+  r  r  r  r)  compileallr   r   r&  r  r$  r2  filterr   r   zipr   )r   r"  expanded_tied_weightsprefixr  submodel_tied_weightsr   r/  tied_mappingr.  r  all_param_namessource_paramstarget_paramstarget_nsource_nr0  r4  r8  s                   @@@r   r)  z.PreTrainedModel.get_expanded_tied_weights_keys2	  s   d $&!%)%7%7%7%O H!	i9,5,T,Tch,T,i)|I^IdIdIf1AEAvhasOxq_<1- 1 *001FGH )(.. &dkk3H%P"I!I JJ'JK_<3D3D3FI\I\I^3^__$$&& !#)-)>)>PU)>)VWA1W,,e,D[
!QA[
 
 )5(:(:(< 	?$K+K+K"6*M#_`M"6*M#_`M&*=)A-}%M(::a? WXcWddefqer s  -mM?L 
 '*-}9M&N 	?"( 499;;6KH6U)(3 7?)(3	?!	?6 %$o12 X [
s   H8$H>ITmissing_keysrecompute_mappingc                    |s| j                   }n| j                  d      }t        |j                               }t	        |      D ]  \  }\  }}|d}||v}||v}	|r:|	r8t
        j                  d| d| d       | j                   j                  |       R|s|	r||}}nB|s@|	s>||dz   d D ]  \  }
}||k(  s|
|v}|s|
} n d}t
        j                  d	| d| d
       | j                  |      }d|v r'|j                  dd      \  }}| j                  |      }n|}| }t        |||       | j                  ||       |s|j                  |        y)aM  
        Tie the model weights. If `recompute_mapping=False` (default when called internally), it will rely on the
        `model.all_tied_weights_keys` attribute, containing the `{target: source}` mapping for the tied params.
        If `recompute_mapping=True`, it will re-check all internal submodels and their config to determine the params
        that need to be tied. This is the default when `model.tie_weights()` is called on its own, outside of
        `__init__`, and `from_pretrained`, in case the config values were changed somewhere.

        Note that during `from_pretrained`, tying is *symmetric*: if the mapping says "tie target -> source" but
        `source` is missing in the checkpoint while `target` exists, we *swap* source and target so we can still
        tie everything to the parameter that actually exists.
        Tr!  NzDThe tied weights mapping and config for this model specifies to tie z to z, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warningr   FzYThis checkpoint seem corrupted. The tied weights mapping for this model specifies to tie zk, but both are absent from the checkpoint, and we could not find another related tied weight for those keysr  )r$  r)  r   r.  	enumeraters  rc  r  get_parameter_or_bufferrT  get_submodulerM  _adjust_biasdiscard)r   rG  rH  r0  itarget_param_namesource_param_nameremove_from_missingsource_is_theretarget_is_theretarget_backupsource_backuptarget_backup_is_theresource_paramparent_namer
  rN  s                    r   tie_weightszPreTrainedModel.tie_weights	  s    !22I;;$;OI*+	9B99M ;	85A5!#4'&*#"3<"G"3<"G #NN^_p^qqu,- .pp ..223DE(_;LN_'8%(8A!a%'8J 4} ),==5B,5V2  64A 1 %  /4+w016G5H I__  778IJL''$5$<$<S!$D!T++K8(FD,/fl3',?$$%67w;	8r   c                    t        |dd       t        |d      rz|j                  j                  }t        j
                  j                  |j                  j                  d|d   |j                  j                  d   z
  fdd      |j                  _        t        |d      rt        |d      r|j                  |_
        y y y )Nr  r  r   constantout_featuresnum_embeddings)r  r  r  r  r   
functionalpadr  datar^  r]  )r   output_embeddingsr  weight_shapes       r   rM  zPreTrainedModel._adjust_bias	  s    $fd3?GL]_gDh,3399L*,--*;*;!&&++LO&7&<&<&B&B1&EEF	+""' $n5'BRTd:e-=-L-L* ;f5r   new_num_tokenspad_to_multiple_ofmean_resizingc                    | j                  |||      }|||S t        | d      xr | j                  du}t               rP|sNddl}|j
                  j                  |j                  d      5  |j                  j                  d   }ddd       n|j                  j                  d   }| j                  j                         _        || _        | j                          |S # 1 sw Y   AxY w)a$	  
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value.If `new_num_tokens` is set to
                `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

        Return:
            `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
        Nr   r   modifier_rank)_resize_token_embeddingsr  r   r+   rY  rZ  GatheredParametersr  r  r  r  
vocab_sizerZ  )r   rd  re  rf  model_embedsr   rY  rl  s           r   resize_token_embeddingsz'PreTrainedModel.resize_token_embeddings 
  s    H 44^EWYfg!&8&@ t^4V9J9JRV9V%'22<3F3FVZ2[ :)0066q9
: : &,,2215J 4>##%0$ 	: :s   -C!!C*c                    | j                         }| j                  ||||      }t        |d      r|j                  }t	        ||       |j
                  j                  }|j                  |       | j                  |       t        | d      xr | j                  d u}|st               rP|sNdd l}	|	j                  j                  |j
                  d       5  |j
                  j                  d   }d d d        n|j
                  j                  d   }| j                         | j                         }
t!        |
t"        j$                  j&                        r| j                  |
||      }n| j)                  |
||      }t        |
d      r|
j                  }t	        ||       |
j
                  j                  }|j                  |       | j+                  |       | j                         S # 1 sw Y   xY w)N_hf_hookr   r   rh  )rf  )r  _get_resized_embeddingsr  rp  rs   r  rJ  r  r  r   r+   rY  rZ  rk  r  r  r   r   r   r  _get_resized_lm_headr  )r   rd  re  rf  old_embeddingsr  r  old_embeddings_requires_gradr   rY  old_lm_headnew_lm_headold_lm_head_requires_grads                r   rj  z(PreTrainedModel._resize_token_embeddings;
  s   22455N,>
 >:.!**D~t4'5'<'<'J'J$%%&BC!!.1t^4V9J9JRV9V ))+L ^^66~7L7L\`6a D%3%:%:%@%@%CND D "0!6!6!<!<Q!? %%'3446K+uxx'9'9:"::;fs:t"77^cp7q{J/"++";5(3(:(:(H(H%&&'@A&&{3((**'D Ds   G++G4rs  c           	         |It        |t              st        d| d      ||j                  j                  d   }||z   dz
  |z  |z  }nt
        j                  d| d       ||S t        | d      xr | j                  du}t               rT|sRddl
}|j                  j                  |j                  d	      5  |j                  j                         \  }}ddd       n|j                  j                         \  }}|k(  rt               s|S t        |t        j                        s:t!        d
t#        |       dt        j                   dt        j                   d      t        j                  ||j                  j$                  |j                  j&                        }	||kD  r|s| j)                  |	       n||kD  r|rt
        j+                  d       ||z
  }
t               rL|sJddl
}|j                  j                  |j                  gd	      5  | j-                  ||	||
       ddd       n| j-                  ||	||
       t/        ||      }t               r|sddl
}|j                  |	j                  g}|j                  j                  |d	      5  |j                  j0                  d|ddf   |	j                  j0                  d|ddf<   ddd       n<|j                  j0                  d|ddf   |	j                  j0                  d|ddf<   t               r|sddl
}|j                  |	j                  g}|j                  j                  |d	      5  |	j                  |_        |	j                  j0                  j                  d   |_        |j4                  |dz
  |j4                  k  rd|_        ddd       |S |	j                  j0                  |j                  _        |	j                  j0                  j                  d   |_        |j4                  |dz
  |j4                  k  rd|_        |S # 1 sw Y   IxY w# 1 sw Y   xY w# 1 sw Y   FxY w# 1 sw Y   |S xY w)a	  
        Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
        initialized vectors at the end. Reducing the size will remove vectors from the end

        Args:
            old_embeddings (`torch.nn.Embedding`):
                Old embeddings to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the embedding matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
                `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
                `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html


        Return:
            `torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
            `new_num_tokens` is `None`
        Nz5Asking to pad the embedding matrix to a multiple of `z@`, which is not and integer. Please make sure to pass an integerr   r   zYou are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be a.  . This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tcr   rh  zOld embeddings are of type , which is not an instance of zj. You should either use a different resize function or make sure that `old_embeddings` are an instance of r  r  zThe new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`)r   r   r   r  r  rs  rt  r  r   r+   rY  rZ  rk  r   r   r  r  r0  r   r   r"  r  (_init_added_embeddings_weights_with_meanr  ra  r^  r  )r   rs  rd  re  rf  r   rY  old_num_tokensold_embedding_dimr  added_num_tokensnparamss                r   rq  z'PreTrainedModel._get_resized_embeddingsb
  s   V )0#6 KL^K_  ``  a  %!/!6!6!<!<Q!?-0BBQFK]]assNKK&&4%5 6DD !!!t^4V9J9JRV9V%'22>3H3HX\2] Q4B4I4I4N4N4P1 1Q Q 1?0E0E0J0J0L-N-^+4N4P!!.",,7-d>.B-CCabdbnbnao pLL>$  !((// ''--	
 N*=~.n, =  .>)+L ^^668M8M7N^b6c AA&HX 
 =="NNDT /%'$++^-B-BCF2262K V4B4I4I4N4NrPQrSTu4U%%**2A2q51V V 1?0E0E0J0J2A2q50QN!!&&rr1u-
 &'$++^-B-BCF2262K 6(6(=(=%0>0E0E0J0J0P0PQR0S- "--9~PQ?QUcUoUo>o15N.6  *8)>)>)C)CN!!&,:,A,A,F,F,L,LQ,ON)))5>A;MQ_QkQk:k-1*uQ Q\ $V V6 s1   <P?Q=Q"AQ&?Q	QQ#&Q0ru  
transposedc           	         ||S t        | d      xr | j                  du}t               r~|s|ddl}|j                  j                  |j                  d      5  |s|j                  j                         n'|j                  j                         j                         \  }}ddd       nG|s|j                  j                         n'|j                  j                         j                         \  }}|k(  rt               s|S t        |t        j                        s:t        dt        |       dt        j                   dt        j                   d      |s|fn|f}	|j                  du}
t        j                  |	|
|j                  j                  |j                  j                   d	}||kD  r|s| j#                  |       n||kD  r|rt$        j'                  d
       ||z
  }t               rw|suddl}|j                  g}|
r||j                  gz  }|j                  j                  |d      5  | j)                  ||||||       |
r| j+                  |||       ddd       n+| j)                  ||||||       |
r| j+                  |||       t-        ||      }t               rq|soddl}|j                  |j                  |j                  |j                  g}|j                  j                  |d      5  | j/                  |||||
       ddd       |S | j/                  |||||
       |S # 1 sw Y   JxY w# 1 sw Y   xY w# 1 sw Y   |S xY w)a  
        Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
        vectors at the end. Reducing the size will remove vectors from the end

        Args:
            old_lm_head (`torch.nn.Linear`):
                Old lm head liner layer to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the linear matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
                `torch.nn.Linear` module of the model without doing anything. transposed (`bool`, *optional*, defaults
                to `False`): Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim,
                vocab_size` else `vocab_size, lm_head_dim`.
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

        Return:
            `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is
            `None`
        Nr   r   rh  z#Old language model head is of type ry  zg. You should either use a different resize function or make sure that `old_lm_head` are an instance of r  )r  r   r   a  The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`)r  r   r+   rY  rZ  rk  r  r   r   r   r   r	  r  r0  r  r   r   r"  rs  r  %_init_added_lm_head_weights_with_mean"_init_added_lm_head_bias_with_meanr  !_copy_lm_head_original_to_resized)r   ru  rd  r  rf  r   rY  r{  old_lm_head_dimnew_lm_head_shapehas_new_lm_head_biasrv  r}  r  num_tokens_to_copys                  r   rr  z$PreTrainedModel._get_resized_lm_head   sr   H !t^4V9J9JRV9V%'22;3E3EUY2Z 5?K&&++-[EWEWEYEYE[E`E`Eb 0  2<""'')ASASAUAUAWA\A\A^ ,NO ^+4N4P+ryy15d;6G5HHfgigpgpfq rII;a!  FP_n=VdfuUv*//t; ii%%%,,$$**	
 N*={+n, =  .>)+L %,,-'{//00F^^66vT6R l>>#[/>Sceo ,??[Zjkl l ::o~O_ak (;;KVfg @%'!((+*:*:K<N<NP[P`P`aF2262K 66.@*Nb 	 22[*<jJ^ i nl l( s%   AL02,L=9M	0L:=M	Mc                    |j                   j                  j                  t        j                        }t        j
                  |d      }||z
  }|j                  |z  |z  }d}	t        j                  j                  |	|z        j                         }
|
rt        j                  j                  j                  ||	|z        }|j                  |f      j                  |j                   j                        |j                   j                  d|z  d d d f<   y |d d d f   j!                  |d      j                  |j                   j                        |j                   j                  d|z  d d d f<   y )Nr   r  &.>)covariance_matrix)sample_shaper   r   )r  ra  r   r   r   r  Tr   positive_definitecheckr;  distributionsmultivariate_normalMultivariateNormalsampler   r  )r   rs  r  r{  r}  old_embeddings_weightmean_embeddingsold_centered_embeddings
covarianceepsilonis_covariance_psddistributions               r   rz  z8PreTrainedModel._init_added_embeddings_weights_with_mean  sc    !/ 5 5 : : = =emm L**%:C"7/"I,..1HH>Y
 '99??*@TUYY[ ..BBUU7Z3G V L FREXEX.0 FY Fb&&,,- !!&&r,<'<'>'AB  a(//0@!DGGH]H]HcHcd !!&&r,<'<'>'ABr   c                    |r^|j                   j                  j                  |j                   _        |j                   j                  j                  |j                   _        | j                  ||||       |r_|j                   j                  j                  |j                   _        |j                   j                  j                  |j                   _        y y r   )r  ra  r  rz  )r   ru  rv  r  r{  r}  r  s          r   r  z5PreTrainedModel._init_added_lm_head_weights_with_mean  s     &1&8&8&=&=&?&?K#&1&8&8&=&=&?&?K# 	55k;P^`pq&1&8&8&=&=&?&?K#&1&8&8&=&=&?&?K# r   c                 h   t        j                  |j                  j                  dt         j                        }t        j
                  |j                  j                  d      j                  t         j                        }|j                  j                  d|z  d  j                  |d|z         y )Nr   )r  r   r  r   r  r  )r   r  r  ra  r   r  r   r  )r   ru  rv  r}  	bias_meanbias_stds         r   r  z2PreTrainedModel._init_added_lm_head_bias_with_mean  s    JJ{//441EMMR	99[--22;>>u}}Mb#3356>>ISWZbSb>cr   c                 `   |s=|j                   j                  d |d d f   |j                   j                  d |d d f<   n<|j                   j                  d d d |f   |j                   j                  d d d |f<   |r1|j                  j                  d | |j                  j                  d | y y r   )r  ra  r  )r   rv  ru  r  r  r  s         r   r  z1PreTrainedModel._copy_lm_head_original_to_resized  s     >I>P>P>U>UViWiViklVl>mK##$7%7$7$:;>I>P>P>U>UVWYlZlYlVl>mK##A':(:':$:;  9D9I9I9N9NObPb9cK!!"5#56  r   new_num_position_embeddingsc           	      |    t        d| j                   d| j                   d| j                  j                   d      )Nz4`resize_position_embeddings` is not implemented for B`. To implement it, you should overwrite this method in the class  in `modeling_.py`r  r  r   )r   r  s     r   resize_position_embeddingsz*PreTrainedModel.resize_position_embeddings  sH    !B4>>BR S226..1APTP^P^PiPiOjjnp
 	
r   c           	      |    t        d| j                   d| j                   d| j                  j                   d      )Nz1`get_position_embeddings` is not implemented for r  r  r  r  r   s    r   get_position_embeddingsz'PreTrainedModel.get_position_embeddings  sH    !??O P226..1APTP^P^PiPiOjjnp
 	
r   c                     t               t        j                  d      k7  r| j                          | j	                  d       y)z
        Initialize and tie the weights if needed. If using a custom `PreTrainedModel`, you need to implement any
        initialization logic in `_init_weights`.
        r   F)rH  N)r   r   r   r+  rZ  r   s    r   r,  zPreTrainedModel.init_weights  s5     675<<;OO##%51r   c                    | j                   s"t        | j                  j                   d      |ddi}t	        j
                  t        fi |}dt        j                  | j                        j                  v }|s| j                  d|       n;| j                  t        | j                  d             t        j                  d	       | j                  d
k(  }|xs t        | dd      }|r| j!                          yy)a  
        Activates gradient checkpointing for the current model.

        We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of
        the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2

        Args:
            gradient_checkpointing_kwargs (dict, *optional*):
                Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function.
        z) does not support gradient checkpointing.Nuse_reentrantFr  T)enablegradient_checkpointing_funcr  V  You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.r  _hf_peft_config_loaded)r  r   r  r   	functoolsr   r   inspect	signature_set_gradient_checkpointingr  applyrs  rc  r  r  r  )r   gradient_checkpointing_kwargsr  _is_using_old_formatneeds_embedding_gradsenable_input_gradss         r   rL  z-PreTrainedModel.gradient_checkpointing_enable  s     33 7 788abcc(0-<e,D)&/&7&7
&dFc&d#  ''*;*;D<\<\*]*h*hh#,,DVq,rJJwt??tLMNNH
 !% 4 4 C2dgdD\^c6d
 ++- r   r  r  c                     d}t        | d      r|| _        || _        d}| j                         D ]  }t        |d      s||_        ||_        d}! |s"t	        | j
                  j                   d      y )NFrK  Tz is not compatible with gradient checkpointing. Make sure all the architecture support it by setting a boolean attribute `gradient_checkpointing` to modules of the model that uses checkpointing.)r  _gradient_checkpointing_funcrK  r  r   r  r   )r   r  r  is_gradient_checkpointing_setr  s        r   r  z+PreTrainedModel._set_gradient_checkpointing  s    (-% 4120KD-*0D',0)lln 	5Fv786Q306-04-		5 ->>**+ ,] ]  -r   c                 N   | j                   r{dt        j                  | j                        j                  v }|s| j                  d       n;t
        j                  d       | j                  t        | j                  d             t        | dd      r| j                          yy)zK
        Deactivates gradient checkpointing for the current model.
        r  F)r  r  r  r  N)r  r  r  r  r  rs  rc  r  r   r  r  )r   r  s     r   gradient_checkpointing_disablez.PreTrainedModel.gradient_checkpointing_disable  s     // $+g.?.?@`@`.a.l.l#l '000>L 

74#C#C5QR4159,,. :r   c                 B    t        d | j                         D              S )zT
        Whether gradient checkpointing is activated for this model or not.
        c              3   P   K   | ]  }t        |d       xr |j                     yw)rK  N)r  rK  )r+  ms     r   r-  z<PreTrainedModel.is_gradient_checkpointing.<locals>.<genexpr>3  s'     mYZ7167TA<T<TTms   $&)r3  r  r   s    r   is_gradient_checkpointingz)PreTrainedModel.is_gradient_checkpointing.  s    
 m^b^j^j^lmmmr   save_directoryis_main_processr   push_to_hubmax_shard_sizerR  r`  save_peft_formatsave_original_formatc
           	          |||
d<   t        | dd      }t        | dd      }|duxr" t        |t              xr |j                         }|'|s%|s#t	        d|j
                  j                   d      | j                  t        d      st        d	      t        j                  j                  |      rt        j                  d
| d       yt        j                  |d       |r|
j!                  dd      }|
j!                  d|j#                  t        j                  j$                        d         }|
j!                  dd      }t'        |fddi|
j(                  }| j+                  |      }i }||j-                  |       \  }}d|d<   t/        |       }|j0                  }t3        |      j#                  d      d   |j4                  _        |j6                  j8                  j;                  d      g|j4                  _        | j>                  tA        | || j4                         |r|s|j4                  jC                  |       | jE                         r|jF                  jC                  |       |rt        jI                  d       |jK                  |      }|r9t        jI                  d       i }|jM                         D ]  \  }}||d| <    |}| jO                         }tQ        |      dkD  rt	        d      |d   }| jR                  |   }|jC                  |       ||jU                         }d}tW        | d      r~tQ        tY        | jZ                  j]                                     dkD  rOd | jZ                  j]                         v sd!| jZ                  j]                         v rd}t_        j`                  d"       tb        r4td        jf                  jh                  jj                  D ]  \  }} ||      } | jl                  | jl                  D ]
  }||v s||=  | j                  ,to        || jp                  | jr                  | j                        }tu        ||      }|	r|stw        ||      }|stx        }t{        ||      }nt|        }|j                  d#d$      j                  d%d&      }t        |||'      } d}!| j                  r+d(| j                         i| j                  | j                  d)}!t        j                  |      D ]  }"t        j                  j                  ||"      }#|j                  d#d*      j                  d%d*      }$|"j                  d#d*      j                  d%d*      }%t        j                  d+      }&|"j                  |$      st        j                  j                  |#      s|"| j                  vs|s|&j                  |%      t        j                  |#        t        j                  | j                  jM                         d,-      D ]  \  }'}(t        j                  j                  ||'      }"i })|(D ]M  }*|j!                  |*      }+|r%|+j                  j                  d.k(  rt        ||*      }+|+j                         |)|*<   O t        |)|"|/       ~) |!9t        j                  j                  ||      },t        jI                  d0|,        nt        }-t        j                  j                  |t{        |-|            }-t        |-d1d23      5 }.t        j                  |!d4d5      d6z   }/|.j                  |/       ddd       t        jI                  d7| d8tQ        | j                         d9|- d       |r_t        | j                  |:      }0|0j                  t        j                  j                  |d;             | j                  |||<       yy# 1 sw Y   xY w)=a  
        Save a model and its configuration file to a directory, so that it can be re-loaded using the
        [`~PreTrainedModel.from_pretrained`] class method.

        Arguments:
            save_directory (`str` or `os.PathLike`):
                Directory to which to save. Will be created if it doesn't exist.
            is_main_process (`bool`, *optional*, defaults to `True`):
                Whether the process calling this is the main process or not. Useful when in distributed training like
                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
                the main process to avoid race conditions.
            state_dict (nested dictionary of `torch.Tensor`):
                The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only
                save parts of the model or if special precautions need to be taken when recovering the state dictionary
                of a model (like when using model parallelism).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            max_shard_size (`int` or `str`, *optional*, defaults to `"50GB"`):
                The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).

                <Tip warning={true}>

                If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
                which will be bigger than `max_shard_size`.

                </Tip>

            variant (`str`, *optional*):
                If specified, weights are saved in the format model.<variant>.safetensors.
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `hf auth login` (stored in `~/.huggingface`).
            save_peft_format (`bool`, *optional*, defaults to `True`):
                For backward compatibility with PEFT library, in case adapter weights are attached to the model, all
                keys of the state dict of adapters needs to be prepended with `base_model.model`. Advanced users can
                disable this behaviours by setting `save_peft_format` to `False`.
            save_original_format (`bool`, *optional*, defaults to `True`):
                For backward compatibility with the previous versions of `transformers` you can save the checkpoint with
                its reverse mapping. The reverse mapping needs to exists even if the model was loaded from a None legacy
                checkpoint.
            kwargs (`dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        Nr`  r  Fr   zThe model is quantized with z and is not serializable - check out the warnings from the logger on the traceback to understand the reason why the quantized model is not serializable.z0.31.4z[Saving a model with tensor parallelism requires `huggingface_hub` version 0.31.4 or higher.zProvided path (z#) should be a directory, not a fileT)exist_okcommit_messagerepo_idr   	create_prr  r   formatr  r   FSDP)r  zhDetected adapters on the model, saving the model in the PEFT format, only adapter weights will be saved.)r   zTo match the expected format of the PEFT library, all keys of the state dict of adapters will be prepended with `base_model.model`.zbase_model.model.zMultiple active adapters detected, saving multiple active adapters is not supported yet. You can save adapters separately one by one by iteratively calling `model.set_adapter(adapter_name)` then `model.save_pretrained(...)`r   hf_device_mapr   diskz}Attempting to save a model with offloaded modules. Ensure that unallocated cpu memory exceeds the `shard_size` (50GB default)z.binz{suffix}.binr   z{suffix}.safetensors)filename_patternr  total_parameters)rq  
weight_maprd  z(.*?)-\d{5}-of-\d{5}zWriting model shards)descr   )rq  zModel weights saved in wr  r  r  )indent	sort_keys
z:The model is bigger than the maximum size per checkpoint (z) and is going to be split in z^ checkpoint shards. You can find where each parameters has been saved in the index located at )r`  z	README.md)r  r`  r  )]r  r   rI   is_serializabler   quantization_configquant_method_tp_sizerk   ro  r   rU  rp  rs  errormakedirsr  splitsepr   r  _get_files_timestampsget_state_dict_and_metadataunwrap_modelr   r   r  r  r   r  architectures_auto_classr&   save_pretrainedr  r  rt  get_adapter_state_dictr.  active_adaptersr   peft_configr   r  r!  r  r   r@  rA  IS_SAGEMAKER_MP_POST_1_10smpstatemodule_managertranslate_functionsr  r@   r  _device_meshrG  r$   rP   rV  rM   r>  r   ru  r  rq  tensor_to_filenamelistdirro  r)  r:  r  filename_to_tensors	fullmatchr  re   tqdmr   r0  r3   
contiguoussafe_save_filerO   r  jsondumpswriteri   r  save_upload_modified_files)1r   r  r  r   r  r  rR  r`  r  r  rm  r  r   quantization_serializabler  r  r  files_timestampsrq  model_to_saver   peft_state_dictkeyr  active_adaptercurrent_peft_configis_offloaded	smp_to_hfr  
ignore_keyrQ  r  state_dict_splitindexrx  full_filenameweights_no_suffixfilename_no_suffixreg
shard_filetensor_namesshard_state_dicttensor_namer   path_to_weightssave_index_filer   content
model_cards1                                                    r   r  zPreTrainedModel.save_pretrained5  s   v #F7O!(/G!Ot^T:$qL+)NqS_SoSoSq 	" #,BKd.|/O/O/\/\.] ^u u  ==$-PQY-Zm  77>>.)LL?>*::]^_
NT2#ZZ(8$?NjjN,@,@,Mb,QRG

;6I!'CDCFCKKG#99.I##/#K#KD#Q J! %T* ##%(Z%5%5c%:1%=" /<.E.E.N.N.[.[\b.c-d* 't^DKKH )$$44^D  "//??O%~ +AAZAX
#KK ^ ')O&0&6&6&8 K
UEJ*;C5(ABK!0J!%!5!5!7~&*$u  "0!2&*&6&6~&F##33NC &113J D/*C**113459$,,33554CUCUC\C\C^9^LMM: % #		 8 8 L L 3	1&z2
3 ''3":: /
+":./
 ==$3JtO`O`bfbobopJ 9]S
  (>1-LJ &,L'g>L4L'//GOOP^`vw=)9.
 &&/1D1D1FdJZJcJcd.AAE 

>2 	)HGGLLBM !- 4 4VR @ H HY[ \ "*!1!1&"!=!E!EnVX!Y**45C ##$56GGNN=1$4$H$HH#MM"45A		-(#	)( )000668?U)
 	!$J ww||NJ?H!+ D#4
  FMM$6$6&$@5m[QF 170A0A0C -D  +XI /	!2 = ggll><HOKK1/1BCD5O ggll><Y`;abOosW= !**U1EL ! KKL^L\ ] 0 D DEF G$$3#4A7 27DOOSXYJ OOBGGLLEF'' -# (  ! !s   +-_44_=c                     | j                   | j                   ng }|j                  dg       }t        |t              r|g}|D ]  }||vs|j	                  |        |r||d<   t        |   |i |S )NrN  )r  r   r   r   r  r  r  )r   rl  rm  rN  tags_kwargsrP  r  s         r   r  zPreTrainedModel.push_to_hubR  s}    "&//"=t2jj,k3'&-K 	!C$C 	! !F6Nw"D3F33r   c                     t        d | j                         D              }|r%t        d | j                         D              }||z   }|S )a  
        Get the memory footprint of a model. This will return the memory footprint of the current model in bytes.
        Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the
        PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2

        Arguments:
            return_buffers (`bool`, *optional*, defaults to `True`):
                Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers
                are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch
                norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
        c              3   ^   K   | ]%  }|j                         |j                         z   ' y wr   r   r   r  s     r   r-  z7PreTrainedModel.get_memory_footprint.<locals>.<genexpr>n  s%     Ye%.."U%7%7%99Y   +-c              3   ^   K   | ]%  }|j                         |j                         z   ' y wr   r  )r+  bufs     r   r-  z7PreTrainedModel.get_memory_footprint.<locals>.<genexpr>p  s$     Y33<<>C,<,<,>>Yr  )sumr  buffers)r   return_buffersmemmem_bufss       r   get_memory_footprintz$PreTrainedModel.get_memory_footprintb  sB     YtGXYYY$,,.YYH.C
r   c                    t        | dd       t        j                  k(  rpddlm} t        |   |i | | j                         D ]F  }t        ||      st        |      dkD  r|d   }n|j                  dd      }|j                  |       H | S t        | dd       t        j                  k(  rt        | dd      rt        d      t        |   |i |S )	Nquantization_methodr   	HQQLinearr   rr  is_loaded_in_8bitFzCalling `cuda()` is not supported for `8-bit` quantized models.  Please use the model as it is, since the model has already been set to the correct devices.)r  rr   HQQhqq.core.quantizer&  r  rr  r  r   r   r   BITS_AND_BYTESr   )r   rl  rm  r&  r  r   r  s         r   rr  zPreTrainedModel.cudat  s    4.59K9O9OO3 GL$)&),,. (fi04y1}!%a!'Hf!=KK'( K 4.59K9Z9ZZt0%8 s  w|T,V,,r   c                    d|v }|s%|D ]   }t        |t        j                        sd} n t        | dd       t        j
                  k(  rsddlm} t        	| $  |i | | j                         D ]I  }t        ||      sd|v r|d   }n|d   }d|v r|d   }n|r}nd }|||_        |j                  |       K | S |r)t        | dd       t        j                  k(  rt        d      t        | dd       t        j                  k(  r0|rt        d      t        | d	d
      rAt!        d      s6t        d      t        | dd       t        j"                  k(  r|rt        d      t        	| $  |i |S )Nr   Tr$  r   r%  r   zBCasting a Quark quantized model to a new `dtype` is not supported.zYou cannot cast a bitsandbytes model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired `dtype` by passing the correct `dtype` argument.r'  Fz0.48zsYou need to install `pip install bitsandbytes>=0.48.0` if you want to move a 8-bit model across devices using to().zYou cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired `dtype` by passing the correct `dtype` argument.)r   r   r   r  rr   r(  r)  r&  r  r   r  compute_dtyperr  QUARKr   r*  r[   GPTQ)
r   rl  rm  dtype_present_in_argsargr&  r  r   r   r  s
            r   r   zPreTrainedModel.to  s    !(6 1$ c5;;/,0)
 4.59K9O9OO3 GJ'',,. (fi06)!'!1!%a&( &w. # $ (/4,KK'#($ K WT3H$%OSeSkSk%kabb 4.59K9Z9ZZ$ P 
 t0%8AZ[aAb  J  T0$7;M;R;RR$ H  wz4*6**r   c                 L    t        | dd      rt        d      t        |   | S )Nr   Fz`.half()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)r  r   r  halfr   rl  r  s     r   r2  zPreTrainedModel.half  s3    4/I 
 7<&&r   c                 L    t        | dd      rt        d      t        |   | S )Nr   Fz`.float()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)r  r   r  floatr3  s     r   r5  zPreTrainedModel.float  s3    4/I 
 7=$''r   r   r   r   c                    t        || j                        t        j                         g}t	               rdd l}|si|sgt        j                  d       |j                  t        j                         |j                  j                  t                     t               g       |S |r.|j                  t        j                  d      t!               g       |S |j#                  t        j                  d             |S )Nr   rV  rW  r   )r   r   r  no_tie_weightsr+   rY  rs  rt  r  no_init_weightsrZ  r[  r*   r   r   r   r   r  )r  r   r   r   r\  rY  s         r   get_init_contextz PreTrainedModel.get_init_context  s     +5#,,?ATATAVW%'  (:^_$$,,.!++@P@R+S')  $$ell6&:<O<Q%RS    f!56r   c                    i }| j                   P|t        j                  k(  r=|j                  t        j                  | j                   t        j                               | j                  _|t        j                  t        j                  fv r=|j                  t        j                  | j                  t        j                               |S )z\Create the dtype_plan describing modules/parameters that should use the `keep_in_fp32` flag.)	r  r   rt  r+  r   fromkeysr   r  ru  )r   r   r   s      r   _get_dtype_planzPreTrainedModel._get_dtype_plan  s    

 %%1eu}}6LdmmD,F,FVW ,,8Uu}}V[VdVdFe=edmmD,M,Mu}}]^r   kernel_configc                 Z   |rt               st        d      ddlm} ddlm}  |        |dt        |t              rT|j                  |        |j                  |        |j                   } ||j                  |      5  d| _        ddd       yd| _        yd	| _        y# 1 sw Y   yxY w)
a  
        Set whether or not to use the `kernels` library to kernelize some layers of the model.
        Args:
            use_kernels (`bool`):
                Whether or not to use the `kernels` library to kernelize some layers of the model.
            kernel_config (`KernelConfig`, *optional*):
                The kernel configuration to use to kernelize the model. If `None`, the default kernel mapping will be used.
        zk`use_kernels=True` requires kernels>=0.9.0. Please install the latest version with `pip install -U kernels`r   )use_kernel_mappingr   )$register_kernel_mapping_transformersN)inherit_mappingTF)r`   r   kernelsr?  integrations.hub_kernelsr@  r   rT   sanitize_kernel_mappingcreate_compatible_mappinguse_local_kernelkernel_mappinguse_kernels)r   rH  r=  r?  r@  rA  s         r   set_use_kernelszPreTrainedModel.set_use_kernels  s     ')  B  3V02(Z|-T55d; 77=
 '4&D&D"D'(D(DVef ,'+D$, , $( $D, ,s    B!!B*rb  )	r  r\  r   r]  r_  r`  ra  r   r   r  r   r\  r   r]  r_  ra  r   r   c       	         .   |j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  d	d      }|j                  d
d      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  di       xs i j                         }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      } |j                  dd      }!|j                  dd      }"|j                  dd      }#|j                  dd      }$|j                  dd      }%|j                  dd      }&|j                  dd      }'|"| d } d!D ]  }(|j                  |(d      }) |||n|}|d }t               r|sd"}|||||||d#}*i |*d$|i}+|||t        d%      |d k(  r>t	        t
        j                  j                  d&d'            rt        j                  d(       | |!t        | |!|#|)      \  }}#}!|t               st        d*      |i }t        ||+fi |\  },}}t        |      }d+d,|d-}-|||-d.<   t        |t              sU||n|}. | j                   j"                  |.fd"|||d/|*|\  }}/d|/v r|/j                  d       |/j                  d|      }n$t        j$                  |      }|}/t'        |d|      }||+d$<   d0|v r|j                  d0      |_        d1|v r|j                  d1      |_        t-        ||||
|-      \  }0}}|r@|0t        d2      |1t        |t.              rd3|j1                         v sd3|v rt3        d4      |&|%st        j5                  d5       d"}%t7        ||||	|+|-| j8                  dut'        |d6d      7      \  }1}2|0du}3|r=d8d9lm}4 t?        j@                  d:      5   | |      }5ddd        |4|1d;   d"5<      d=   }tC        ||1||2||
|0      \  }}||_"        | jG                  ||3tH              }6t        j$                  |      }tK        |6      5   | |g|i |/}7|0|0jM                  |7|||1|%>       ddd       7jO                  |      }8tQ        |7|'|0      }9tR        r|#tU        |7| |"|#|!      }7|tW        |7|||0      }tY        |||2|||||8|0|#|
|9|	|*?      }:| j[                  |7||1|:      \  };}<| j]                  |7|:|;      };|7j_                          |7ja                  |%|&       |7jc                         r*te        |7d@      r|s |7jf                  ||||fi |*d|$i| |5ti        tk        |j1                                     d8kD  rtm        |7|0|||<|       |0|0|7_7        |0jq                  |7       |,|||dA<   |7js                  |,||:|B      };|r|7|;ju                         fS |7S # 1 sw Y   xY w# 1 sw Y   xY w)Ca:  
        Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
        the model, you should first set it back in training mode with `model.train()`.

        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
        task.

        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
        weights are discarded.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
                Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - `None` if you are both providing the configuration and state dictionary (resp. with keyword
                      arguments `config` and `state_dict`).
            model_args (sequence of positional arguments, *optional*):
                All remaining positional arguments will be passed to the underlying model's `__init__` method.
            config (`Union[PreTrainedConfig, str, os.PathLike]`, *optional*):
                Can be either:

                    - an instance of a class derived from [`PreTrainedConfig`],
                    - a string or path valid as input to [`~PreTrainedConfig.from_pretrained`].

                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                be automatically loaded when:

                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                      model).
                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
                      save directory.
                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
                      configuration JSON file named *config.json* is found in the directory.
            state_dict (`dict[str, torch.Tensor]`, *optional*):
                A state dictionary to use instead of a state dictionary loaded from saved weights file.

                This option can be used if you want to create a model from a pretrained configuration but load your own
                weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and
                [`~PreTrainedModel.from_pretrained`] is not a simpler option.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
                Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
                as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
                checkpoint with 3 labels).
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            proxies (`dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            output_loading_info(`bool`, *optional*, defaults to `False`):
                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
            local_files_only(`bool`, *optional*, defaults to `False`):
                Whether or not to only look at local files (i.e., do not try to download the model).
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `hf auth login` (stored in `~/.huggingface`).
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.

                <Tip>

                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

                </Tip>
            attn_implementation (`str`, *optional*):
                The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)), or `"flash_attention_3"` (using [Dao-AILab/flash-attention/hopper](https://github.com/Dao-AILab/flash-attention/tree/main/hopper)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.

                Accept HF kernel references in the form:
                  <namespace>/<repo_name>[@<revision>][:<kernel_name>]

                - <namespace> and <repo_name> are any non-"/" and non-":" sequences.
                - "@<revision>" is optional (branch, tag, or commit-ish), e.g. "@main", "@v1.2.0", "@abc123".
                - ":<kernel_name>" is optional and selects a function inside the kernel repo.
                - Both options can appear together and in this order only: @revision first, then :kernel_name.
                - We intentionally allow a leading "<wrapper>|" prefix (e.g., "flash|...") because the code
                  strips it before loading; '|' is not excluded in the character classes here.

                Examples that match:
                  "org/model"
                  "org/model@main"
                  "org/model:custom_kernel"
                  "org/model@v1.2.3:custom_kernel"
            experts_implementation (`str`, *optional*):
                The experts implementation to use in the model (if relevant). Can be any of:

                - `"eager"` (sequential implementation of the experts matrix multiplications).
                - `"batched_mm"` (using [`torch.bmm`](https://pytorch.org/docs/stable/generated/torch.bmm.html)).
                - `"grouped_mm"` (using [`torch.nn.functional.grouped_mm`](https://docs.pytorch.org/docs/main/generated/torch.nn.functional.grouped_mm.html)).

                By default, if available, `grouped_mm` will be used for torch>=2.9.0. The default is otherwise the sequential `"eager"` implementation.

            > Parameters for big model inference

            dtype (`str` or `torch.dtype`, *optional*, defaults to `"auto"`):
                Override the default `torch_dtype` and load the model under a specific `dtype`. The different options
                are:

                1. `torch.float16` or `torch.bfloat16` or `torch.float`: load in a specified
                  `dtype`, ignoring the model's `config.dtype` if one exists. If not specified
                  - the model will get loaded in `torch.float` (fp32).

                2. `"auto"` - A `dtype` or `torch_dtype` entry in the `config.json` file of the model will be
                  attempted to be used. If this entry isn't found then next check the `dtype` of the first weight in
                  the checkpoint that's of a floating point type and use that as `dtype`. This will load the model
                  using the `dtype` it was saved in at the end of the training. It can't be used as an indicator of how
                  the model was trained. Since it could be trained in one of half precision dtypes, but saved in fp32.

                3. A string that is a valid `torch.dtype`. E.g. "float32" loads the model in `torch.float32`, "float16" loads in `torch.float16` etc.

                <Tip>

                For some models the `dtype` they were trained in is unknown - you may try to check the model's paper or
                reach out to the authors and ask them to add this information to the model's card and to insert the
                `dtype` or `torch_dtype` entry in `config.json` on the hub.

                </Tip>

            device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
                A map that specifies where each submodule should go. It doesn't need to be refined to each
                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
                same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
                like `1`) on which the model will be allocated, the device map will map the entire model to this
                device. Passing `device_map = 0` means put the whole model on GPU 0.

                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
                more information about each option see [designing a device
                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
            max_memory (`Dict`, *optional*):
                A dictionary device identifier to maximum memory if using `device_map`. Will default to the maximum memory available for each
                GPU and the available CPU RAM if unset.
            tp_plan (`Optional[Union[dict, str]]`, *optional*):
                A torch tensor parallel plan, see [here](https://pytorch.org/tutorials/intermediate/TP_tutorial.html). Use `tp_plan="auto"` to
                use the predefined plan based on the model. If it's a dict, then it should match between module names and desired layout.
                Note that if you use it, you should launch your script accordingly with `torchrun [args] script.py`. This will be much
                faster than using a `device_map`, but has limitations.
            tp_size (`str`, *optional*):
                A torch tensor parallel degree. If not provided would default to world size.
            device_mesh (`torch.distributed.DeviceMesh`, *optional*):
                A torch device mesh. If not provided would default to world size. Used only for tensor parallel for now.
                If provided, it has to contain dimension named `"tp"` in case it's > 1 dimensional, this dimension will be used for tensor parallelism
            offload_folder (`str` or `os.PathLike`, *optional*):
                If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
            offload_buffers (`bool`, *optional*):
                Whether or not to offload the buffers with the model parameters.
            quantization_config (`Union[QuantizationConfigMixin,Dict]`, *optional*):
                A dictionary of configuration parameters or a QuantizationConfigMixin object for quantization (e.g
                bitsandbytes, gptq).
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            variant (`str`, *optional*):
                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin.
            use_safetensors (`bool`, *optional*, defaults to `None`):
                Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors`
                is not installed, it will be set to `False`.
            weights_only (`bool`, *optional*, defaults to `True`):
                Indicates whether unpickler should be restricted to loading only tensors, primitive types,
                dictionaries and any types added via torch.serialization.add_safe_globals().
                When set to False, we can load wrapper tensor subclass weights.
            key_mapping (`dict[str, str], *optional*):
                A potential mapping of the weight names if using a model on the Hub which is compatible to a Transformers
                architecture, but was not converted accordingly.
            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                automatically loaded:

                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                      already been done)
                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
                      initialization function ([`~PreTrainedConfig.from_pretrained`]). Each key of `kwargs` that
                      corresponds to a configuration attribute will be used to override said attribute with the
                      supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
                      will be passed to the underlying model's `__init__` function.

        <Tip>

        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
        use this method in a firewalled environment.

        </Tip>

        Examples:

        ```python
        >>> from transformers import BertConfig, BertModel

        >>> # Download model and configuration from huggingface.co and cache.
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased")
        >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
        >>> model = BertModel.from_pretrained("./test/saved_model/")
        >>> # Update configuration during loading.
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
        >>> assert model.config.output_attentions == True
        ```
        r   Nr^  output_loading_infoF_from_pipeline
_from_autor   rS  r   
max_memoryoffload_folderr   r  rc  rd  rh  rR  adapter_kwargsadapter_namer  r  rW  r8  tp_sizer6  r   trust_remote_coderH  r=  key_mappingr  )mirror
_fast_initlow_cpu_mem_usagefrom_tf	from_flaxoffload_state_dictT)r\  r]  r^  r_  r`  ra  rc  re  zq`state_dict` cannot be passed together with a model name or a `gguf_file`. Use one of the two loading strategies.
WORLD_SIZErw   a  You've set device_map=`auto` while triggering a distributed run with torchrun. This might lead to unexpected behavior. If your plan is to load the model on each device, you should set device_map={: PartialState().process_index} where PartialState comes from accelerate library)rR  r   r   zIaccelerate is required when loading a GGUF file `pip install accelerate`.r%  pytorch)	file_typer   from_auto_classusing_pipeline)return_unused_kwargsrW  rM  rL  rT  rU  zYou cannot combine Quantization and loading a model from a GGUF file, try again by making sure you did not passed a `quantization_config` or that you did not load a quantized model from the Hub.r  zxOne or more modules is configured to be mapped to disk. Disk offload is not supported for models loaded from GGUF files.zA kernel_config was provided but use_kernels is False; setting use_kernels=True automatically. To suppress this warning, explicitly set use_kernels to True.transformers_weights)r   rR  rW  r   r   rX  rY  rZ  r   )load_gguf_checkpointr   r   )return_tensorsmodel_to_loadr  )r%  r   r   r  rH  )r   r   r   r   r   r   r   r   r   r   r   r   r   r   adjust_generation_fnr`  )rQ  load_configrP  );r  r&  r   r   r   r   r   r   rs  rt  rA   rZ   r:   r0   r   r   r  from_pretraineddeepcopyr  r  r  rJ   r   r   r7  r  r  r  modeling_gguf_pytorch_utilsrb  r   r   r  r  r9  r   rS   preprocess_modelr<  r    _torch_distributed_availabler?   r-   r   _load_pretrained_model_finalize_model_loadingevalrI  r  r  re  r   r!  r/   r   postprocess_modelload_adapterto_dict)=r  r   r  r\  r   r]  r_  r`  ra  r   r   
model_argsrm  r   r^  rK  from_pipeliner^  r   rS  r   rN  rO  r   r  rc  re  rR  rP  rQ  r  rW  r8  rR  r6  r   rS  rH  r=  rT  r
  r  r   download_kwargs_with_commit_adapter_model_pathrX  config_pathmodel_kwargsr   r  r   r   rb  dummy_modelmodel_init_contextr%  r   weight_conversionsrf  loading_infodisk_offload_indexs=                                                                r   rg  zPreTrainedModel.from_pretrained.  s   @ ZZd3
**Y-$jj)>F

#3T: **\59

7D)jj5ZZd3
ZZd3
$4d; **%6>$jj)>EJJ{B/	jj6**Y- **%5r:@bFFHzz.)<"JJ':DAJJ{D1	**Y-**Y-06

;OQU0Vjj5"JJ':DAjj6

?D9jj5)goG p 	'D

4&A	' "".EKE=E%5# #, 0 "
 'V&U-&U#!'D'PT]Ti D  C

|S(I$JKKc '"5/Lkj0,JW  )@)Bhii!NM`)'N
 N
J:N
 .j9
#*Wfg
$+8J'( &"23$*$6&<YK#C3#3#3#C#C$%)#*,$ "$ $ FL l*  -&**>;GK]]6*F!L!&.+FK5@#M2 !F**0**5J*KF'#v--3ZZ8P-QF*+;'\:,
(fj '  Y  %J-&J<M<M<O2OTZ^hTh". 
 $[ o K-K*G+7!??$6+26;QSW+X	.
** $4/I f% *!&k*-.>q.ARVfqrJ
 ##V-=z<Ye
 < 11%GYZv&/0 	<<|<E'--)%5 + . 	 **51
 :%l['K,C$UG5GV]^E !(
JUJ **G$;-! .+!%#%-++
  ,/+E+EeZYikv+w((225+|T

k=9 GE3I$JS\&E&&!-	
 " #4  !c#j.?.?.A*B&Ca&G|ZQcetu#!-E** * */w' --#)'-	 . L ,..000E* *	 	s   3	Y='%Z
=Z
Zr%  r  rf  c           	      f   |j                   }|xr@ |j                  j                  j                  t        j
                  t        j                  hv }t        | j                         j                               }t        j                  t        j                  k\  rt        |t        | dd             d}|j                   _d|j                   j#                         v rCt%        | |j&                  ||j                   |j(                  |j*                  |j,                        }|j                   /|s-t/        |j                   |      }t1        | ||j                         g }	t3               rj|sh|2i }
|D ])  }|
j5                  t7        |d|j8                               + |
}t;        | ||      \  }	}t=        ||	t?               t?               i       }||fS t?               }||}
n|h|d   jA                  d      rT|Ri }
|D ]J  }tC        |d	d
      }|jE                  |       |j                         D ]  }|jG                  |      |
|<    L n1|$i }
|D ]  }|
j5                  t7        |              ntI        d      tK        | |
|| jL                  |      \  }}|D ]  }|jO                  ddd        ||fS )zzPerform the actual loading of some checkpoints into a `model`, by reading them from disk and dispatching them accordingly.r  Nr  r   r   )rG  
error_msgsunexpected_keysmismatched_keysconversion_errorsr   r   r   )r   r   z5Neither a state dict nor checkpoint files were found.)r%  r   rf  r8  r|  )(r   r   r  r  rr   r(  r-  r   r   r   rs  levelre   WARNINGrC   r  r   r   r.   r   r   r   r   r1   caching_allocator_warmupr+   r+  r   r   r4   rn   r!  r   r   r  r   r   r#   r  __exit__)r%  r   r  rf  r   is_hqq_or_quarkexpected_keysr|  expanded_device_mapr~  merged_state_dict	ckpt_filerG  r{  all_pointerfilefile_pointerr   s                     r   rl  z&PreTrainedModel._load_pretrained_model  s    #//& 
;+C+C+W+W+d+d""$$i
 ,
 U--/4467<<7??*='%T*JK "!!-&K<R<R<Y<Y<[2[!8// &&,,!!**" !!-o"3K4J4JM"Z$U,?AYAYZ
%'!$&!!1 I%,,'	T_TlTlm /
'HPZ\g'h$J,)% # #"$LL ///= %K%$.!!-2B12E2N2N~2^cmcu$&!, ID#,TT%#PLOOL1)..0 I/;/E/Ea/H)!,II "-$&!!1 II%,,_Y-GHI !!XYY/S,'#50,L, ! -

4t,- ///r   r{  c           	      "   	 | j                  |j                                | j                          | j                  |j                         |j                  |j
                  |j                         | j                  |j                         | j                  |j                  d       | j                  |       t        | |j                  |j                  |t               |S # t        | |j                  |j                  |t               w xY w)a$  Perform all post processing operations after having loaded some checkpoints into a model, such as moving
        missing keys from meta device to their expected device, reinitializing missing weights according to proper
        distributions, tying the weights and logging the loading report.F)rG  rH  )r%  r   r   r{  rs  )$_adjust_tied_keys_with_tied_pointersmissing_and_mismatched mark_tied_weights_as_initialized&_move_missing_keys_from_meta_to_devicer   r   r   _initialize_missing_keysr   rZ  rG  #_adjust_missing_and_unexpected_keysro   r   r   rs  )r%  rf  r{  s      r   rm  z'PreTrainedModel._finalize_model_loadingm  s    	66|7Z7Z7\] 224 88335&&''((	 **;+C+CD <+D+DX]^ 55lC!.9.W.W(3(K(K)  ".9.W.W(3(K(K)s   B8C$ $*Dc           
      X   |D ch c]%  }dj                  |j                  d      d d       ' }}|j                  |D ch c]H  }t        |      dkD  s|d   j	                         s&dj                  |j                  d      d d       J c}      }g }| j                         D ]w  \  }}|r!| j                   d}	|j                  |	      }n9|r7t        |      dkD  rdj                  | j                  |g      n| j                  }||v sg|j                  |       y |S c c}w c c}w )Nr  r   r   )	ro  r  unionr   isdigitr  r  r  r  )
r   r:  
add_prefixremove_prefixr  module_keysretrieved_modulesr
  r  _prefixs
             r   retrieve_modules_from_namesz+PreTrainedModel.retrieve_modules_from_names  s$   @EFsxx		#s 34FF "''6;bss3x!|PSTVPWP_P_PaSXXciinSb)*b
  ..0 	1LD&!334A6((1CFt9q=sxx!7!7 >?VZVlVl{"!((0	1 ! ) G
 cs   *D" D'D'(%D'c                     t        |t              s|j                  }ddlmc m} t        ||      st        | d      || _        y)aU  
        Register this class with a given auto class. This should only be used for custom models as the ones in the
        library are already mapped with an auto class.



        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoModel"`):
                The auto class to register this new model with.
        r   Nz is not a valid auto class.)	r   r   r   transformers.models.automodelsr  r  r   r  )r  
auto_classauto_modules      r   register_for_auto_classz'PreTrainedModel.register_for_auto_class  sC     *c*#,,J66{J/
|+FGHH$r   c           
         t        |      ry|| j                  j                  y| j                  j                  |ddddgf   v rd}t        | j                  dd      }| j                  j                  -| j                  j                  | j                  j                  k(  s^| j                  j
                  -| j                  j
                  | j                  j                  k(  s|g|| j                  j                  k(  rN|d| j                  j                   d| j                  j                   d| j                  j
                   d	| d
	z  }t        j                  |       yy)zv
        Shows a one-time warning if the input_ids appear to contain padding and no attention mask was given.
        Nr   r   zWe strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.sep_token_idz5
You may ignore this warning if your `pad_token_id` (z&) is identical to the `bos_token_id` (z), `eos_token_id` (z), or the `sep_token_id` (z ), and your input is not padded.)rm   r  pad_token_idr  bos_token_ideos_token_idrs  r  )r   r  r  warn_stringr  s        r   %warn_if_padding_and_no_attention_maskz5PreTrainedModel.warn_if_padding_and_no_attention_mask  sJ    i &DKK,D,D,L ;;##yRG'<<F  #4;;EL))5$++:R:RVZVaVaVnVn:nKK,,8T[[=U=UY]YdYdYqYq=q ,AYAY1YLT[[MeMeLf g..2kk.F.F-GGZ[_[f[f[s[sZt u..:^;[] ,- =r   c                 |    | j                   yt        | j                  dd      y| j                  j                  yy)zJ
        Returns whether the model has a tensor parallelism plan.
        NTr  F)r  r  r  r  r'  r   s    r   supports_tp_planz PreTrainedModel.supports_tp_plan  s=    
 ==$4??J5A;;))5r   c                     | j                   S )z@
        Returns the model's tensor parallelism degree.
        )r  r   s    r   rR  zPreTrainedModel.tp_size  s     }}r   c                 N    | j                   yt        | j                  dd       yy)NTr  F)r  r  r  r   s    r   supports_pp_planz PreTrainedModel.supports_pp_plan  s(    ==$4??J5Ar   c                     t        | d      r| j                  S t        | dd       }||t        vrt        j                  d| d       d}t        |   S )N_loss_functionr  z`loss_type=zZ` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.ForCausalLM)r  r  r  rD   rs  r  )r   r  s     r   loss_functionzPreTrainedModel.loss_function  se    4)*&&&D+t4		 =i[ )= > &II&&r   c                     || _         y r   )r  r   r  s     r   r  zPreTrainedModel.loss_function  s
    #r   c                     t               st        d      ddlm}m}m} | j                  s|j                  n||j                  n|} ||  || j                  j                        |       d| _        y )Nz]Kernels are not available. To use kernels, please install kernels using `pip install kernels`r   )DeviceMode	kernelize)r0  )r   modeT)r`   r   rB  r  r  r  training	INFERENCETRAININGr   r0  _use_kernels)r   r  r  r  r  s        r   r  zPreTrainedModel.kernelize  s_    #%o  	43%)]]t~~[_$v4;;+;+;<4H r   c                     t        | dd      S )Nr  F)r  r   s    r   rH  zPreTrainedModel.use_kernels&  s    t^U33r   r  c                     t        |      rt        | dd      ry |r| j                          y t        | dd      rt        j	                  d       d| _        y )Nr  FzmDisabling kernels at runtime is a no-op as there is no 'unkernelize' routine; keeping current kernels active.)r   r  r  rs  r  r  r  s     r   rH  zPreTrainedModel.use_kernels*  sP     ;74?NNt^U3## D !&Dr   compile_configc                 p   d| j                   j                  v r| j                  S |xs
 t               }t	        | j
                  dd      xs
 t               }t        | d      rt	        | d|      |k7  r:|| _        t        j                  | j                  fi |j                         | _        | j                  S )a  Return a `torch.compile`'d version of `self.__call__`. This is useful to dynamically choose between
        non-compiled/compiled `forward` during inference, especially to switch between prefill (where we don't
        want to use compiled version to avoid recomputing the graph with new shapes) and iterative decoding
        (where we want the speed-ups of compiled version with static shapes).llama4r  N_compiled_call_last_compile_config)r  
model_type__call__r'   r  r  r  r  r   r:  rq  r  )r   r  default_configs      r   get_compiled_callz!PreTrainedModel.get_compiled_call9  s     t{{---== ':=? !7!79I4PcTaTc./t3^DV(6D%"'--"ZAWAWAY"ZD"""r   c                     | j                   S r   )r  )r  s    r   is_backend_compatiblez%PreTrainedModel.is_backend_compatibleK  s    ...r   c                     t        t              } j                         j                         D ]'  \  }}||j	                            j                  |       ) |j                         D cg c]:  }t        |      dkD  r*t         fd|D              st        fd|D              s|< }}|D ci c]  }|dd D ]  }||d   
  }}} j                  j                  |       yc c}w c c}}w )a  
        Adds keys to `self.all_tied_weights_keys` by checking if any group of params
        share the same data ptr. It helps us support remote code where the weight tying is
        done in old-T5 style, by manually assigning the same module to different param names.
        If we don't add them back in `self.all_tied_weights_keys`, they will be re-initialized
        and all params in tied group get random weights.
        r   c              3   V   K   | ]   }|j                   j                         v  " y wr   )r$  r   )r+  r
  r   s     r   r-  zGPreTrainedModel._adjust_tied_keys_with_tied_pointers.<locals>.<genexpr>a  s%     Td : : ? ? AATs   &)c              3   &   K   | ]  }|v  
 y wr   r   )r+  r
  rG  s     r   r-  zGPreTrainedModel._adjust_tied_keys_with_tied_pointers.<locals>.<genexpr>b  s     ?,?s   Nr   )r   r   r   r.  r   r  r   r   r3  r;  r$  r+  )	r   rG  param_pointersrH  param_valuer:  tied_param_namesgrouptied_weights_keys_by_pointerss	   ``       r   r  z4PreTrainedModel._adjust_tied_keys_with_tied_pointersO  s    %T*'+'8'>'>'@ 	F#J;//1299*E	F (..0
5zA~TeTT???	 
 
 /?)
%*QVWXWYQZ)
CMJa )
)
% )
 	""))*GH
)
s   ,?C(2C-r   r   z/torch.distributed.device_mesh.DeviceMesh | Noner   c                    |du}t               r|syt               rt               s{|sy| j                         D ])  \  }}t	        j
                  |d      }t        | ||       + | j                         D ])  \  }}	t	        j
                  |	d      }t        | ||       + y|| j                  j                         z
  D ]h  }| j                  |      }t        ||d      }
t	        j
                  ||
      }|!t        | |||dd|j                         |       \t        | ||       j | j                         D ]7  \  }}	t        ||d      }t	        j
                  |	|      }t        | ||       9 y)a  Move the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts)
        back from meta device to their device according to the `device_map` if any, else cpu. Takes care of sharding those
        missing parameters if `device_mesh` is provided, i.e. we are using TP.
        All non-persistent buffers are also moved back to the correct device (they are not part of the state_dict, but are
        not missing either).
        Nr   r  T)valid_torch_deviceF)r+   r,   r   r  r   
empty_likerP  r$  r$  r   rK  r2   rB   get_local_ranknamed_non_persistent_buffers)r   rG  r   r   r   r   r  r  r  bufferparam_devicebuffer_devices               r   r  z6PreTrainedModel._move_missing_keys_from_meta_to_devicel  sz    $4/%' %9%;L"335 =
U((u=*4e<=  $113 =V((>*4e<= 
  $"<"<"A"A"CC 	=C005E%j#$OL$$U<@E&+%T5+:T:T:VXc
 +4e<	=  <<> 	9KC&z34PM$$VMBE&tS%8	9r   c           
      ^   t               r|sddl}t        | j                  d      j	                         D ch c]  }t        |dd      r| c}      }|j                  j                  |d      5  | j                          ddd       y| j                          yc c}w # 1 sw Y   yxY w)a  
        Initialize the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts), according to
        `_initialize_weights`. Indeed, since the corresponding weights are missing from the state dict, they will not be replaced and need to
        be initialized correctly (i.e. weight initialization distribution).

        Params that are not missing have the `is_hf_initialized` flag.
        r   NT)	keep_varsr   Frh  )	r+   rY  r   r   r   r  rZ  rk  r+  )r   r   rY  r/  not_initialized_parameterss        r   r  z(PreTrainedModel._initialize_missing_keys  s     &' *. OOdO;BBDtqGTUWkmrLst*& 223M]^2_ *'')* * ##% u* *s   BB3B##B,c                 h   t        d | j                         D              }|rdgng }| j                  xs g }| j                  xs g |z   }d\  }}t	        |      dkD  r+t        j                  dj                  d |D                    }t	        |      dkD  r+t        j                  dj                  d |D                    }|1|j                  D ch c]  }|j                  |      | c}|_        |2|j                  D ch c]  }|j                  |      | c}|_
        yyc c}w c c}w )	zAdjust the `missing_keys` and `unexpected_keys` based on current model's exception rules, to avoid
        raising unneeded warnings/errors. This is performed in-place.
        c              3   D   K   | ]  \  }}|j                  d         yw)zrotary_emb.inv_freqN)r   )r+  r  r  s      r   r-  zFPreTrainedModel._adjust_missing_and_unexpected_keys.<locals>.<genexpr>  s     "pifVW6??3H#I"ps    zrotary_emb\.inv_freqNNr   r  c              3   (   K   | ]
  }d | d  ywr  r  Nr   r+  patterns     r   r-  zFPreTrainedModel._adjust_missing_and_unexpected_keys.<locals>.<genexpr>  s     6g7!G9A6g   c              3   (   K   | ]
  }d | d  ywr  r   r  s     r   r-  zFPreTrainedModel._adjust_missing_and_unexpected_keys.<locals>.<genexpr>  s     9mgQwiq/9mr  N)r3  r$  r  r  r   r)  r:  ro  rG  r*  r  )	r   r{  has_inv_freq_buffersadditional_unexpected_patternsmissing_patternsunexpected_patternsignore_missing_regexignore_unexpected_regexr  s	            r   r  z3PreTrainedModel._adjust_missing_and_unexpected_keys  s;     #"p[_[m[m[o"ppFZ*A)B`b&??E2#FFL"Pnn8B55 1$#%::chh6gVf6g.g#h "#a'&(jj9mYl9m1m&n#  ++88)<P<W<WX[<\<d)L%
 #.+;;,?V?]?]^a?b?j,L( /),s   D*)D*D/D/c                 r    | j                   j                         D ]  }| j                  |      }d|_         y)aE  Adds the `_is_hf_initialized` flag on parameters that will be tied, in order to avoid initializing them
        later as they will be tied (overwritten) anyway.
        This is very important as most embeddings are tied, and they are huge params (vocabularies are often 256k), so
        running inits on them is very costly.TN)r$  r   r1  r   )r   
tied_paramr  s      r   r  z0PreTrainedModel.mark_tied_weights_as_initialized  s:    
 4499; 	,J&&z2E'+E$	,r   rk  c                    	 | j                  |      S # t        $ r Y nw xY w	 | j                  |      S # t        $ r Y nw xY wt        | |      \  }}|dk(  rmt	        |j
                  dt        j                  j                  j                        t        j                  j                  j                  ur|j                         S t        d| d      )ai  
        Return the parameter or buffer given by `target` if it exists, otherwise throw an error. This combines
        `get_parameter()` and `get_buffer()` in a single handy function. If the target is an `_extra_state` attribute,
        it will return the extra state provided by the module. Note that it only work if `target` is a leaf of the model.
        _extra_stateget_extra_state`z2` is neither a parameter, buffer, nor extra state.)
r1  AttributeError
get_bufferrK   r  r  r   r   r  r  )r   rk  r  rH  s       r   rK  z'PreTrainedModel.get_parameter_or_buffer  s    	%%f-- 			??6** 		1$?
.(((*;UXX__=\=\]88??223 ))++q(Z[\\s    	4 	A A recurser-  c              #      K   | j                  ||      D ]H  \  }}d|v r|j                  dd      nd|f\  }}| j                  |      }||j                  v sC||f J yw)zSimilar to `named_buffers`, but only yield non-persistent ones. It is handy as it's not perfectly straightforward
        to know if they are persistent or not)r  r-  r  r   rd  N)r$  rT  rL  _non_persistent_buffers_set)r   r  r-  r
  r   rN  buf_names          r   r  z,PreTrainedModel.named_non_persistent_buffers  sw     
 !..wQa.b 	#LD& 7:Tkt{{32DzFH''/F6===Fl"	#s   AA"	A"r  c                 ^    t         |   |      }| j                  r| j                          |S r   )r  trainrH  r  )r   r  outr  s      r   r  zPreTrainedModel.train  s)    gmD!NN
r   c                 $    | j                  d      S )NF)r  r   s    r   rn  zPreTrainedModel.eval  s    zz%  r   r   Fr  )NNT)NFT)TNF50GBNNTT)Tr  )	AutoModel)TT)r   r   r   r   r  r0  r   r   r  r  r   r  r   r  r   r  r  r  r!  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   compilerallow_in_graphrq   r   r   r  r  r  r4  r8  r  r;  setterrI  r-  rQ  classmethodr]  r   r  r  r  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  no_gradr"  r%  r  guard_torch_init_functionsr+  r)  rZ  rM  r   r  rn  rj  rq  r	  rr  rz  r  r  r  r  r  r,  rL  r   r   r  r  r  r   PathLiker  r   rU   r  r"  rr  r   r2  r5  r   r9  r<  rT   rI  ry   rg  r  r   rn   rl  rm  r  r  r  r  rR  r  r  r  rH  r'   r  r  r  rI   r  r  r  r  rK  r   r  r  rn  __classcell__r  s   @r   rz   rz   A  s   ( 37L$'(4/6KsL$#'JS	D ' 'OS& )/cDIo. 6:s3x$s)+d29:>tCy4!7>
 :>3s8d3i/$6=@D #c(T#Y"6"=D *.S#X-57"DcN78<#T#Y%5<;?&S	D(8?04T#Y-4 !ND !&$& %%  $Hd38n#H 48Hd3(()D07 -2#T1#(D( ).-'++
^^""'.Dn)<$= '. # '.R 9d3#45 9 9/0#M/ #MJ/>b c3h   c5c?23   ^^!DcNT1 ! !F ^^DeCHo!56  
:;,49s? ,t ,@ ) )V ;BII ; ; #T # #J] ] ]~K K KZ  <$  T d 8 FKO.#&:O.>BO.	O.b1sUYz 1^a 1 $3:  $^b  $or  $D"C$J "SV "( T  $ 5 5 5a83: a8F*Wt *WX*X)C$J @%S4Z %4.%" U]]_7? 7?r)$ U]]_$T$$&3 ' 34p%D p%T p%dN8C4 N8SW N8`
M &*)-"	9d
9  $J9 	9
 
9v%+T &*)-"\\ d
\  $J	\
 \ 
\B &* "YY d
 	
  
B@ !@ @,d
d
c 

bll8K)K 

2(.T :>gq $ \d ,/( n4 n n !%"&!$*"#'!%%)[bkk)[ [ 4K	[
 [ c	[ t[ TzD [ [ #[z >%%&4 '4$ 588?? - !-2 588??:+ :+x'( U[[  Z^  .U[[ T  %%,:M %%N 
 ?C.2(-$!&#''+!a-.a'*R[['84'?a !3&4t;	a
 $t+a "&a a a TzD a a a a 
%a aF X0 X04KX0 s)d*X0 )	X0
 
 $&	'X0 X0t '/'?P'	' 'R!. % %*#-J       ' ' $ $	! 4T 4 4 & &$ & &#0D # #$ / /Ic It I:/93i/9 4K/9 G	/9
 "D(/9 
/9b&T &d &*@Q VZ <,]c ]2 >B##6:#	%U\\)*	+#$ !r   r  z
model file)objectobject_classobject_files	recursivec                     t               ri }|r||d<   t        | fi |S t        | d      rt        | j                        S | S )a  
    Recursively unwraps a model from potential containers (as used in distributed training).

    Args:
        model (`torch.nn.Module`): The model to unwrap.
        recursive (`bool`, *optional*, defaults to `False`):
            Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers
            recursively, not just the top-level distributed containers.
    r  r  )rZ   rt   r  r  r  )r%  r  rm  s      r   r  r    sL      "+F;*5;F;; 5(#--Lr   r   c                 P    | dk(  ryt        j                  |       j                  dvS )zCheck if the device is an accelerator. We need to function, as device_map can be "disk" as well, which is not
    a proper `torch.device`.
    r  F)r   r   )r   r   r0  r  s    r   is_accelerator_devicer  (  s)     ||F#((??r   accelerator_device_mapc                 T   t        d       }| j                  j                         }t        j                  j                         r*t        j                  j                         r| j                  ng }|j                         D ]  \  }}||v r| j                  |      }||j                  | ||      }	n|j                         }	|j                         |	z  }
t        |      dkD  r5t        ||d      du}|
|rt        j                  j                         ndz  }
||xx   |
z  cc<    |S )z
    This utility function calculates the total bytes count needed to load the model on each device.
    This is useful for caching_allocator_warmup as we want to know how much cache we need to pre-allocate.
    c                       y)Nr   r   r   r   r   r5  z&get_total_byte_count.<locals>.<lambda>:      r   Nr   T)	is_weightr   )r   r$  r   r   r   r   r   r  r.  rK  param_element_sizer   r  r   r>   get_world_size)r%  r  r   total_byte_countr  r8  rH  r   r  
dtype_sizeparam_byte_countis_part_of_plans               r   get_total_byte_countr  2  s    #9-22779 % 1 1 > > @UEVEVEeEeEgennmoG4::< 5
F))--j9#%88
ERJ++-J ;;=:5w<!4ZTXYaeeO!2!2!A!A!C\]] $44 %5& r   r  c                    |j                         D ci c]'  \  }}t        |      s|t        j                  |      ) }}}|syt	        | ||      }|j                         D ]  \  }}|j
                  dv rt        t        |j
                        }|j                  |j                  n|j                         }	|j                  |	      \  }
}|j                  |	      |j                  |	      z
  }||z
  |kD  r||z
  }n||z
  dkD  r|dz   |
kD  rd}n|dz   }nd}t        ||dz
        }t        j                  t        |dz        t        j                  |d	      } yc c}}w )
aH  This function warm-ups the caching allocator based on the size of the model tensors that will reside on each
    device. It allows to have one large call to Malloc, instead of recursively calling it later when loading
    the model, which is actually the loading speed bottleneck.
    Calling this function allows to cut the model loading time by a very large margin.

    A few facts related to loading speed (taking into account the use of this function):
    - When loading a model the first time, it is usually slower than the subsequent times, because the OS is very likely
    to cache the different state dicts (if enough resources/RAM are available)
    - Trying to force the OS to cache the files in advance (by e.g. accessing a small portion of them) is really hard,
    and not a good idea in general as this is low level OS optimizations that depend on resource usage anyway
    - As of 18/03/2025, loading a Llama 70B model with TP takes ~1 min without file cache, and ~13s with full file cache.
    The baseline, i.e. only loading the tensor shards on device and adjusting dtype (i.e. copying them) is ~5s with full cache.
    These numbers are reported for TP on 4 H100 GPUs.
    - It is useless to pre-allocate more than the model size in this function (i.e. using an `allocation_factor` > 1) as
    cudaMalloc is not a bottleneck at all anymore
    - Loading speed bottleneck is now almost only tensor copy (i.e. changing the dtype) and moving the tensors to the devices.
    However, we cannot really improve on those aspects obviously, as the data needs to be moved/copied in the end.
    N)rr  xpug      Ar   r   g333333Ar  F)r   r   rJ  )r.  r  r   r   r  r0  r  r  current_devicemem_get_infomemory_reservedmemory_allocatedr  r   r   rt  )r%  r  r   r  r   r  r  
byte_countaccelerator_moduler  free_device_memorytotal_device_memoryunused_memoryr  s                 r   r  r  T  s|   * :M9R9R9T(5vXmntXuu||F##  "+E3I<X /446 #g
;;/)!(!<$*LL$<FLLBTBcBcBeE6H6U6UV[6\3 3.>>uEHZHkHklqHrrM M)M9'-7
m+m; !1$'99!"J "/!2J 
 Z)<})LMJKKJ!O,EMM&`efG#gs
   EEc            	       F     e Zd ZdZeeeeeeee	dZ
dededef fdZ xZS )AttentionInterfacea_  
    Dict-like object keeping track of allowed attention functions. You can easily add a new attention function
    with a call to `register()`. If a model needs to locally overwrite an existing attention function, say `sdpa`,
    it needs to declare a new instance of this class inside the `modeling_<model>.py`, and declare it on that instance.
    )r}   r|   r  r  zpaged|flash_attention_3zpaged|flash_attention_2z
paged|sdpazpaged|eagerrT  r  r   c                     |t         j                  d       n|dk7  r|| vrt        d| d      t        |   ||      S )zcReturn the requested `attn_implementation`. Also strictly check its validity, and raise if invalid.a	  You tried to access the `AttentionInterface` with a `config._attn_implementation` set to `None`. This is expected if you use an Attention Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._attn_implementation`r  r  zP` is not a valid attention implementation registered in the `AttentionInterface`)rs  r  KeyErrorr  r   )r   rT  r  r  s      r   get_interfacez AttentionInterface.get_interface  s_    &K
 !G+0C40O'((xy  w{.88r   )r   r   r   r   r6   r8   r;   r7   r<   r5   _global_mappingr   r   r-  r	  r
  s   @r   r*  r*    sH     540&#:#:24	O9 9x 9H 9 9r   r*  r  c                   d    e Zd ZdZedej                  fd       Zedej                  fd       Zy)PreTrainedAudioTokenizerBasea  
    Class that additionally defines the behavior of any `audio_tokenizer` to be added.
    Characteristic for any of them:
        1. Encode raw audio into discrete audio codebooks (with x channels)
        2. Decode from discrete audio codebooks back to raw audio
    It is possible that they can decode in different ways given a different representation
    but they are forced to support 2. nonetheless, e.g. see `DAC`.
    input_valuesc                      y)z
        Encode raw audio retrieved from a respective `FeatureExtractor` into discrete audio codebooks (with x channels)
        Nr   )r   r1  rl  rm  s       r   encodez#PreTrainedAudioTokenizerBase.encode  r  r   audio_codesc                      y)z6Decode from discrete audio codebooks back to raw audioNr   )r   r4  rl  rm  s       r   decodez#PreTrainedAudioTokenizerBase.decode  r  r   N)	r   r   r   r   r   r   r   r3  r6  r   r   r   r0  r0    sH     5<<  
 E%,, E Er   r0  r   )r   Tr  r  (  r   r&  r  importlib.metadatarl  r  r  r   r)  r  r@  abcr   r   collections.abcr   r   
contextlibr   dataclassesr   r	   enumr
   r   r   	itertoolsr   	threadingr   typingr   r   r   zipfiler   r   huggingface_hubr   r   r   	packagingr   safetensorsr   safetensors.torchr   r  r   r   torch.distributionsr   torch.utils.checkpointr   rd  r   r  configuration_utilsr   conversion_mappingr    core_model_loadingr!   r"   r#   r$   r   r%   dynamic_module_utilsr&   
generationr'   r(   integrationsr)   r*   r+   r,   integrations.accelerater-   r.   r/   r0   r1   r2   r3   integrations.deepspeedr4   integrations.eager_pagedr5   integrations.flash_attentionr6   integrations.flash_pagedr7   integrations.flex_attentionr8   rC  r9   integrations.peftr:   integrations.sdpa_attentionr;   integrations.sdpa_pagedr<   integrations.tensor_parallelr=   r>   r?   r@   rA   rB   rC   loss.loss_utilsrD   modeling_flash_attention_utilsrE   rF   modeling_rope_utilsrG   pytorch_utilsrH   
quantizersrI   quantizers.autorJ   quantizers.quantizers_utilsrK   safetensors_conversionrL   utilsrM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   utils.genericrf   rg   	utils.hubrh   ri   rj   utils.import_utilsrk   rl   rm   utils.loading_reportrn   ro   utils.output_capturingrp   rq   utils.quantization_configrr   accelerate.hooksrs   accelerate.utilsrt   r   rk  !smdistributed.modelparallel.torchmodelparallelr  smdistributed.modelparallelru   SMP_VERSIONrp  r  
get_loggerr   rs  r   r   upperrv   rx   ry   r   r   rk  r   r   r   r   r   r   r   r   r   r   uint8int8int16uint16rt  ru  int32uint32r   float64int64uint64float8_e4m3fnfloat8_e5m2r   r  r   r   r   r   r  r  r   r  r!  r  r  r$  rG  rP  rV  r  r  r  r  r  rz   r  r   r  r  r  r  r  r*  r  r   r0  r   r   r   <module>ry     s8         	 	 
   # . % (  $   4 4   \ \  ! 9  + - $ 1 <  + 4 7 i i   F C A = ? / 2 ? A   * j 4 , # - = 3      6 J \ \ 
 K H 9 3<  %00==? 33F -k :mgmmF>S S % 
		H	%zz~~nc288:JJNN#6<BBD %&CK\]   9=  $- - -4   # # 0U[[ 0C$J 0 0.1 JJ
++
**;;<<==NN;;<<====;;<<""   ( hlk2;;&k69ELL6Hk`dk	#u||
kDU\\ c ")) S	 ,DSN ,S%,,=N8O ,TYZ^_bcf_gZhjnorjsZsTt ,>%T#c(^ %c5<<>O9P %UZ[_`cdg`h[iknorks[sUt %&MS%,,&'M0AM	#u||
M`(&7 (S (RWR^R^ (s S4Z 3  26-1F.#&#4t#;F.4ZF. TzF. D[	F.
 tF. F. %($JF. $d*F. 49tTD[()F.` (,Nt#d*N3i$&N N Tk	N
 tN N $N U[[()Nbt 
[ [|U* U*pD7!bii!57GYi D7!Nn ((C(CD &&2*9*E*E*M*M*T*T[| +U +O'
		 d ryy 2@#)ell": @t @ ^b48HSVZHZD@gO @g$ @g^ilp^p @gF 9)  9H /A.B + BE? Er   