
    謜iK                    <   d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlZddlmZmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlZddlZ ddl!Z!ddl"m#Z$ dd	l%m&Z&m'Z'm(Z(m)Z) dd
l*m+Z+ ddl!m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z<m=Z= ddl>m?Z? ddl@mAZA ddlBmCZCmDZD ddlEmFZF ddlGmHZHmIZImJZJmKZKmLZL ddlMmNZNmOZO ddlPmQZQ ddlRmSZSmTZT ddlUmVZV ddlWmXZXmYZYmZZZ ddl[m\Z\ ddl]m^Z^m_Z_ ddl`maZambZb ddlcmdZd ddlemfZf dd lgmhZh dd!limjZjmkZkmlZlmmZmmnZnmoZompZpmqZq dd"lrmsZsmtZtmuZumvZv dd#lwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZmZmZmZmZmZ dd$lmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ dd%lmZmZmZ dd&lmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ dd'lmZ dd(lmZ ekgZenZ e       rdd)lmZ eZ e       rddlZ e«       rQddlmc mZ ddlmc mZ ddlmZ ddlm4Z  e+j                  e٫       e+j                  e      k\  Zerddlm#c mZ nd*Z e       rddlmc m!Z dd+lwmZmZmZ  e       rdd,lmZ  e       r7dd-lmZmZ dd.lmZ dd/lmZmZmZmZmZmZmZmZmZ dd0lmZ  eK       rdd1lmZ erddlZ eÐj                  e      Zd2Zd3Zd4Zd5Z d6Zd7Zd8Z ed9:       G d; d<             Zy)=uc   
The Trainer class, to easily train a 🤗 Transformers from scratch or finetune it on a new task.
    N)CallableIteratorMappingpartial)Path)TYPE_CHECKINGAny   )#get_reporting_integration_callbacks)
CommitInfo	ModelCardcreate_repoupload_folder)version)nn)
DataLoaderDatasetIterableDatasetRandomSamplerSequentialSampler)__version__)PreTrainedConfig)DataCollatorDataCollatorWithPaddingdefault_data_collator)DebugOptionDebugUnderflowOverflow)SequenceFeatureExtractor)FeatureExtractionMixin)"ALL_HYPERPARAMETER_SEARCH_BACKENDSdefault_hp_search_backend)BaseImageProcessor)deepspeed_initdeepspeed_load_checkpointdeepspeed_sp_compute_lossis_deepspeed_availablepropagate_args_to_deepspeed)get_fsdp_ckpt_kwargsupdate_fsdp_plugin_peft)apply_liger_kernel)activate_neftunedeactivate_neftune)MIN_PEFT_VERSION)save_tpu_checkpointtpu_spmd_dataloaderwrap_model_xla_fsdp)TrainingSummary)PreTrainedModelunwrap_model)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMESMODEL_MAPPING_NAMES)get_scheduler)ProcessorMixin)PreTrainedTokenizerBase)CallbackHandlerDefaultFlowCallbackExportableStatePrinterCallbackProgressCallbackTrainerCallbackTrainerControlTrainerState)_OPTIMIZER_HANDLERSOptimizerContext_parse_optim_argsis_optimizer_factory)EvalLoopContainerIterableDatasetShardLabelSmootherLengthGroupedSamplerdistributed_broadcast_scalarsfind_batch_sizeget_model_param_countget_parameter_namesis_attention_mask_causalnested_detachnested_gatherreissue_pt_warningsremove_dummy_checkpointsafe_globalsset_rng_state_for_device)PREFIX_CHECKPOINT_DIRBestRunEvalLoopOutputEvalPredictionHPSearchBackendHubStrategyPredictionOutputRemoveColumnsCollatorSaveStrategyTrainerMemoryTrackerTrainOutput_is_peft_modelalign_special_tokens#compare_trainer_and_checkpoint_argsdefault_compute_objectivedenumpify_detensorizeenable_full_determinismfind_executable_batch_sizeget_last_checkpoint
has_lengthload_sharded_checkpointnumber_of_argumentsrotate_checkpointsseed_workerset_seedsort_checkpointsspeed_metricsunwrap_peft_model"validate_quantization_for_training)OptimizerNamesParallelModeTrainingArguments)ADAPTER_CONFIG_NAMEADAPTER_SAFE_WEIGHTS_NAMEADAPTER_WEIGHTS_NAMECONFIG_NAMEGENERATION_CONFIG_NAMESAFE_WEIGHTS_INDEX_NAMESAFE_WEIGHTS_NAMEWEIGHTS_INDEX_NAMEWEIGHTS_NAMEXLA_FSDPV2_MIN_VERSIONPushInProgresscan_return_losscheck_torch_load_is_safefind_labelsis_accelerate_availableis_datasets_availableis_in_notebookis_peft_availableis_sagemaker_dp_enabledis_sagemaker_mp_enabledis_torch_hpu_availableis_torch_mlu_availableis_torch_musa_availableis_torch_neuroncore_availableis_torch_npu_availableis_torch_xla_availablelogging)requires)QuantizationMethod)NotebookProgressCallbackF)smp_forward_backwardsmp_forward_onlysmp_nested_concat)	PeftModel)Acceleratorskip_first_batches)AcceleratorState)	DataLoaderConfigurationDistributedDataParallelKwargsDistributedTypeGradientAccumulationPluginload_fsdp_modelload_fsdp_optimizerrelease_memorysave_fsdp_modelsave_fsdp_optimizer)clear_device_cache)DeepSpeedSchedulerWrapperztraining_args.binztrainer_state.jsonzoptimizer.ptz	scaler.ptzoptimizer.binzscheduler.ptpytorch_model_fsdp)torch
accelerate)backendsc                   6   e Zd ZdZddlmZmZmZmZm	Z	m
Z
mZ 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                  z  dz  dedz  dedz  d	d
dddeez  ez  ez  dz  dedef   dz  dedz  deegef   dz  dee   dz  deej8                  j:                  dz  ej8                  j<                  j>                  dz  f   dee ej8                  j:                     ee!e"f   f   dz  deejF                  ejF                  gejF                  f   dz  fdZ$ddZ%dee!e"f   fdZ&ddZ'de(fdZ)dde!e*z  dz  de(fdZ+de*de(fdZ,de(de-fd Z.	 	 	 dd!e*d"e!d#e-d$ee*gej^                  j`                  jb                  f   dz  d%e2d&e!dz  de(fd'Z3dd	e*dz  dej^                  j`                  jb                  dz  fd(Z4de*dej^                  j`                  jb                  dz  fd)Z5dd*Z6	 dd!d+d"e!dz  dd+fd,Z7dded"e!dz  defd-Z8d.e-ddfd/Z9dej8                  j:                  fd0Z:	 dd.e-d1ej8                  j:                  dz  dej8                  j<                  jv                  fd2Z<e=ddededz  dee"e"f   fd3       Z>dej                  dee!   fd4Z?de@fd5ZA	 	 	 dd6e!e2z  dz  d7d8d9ee!   dz  deBfd:ZC	 	 	 	 	 dd#e-dz  dedz  d6e!dz  d7d8d9ee!   dz  deBfd;ZD	 ddej                  d<ee!ejF                  e"z  f   d=ejF                  e-z  dz  dejF                  fd>ZE	 	 ddej                  d<ee!ejF                  e"z  f   d?e2d=ejF                  e-z  dz  dejF                  eejF                  e"f   z  f
d@ZFdeGj                  fdAZIddBe2dz  deGj                  fdCZK	 ddDejF                  dEejF                  e@z  dz  dej                  d7d8dFe@d9ee!   dz  dGe@dHe@dz  ddfdIZLdJeMdKe-dLej                  deeejF                  e-z  dz  f   fdMZOdNedLej                  dejF                  e-z  dz  fdOZPdPejF                  e"z  dejF                  e"z  fdQZQd<ee!ejF                  e"z  f   dee!ejF                  e"z  f   fdRZRdej                  d<ee!ejF                  e"z  f   deeee!ejF                  e"z  f   f   fdSZSdede(dTe-dee-e-e-e-e2e-dz  e-f   fdUZTdede-fdVZUde-fdWZVde-fdXZWde-fdYZXddej                  dZe2de(dz  dej                  fd[ZY	 	 	 dde*ee!e*f   z  dz  d\ee!   dz  d]e!dee!e@f   fd^ZZ	 	 	 dde(d"e!d_e2dz  d\ee!   dz  d]e!de[fd`Z\	 dde*d\ee!   dz  d]e!de]fdaZ^	 ddej                  d<ee!ejF                  e"z  f   d_e2d\ee!   dz  deejF                  dz  ejF                  dz  ejF                  dz  f   f
dbZ_	 dd7d8d9ee!   dz  dce2dee!e@f   fddZ`d7d8de!fdeZadej                  d7d8ddfdfZbdgee!e@f   d7d8de2fdhZcdie!ddfdjZddie!ddfdkZedie!ddfdlZfdd6e!dej                  dz  ddfdmZgddnZhdoe!dz  ddfdpZidoe!dz  ddfdqZjdoe!dz  ddfdrZkddsZldte"ddfduZmddie!dz  dve2ddfdwZnddie!dz  dxedz  ddfdyZoddzee!e@f   dGe@dz  ddfd{Zpdd|Zqd<ee!ejF                  e"z  f   de-fd}Zrdd~e!dz  ddfdZs	 	 	 	 	 	 	 	 	 dde!dz  de!dz  de!ee!   z  dz  de!dz  de!dz  de!ee!   z  dz  de!ee!   z  dz  d!e!ee!   z  dz  de!ee!   z  dz  ddfdZt	 	 	 	 dde!dz  de2d~e!dz  de!dz  deuf
dZvde!ddfdZwddZx	 	 	 	 	 	 ddedgee!e@f   f   dz  deee!e@f   ge@f   dz  de-de!ee!   z  de!eyz  dz  dedge!f   dz  dezeez   z  fdZ{dd7d8dej                  fdZ|ddZ}d7d8de-dgee!e@f   ddfdZ~de!ddfdZde e   ez  ddfdZde e   ez  dedz  fdZde e   ez  ddfdZde2fdZde2fdZdej                  dLej                  ddfdZy)Traineru  
    Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.

    Args:
        model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*):
            The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.

            <Tip>

            [`Trainer`] is optimized to work with the [`PreTrainedModel`] provided by the library. You can still use
            your own models defined as `torch.nn.Module` as long as they work the same way as the 🤗 Transformers
            models.

            </Tip>

        args ([`TrainingArguments`], *optional*):
            The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the
            `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided.
        data_collator (`DataCollator`, *optional*):
            The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will
            default to [`default_data_collator`] if no `processing_class` is provided, an instance of
            [`DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or tokenizer.
        train_dataset (`torch.utils.data.Dataset` | `torch.utils.data.IterableDataset` | `datasets.Dataset`, *optional*):
            The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the
            `model.forward()` method are automatically removed.

            Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
            distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a
            `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
            manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
            sets the seed of the RNGs used.
        eval_dataset (`torch.utils.data.Dataset` | dict[str, `torch.utils.data.Dataset`] | `datasets.Dataset`, *optional*):
             The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
             `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
             dataset prepending the dictionary key to the metric name.
        processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model.
        model_init (`Callable[[], PreTrainedModel]`, *optional*):
            A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
            from a new instance of the model as given by this function.

            The function may have zero argument, or a single one containing the optuna/Ray Tune trial object, to
            be able to choose different architectures according to hyper parameters (such as layer count, sizes of
            inner layers, dropout probabilities etc).
        compute_loss_func (`Callable`, *optional*):
            A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
            batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618) used by [`Trainer`].
        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
            The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
            a dictionary string to metric values. *Note* When passing TrainingArgs with `batch_eval_metrics` set to
            `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered
            after the last eval batch to signal that the function needs to calculate and return the global summary
            statistics rather than accumulating the batch-level statistics
        callbacks (List of [`TrainerCallback`], *optional*):
            A list of callbacks to customize the training loop. Will add those to the list of default callbacks
            detailed in [here](callback).

            If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
        optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], dict[str, Any]]`, *optional*):
            A tuple containing the optimizer class and keyword arguments to use.
            Overrides `optim` and `optim_args` in `args`. Incompatible with the `optimizers` argument.

            Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before initializing the Trainer.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
            A function that preprocess the logits right before caching them at each evaluation step. Must take two
            tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
            by this function will be reflected in the predictions received by `compute_metrics`.

            Note that the labels (second parameter) will be `None` if the dataset does not have them.

    Important attributes:

        - **model** -- Always points to the core model. If using a transformers model, it will be a [`PreTrainedModel`]
          subclass.
        - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
          original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`,
          the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the inner
          model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`.
        - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
          data parallelism, this means some of the model layers are split on different GPUs).
        - **place_model_on_device** -- Whether or not to automatically place the model on the device. Defaults to
          `True` unless model parallel, DeepSpeed, FSDP, full fp16/bf16 eval, or SageMaker MP is active. Can be
          overridden by subclassing `TrainingArguments` and overriding the `place_model_on_device` property.
        - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while
          in `train`)

    r   )get_learning_ratesget_num_trainable_parametersget_optimizer_grouplog_metricsmetrics_formatsave_metrics
save_stateNNNmodelargsdata_collatortrain_datasetz3Dataset | IterableDataset | datasets.Dataset | Noneeval_datasetz6Dataset | dict[str, Dataset] | datasets.Dataset | Noneprocessing_class
model_init.compute_loss_funccompute_metrics	callbacks
optimizersoptimizer_cls_and_kwargspreprocess_logits_for_metricsc                 n   |'d}t         j                  d| d       t        |      }|| _        | j                  j                  rt        | j                  j                        nt        | j                  j                         d | _        || _	        | j                          t        | j                  j                        | _        | j                  j                          |j                         }t!        j"                  |       |j$                   |%||| _        | j)                         }nt+        d      |t-        d      || _        |j.                  j0                  t2        v r#t-        d|j.                  j0                   d      | j                  j4                  r t7        || j                  j8                         t;        |       d	| _        t?        |d
d       tA        |jB                  jE                               D cg c]	  }|dvs| }}tG        |      dkD  rd| _        nBtG        |      dk(  r4| j                  jH                  tK        jH                  |d         k7  | _        |jL                  d   | _'        tG        |jP                        dkD  rN| jR                  rt-        d      |jL                  d   s(|jT                  tV        jX                  k7  rt-        d      |jZ                  |jZ                  | _-        na| j<                  sF| jR                  s:|j\                  s.|j^                  s"| jN                  s| j`                  s
tc               rd	| _-        nd| _-        | jZ                  r:t?        |dd       td        jf                  k7  r| ji                  ||jH                         | j<                  rd| j                  _5        || _6        || _	        to        | jp                  js                  |            }tu        |d      r|jv                  | _<        nNt{        j|                  |j~                        j                  }t        d |jE                         D              | _<        t?        | jp                  dd       }|"|j                  dk(  r|j                  rd	| _<        to        | j                        }t        |j.                        }| j                  j                  |n| j                  j                  | _E        t        |j.                        | _F        | j                  j                  dk7  rlt?        | j                  j                  dd       dk(  r"t        j                  dt               d | _L        n-t        | j                  j                        | _L        nd | _L        |!t        |t        t        f      rt        |      nt        }||n|| _S        || _T        || _U        || _V        |j                  | _W        || _X        |	| _Y        || _Z        |\  | _[        | _\        || _]        | j                          t        t        | j                  j                        z   }| j                  j                  r$ddlcmd}  |       }||gz   }|j                  |        |
|n||
z   }
t        |
| j                  | j                  | j                  | j                        | _g        | j                  | j                  j                  rt        nt               d | _l        | j                  j                  r| j                          | j                  j                  r+t        j                  | j                  j                  d       t               | _t        t        | j                         | j                         | j                  j                  | j                  gz   D cg c]  }t        |t              s| c}      | _z        d	| _{        d | _|        d | _}        d| _~        d	| _        d | _        |j                  | _        d	| _        | j                  j	                  | j                  | j                  | j                        | _t        t?        | j                  dd       0| j                  j
                  | j                  j                  _        |jL                  j                  dd	      | _        | j                  rxt        st-        d       t        j                         }t        j                  t        j                  t        j                  t!        |            |dfd!"             | jN                  xr | j                   | _        | j                  j%                          y c c}w c c}w )#Ntmp_trainerz1No `TrainingArguments` passed, using `output_dir=z`.)
output_dirz<`Trainer` requires either a `model` or `model_init` argumentzK`Trainer` requires either a `model` or `model_init` argument, but not both.zThe model you have picked (a  ) cannot be used as is for training: it only computes hidden states and does not accept any labels. You should choose a model with a head suitable for your task like any of the `AutoModelForXxx` listed at https://huggingface.co/docs/transformers/model_doc/autoFhf_device_map)cpudiskr   Tr   xlazZUsing --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags.z.Using fsdp only works in distributed training.quantization_methodaccepts_loss_kwargsc              3   j   K   | ]+  }|j                   t        j                  j                  k(   - y wN)kindinspect	ParameterVAR_KEYWORD).0ks     P/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/trainer.py	<genexpr>z#Trainer.__init__.<locals>.<genexpr>  s*      1<='++7771s   13parallelism_config	deepspeedproblem_typemulti_label_classificationzsLabel smoothing is not compatible with multi-label classification. Disabling label smoothing for this training run.)epsilon)JITCheckpointCallbackexist_ok)is_local_process_zerois_world_process_zerostateful_callbacksconfigxla_fsdp_v2z*FSDPv2 requires `torch_xla` 2.2 or higher.)fsdptensor)
axis_names)loggerinfort   r   full_determinismre   seedrm   r   r   "create_accelerator_and_postprocessr^   skip_memory_metrics_memory_trackerstartget_process_log_levelr   set_verbosity_setup_devicesr   call_model_initRuntimeError
ValueError	__class____name__r6   use_liger_kernelr+   liger_kernel_configrq   is_model_parallelgetattrsetr   valueslendevicer   fsdp_configis_fsdp_xla_enabledr   is_deepspeed_enabledparallel_moders   DISTRIBUTEDplace_model_on_devicefp16_full_evalbf16_full_evalis_fsdp_enabledr   r   BITS_AND_BYTES_move_model_to_device_n_gpumodel_wrappedrp   acceleratorr4   hasattrr   model_accepts_loss_kwargsr   	signatureforward
parametersany
sp_backend
sp_enabledr   label_namesr   label_smoothing_factorr   warningswarnUserWarninglabel_smootherrH   
isinstancer9   r   r   r   r   r   r   r   neftune_noise_alphar   r   r   	optimizerlr_schedulerr   _validate_argsDEFAULT_CALLBACKSr   	report_toenable_jit_checkpointtrainer_jit_checkpointr   set_trainerr:   callback_handleradd_callbackdisable_tqdmr=   DEFAULT_PROGRESS_CALLBACKhub_model_idpush_to_hubinit_hf_reposhould_saveosmakedirsr   r@   controlrA   r   r   r   r<   stateis_in_trainhp_namehp_search_backendcurrent_flos_loggers_initialized_signature_columnstrain_batch_size_train_batch_size_created_lr_scheduleron_init_end	use_cachegetis_fsdp_xla_v2_enabledIS_XLA_FSDPV2_POST_2_2xrglobal_runtime_device_countxsset_global_meshMeshnparrayrangeis_fsdp_xla_v1_enabledstop_and_update_metrics)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   	log_levelr   devicesunwrapped_modelforward_paramspcmodel_to_inspectdefault_label_namesdefault_collatordefault_callbacksr   jit_callbackcbnum_devicess                                r   __init__zTrainer.__init__k  s   D <&JKKKJ<WYZ[$
;D	37993M3M		/S[\`\e\e\j\jSk
 
//13DII4Q4QR""$..0	i( =%",,,."#abb% !noo(DO??##'::-eoo.F.F-G HJ J  99%%udii&C&CD*51 "'5/40<,/0C0C0J0J0L,Mo&QW_nQnvoGo7|a)-&W")-)9)9U\\'RS*=U)U&#'#3#3E#: tyy>A(( p  ##E*t/A/A\E]E]/] !QRR %%1)-)C)CD&""((##t':':''##&().D&)-D&
 &&4d;?Q?`?``&&udkk: !! DII #
 ,D,<,<,I,I%,PQ?$9:-<-P-PD*$../F/FGRRN-0 1AOAVAVAX1 .D*
 T%%';TB>bmm{:r}}-2D*,TZZ8)*:*D*DE26))2G2G2O.UYU^U^UjUj./?/I/IJ99++q0tzz((.$?C__G
 '+#&3DII<\<\&]#"&D  ++.EG_-`a $$45 '	 	 /<.G]M]*( 0#'#;#;  "3.-J* -7))(@% .0STXT]T]TgTg0hh99**E02L 1\N B$$T*)2):%@QT]@]	 /tzz4#8#8$..$J[J[!
 	TYY-C-C/Ibc !99  99  KK		,,t< &'!"&"<"<">"&"<"<">!22<<~M Q[\^`oQp 

 !!%$)!"&!%!6!6%*",,88DJJPTP\P\] 4::x.:*.))*=*=DJJ'&*&6&6&:&:=%&P#&&) !MNN88:Krwwrxxk0B'CkSTEUbtuv&*&>&>&btGbGbCb#446K pJ s   	f-f-f2-f2returnc                    | j                   }t               r|j                  rt        d      |j                  t
        j                  j                  j                  k7  rt        j                  dt
        j                  j                  j                   d|j                   dt
        j                  j                  j                          t
        j                  j                  j                  |_        |j                  rB| j                  6dt        j                  | j                        j                  vrt        d      |j                  4|j                  dk7  r%| j                   t        d	|j                   d
      |j"                  t$        j&                  k(  s|j(                  r|j*                  t        d      | j,                  | j.                  t1        d      | j2                  #| j.                  | j4                  t1        d      t7               r| j.                  || j8                  j                         D ]  }|j:                  } n | j.                  j<                  D ]'  }t?        |d         dkD  s|d   d   j:                  } n k7  rt        d      | j@                  s| jB                  r#| j.                  | j4                  t1        d      tE        | jF                        s+tE        tI        | jF                  dd            rtK        d      |jL                  dkD  r$|jN                  dkD  rt        jQ                  d       | jR                  /tU        | jR                        s|jL                  dk  rt        d      | jR                  ]tW        | jR                  tX        jZ                  j\                  j^                        r$t        jQ                  d|j`                   d       yyy)zJValidate constructor arguments and fail fast on incompatible combinations.zOSageMaker Model Parallelism does not support BF16 yet. Please use FP16 instead z(FP16 provided in SM_HP_MP_PARAMETERS is z+, but FP16 provided in trainer argument is z, setting to Ncompute_resultzWhen using `batch_eval_metrics`, your `compute_metrics` function must take a `compute_result` boolean argument which will be triggered after the last batch of the eval set to signal that the summary statistics should be returned by the function.noz%You have set `args.eval_strategy` to zx but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. z`args.metric_for_best_model` must be provided when using 'best' save_strategy or if `args.load_best_model_at_end` is set to `True`.zSPassing both `optimizers` and `optimizer_cls_and_kwargs` arguments is incompatible.zPassing a `model_init` is incompatible with providing the `optimizers` argument. You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method.paramsr   a[  The model and the optimizer parameters are not on the same device, which probably means you created an optimizer around your model **before** putting on the device and passing it to the `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and `model.to(xm.xla_device())` is performed before the optimizer creation in your script.zPassing `optimizers` is not allowed if PyTorch FSDP is enabled. You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method.collate_batchzRThe `data_collator` should be a simple callable (function, class with `__call__`).zHmax_steps is given, it will override any value given in num_train_epochszThe train_dataset does not implement __len__, max_steps has to be specified. The number of steps needs to be known in advance for the learning rate scheduler.zThe `train_sampling_strategy='z'` option is ignored when using an `IterableDataset`. Samplers cannot be used with IterableDataset as they require indexed access to the dataset.)1r   r   bf16r   fp16smpr,  cfgr   warningbatch_eval_metricsr   r   r  r  eval_strategyr   save_strategyr]   BESTload_best_model_at_endmetric_for_best_modelr   r  r   r   r  r   r   r   param_groupsr   r   r  callabler   r   	TypeError	max_stepsnum_train_epochsr   r   rh   r  r   utilsdatar   train_sampling_strategy)rE  r   parammodel_deviceparam_groupoptimizer_devices         r   r  zTrainer._validate_argsg  s   yy #$yy !rssyyCIIMM...>syy}}?Q?Q>R S@@D		{ K""%))--"4"4!57
  IIMM..	 ""t';';'Gw'8'89M9M'N'Y'YY N 
 )d.@.@D.HTM^M^Mf78J8J7K  LD  E  !2!22d6Q6Q))1  Z 
 ((49Stuu??&DNN,F$J[J[Jgj  "#(B..0 $||  $~~:: {8,-1'28'<Q'?'F'F$ // n  $$(<(<NN&$*;*;*Gj  **+ASASUdfj9k0lpqq>>A$"7"7!";KKbc)*T=O=O2PUYUcUcghUhd 
 )j9K9KU[[M]M]MmMm.nKK01M1M0N On n /o)    c                 ~   | j                   j                  | j                   j                  d}|j                  |       | j                   j                  5d}t        |      st        d| d      | j                   j                  |d<   d| _        t        | j                  dd      | j                  j                  d	kD  rd
| _        | j                   j                  St        d      r=| j                   j                  vddlm}  || j                  j                        |d<   nOt        d      |d   j                  | j                  j                  k7  r| j                  j                  |d   _
        t        d      r=ddlm}  || j                   j                   | j                   j"                        }||d<   |S )z>Helper method to build accelerator-specific keyword arguments.)mixed_precisiondeepspeed_pluginNz1.12.0z'ParallelismConfig requires accelerate>=z1). Please upgrade accelerate to use this feature.r   Ftp_sizer   Tr   )ParallelismConfig)rt  z5Requires accelerate>1.12.0 to use Tensor Parallelism.z1.2.0)TorchDynamoPlugin)backendmodedynamo_plugin)r   rr  rs  updater   r   ImportErroris_tp_enabledr   r   rt  r   ru  r   accelerate.utilsrv  torch_compile_backendtorch_compile_mode)rE  kwargsr   min_accelerate_versionru  rv  ry  s          r   _build_accelerator_argszTrainer._build_accelerator_args  s     $yy88 $		 : :
 	F 99''3%-"*+AB!=>T=U  VG  H  *.)E)ED%&"4::y$/;

@R@RUV@V!%Dyy++3*84yy33;@5FtzzOaOa5b12$%\]]*+33tzz7I7II59ZZ5G5G)*2"7+:-		77dii>Z>ZM %2D!rp  c                 0	   i }| j                   j                  j                   | j                   j                  j                  }d|v r9| j                   j                  dkD  rt	        d      |d   | j                   _        n| j                   j                  |d<   t        di |}| j                   j                  j                         }g d}t        di |D ci c]  }||j                  |       c}}| j                   j                  |_	        |j                  d      }|r+| j                   j                  st        j                  d       ||_        |j                  d       d}| j                   j                  "d	d
lm}	  |	di | j                   j                  }| j#                  |||      }
t%        di |
| _        | j&                  j(                  | _        dt-        j.                  | j*                        j0                  v r:t3        j4                  | j*                  | j                   j6                        | _        t9        | j&                  j:                  dd      du| _        t9        | j&                  j:                  dd      du| _        | j>                  r| j&                  j:                  j@                  }dD ]=  }tC        ||| j                   jD                  jG                  |t9        ||                   ? |jH                  r!| j                   jJ                  rt	        d      | j<                  r7t9        | j                   dd       tM        | j&                  | j                          | j                   jN                  rL| j<                  s| j>                  r4| j                   jP                  r| j<                  rdnd}t	        | d      | j<                  rN| j&                  j:                  jR                  jT                  dk(  r!| j                   jV                  rt	        d      | j                   jN                  rN| j>                  rAdtY        | j&                  j:                  j@                  jZ                        v rt	        d      yyyc c}w )zOCreate the accelerator and perform post-creation setup (FSDP, DeepSpeed, etc.).N	num_stepsr   zThe `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`.)split_batchesdispatch_batcheseven_batchesuse_seedable_samplernon_blockingzx`non_blocking` is enabled but `dataloader_pin_memory` is not. For the best performance, it's recommended to enable both.gradient_accumulation_kwargsr   )FullyShardedDataParallelPlugin)dataloader_configfsdp_plugingradient_accumulation_pluginuse_gather_object)r  rs  r  )limit_all_gathersactivation_checkpointingzThe activation_checkpointing in FSDP config and the gradient_checkpointing in training arg can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic when using FSDP.hf_deepspeed_config	DeepSpeedFSDPzJ can't be used with `save_only_model` along with `load_best_model_at_end`.   zo`auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3. Please consider using Zero-2, Zero-1, or FSDPSHARDED_STATE_DICTzWsave_only_model option is not compatible with FSDP state dict type 'SHARDED_STATE_DICT' ).r   accelerator_configr  gradient_accumulation_stepsr   r   to_dictr   pop	data_seeddataloader_pin_memoryr   r]  r  fsdp_plugin_argsr}  r  r  r   r  gather_for_metricsgather_functionr   r  r  	functoolsr   eval_use_gather_objectr   r,  r   r  r  setattrr   r8  r  gradient_checkpointingr(   save_only_modelrb  rs  
zero_stageauto_find_batch_sizestrstate_dict_type)rE  grad_acc_kwargsr  r  dataloader_paramsrl  r  r  r  r  r   wrappers               r   r   z*Trainer.create_accelerator_and_postprocess  s    99''DDP"ii::WWO /)yy44q8 L 
 9H8T		5+/99+P+POK(
 (B'TO'T$!YY99AAC j3 
ARSu(,,U33S
 '+ii&9&9#)--n=		 ? ?NN K *6&=>99%%1G8V499;U;UVK++/#)E , 
 '..#//BB'"3"3D4H4H"I"T"TT#,#4#4$$		8X8X$D 
 %,D,<,<,B,BDVX\$]ei$i!&t'7'7'='=}dS[__ **00<<KJ kUDII,A,A,E,EeWU`bgMh,ijk33		8X8X '  $$<QSW)X)`'(8(8$))D II%%**d.B.B		00%)%>%>kFGy(rstt %%  &&77BBaG		.. B  II%%$$$D,<,<,B,B,N,N,^,^(__vww ` % &U Ts   Rc                     | j                   t        d      | j                  | j                   d| j                  | j                  d      S )a@  
        Returns the training [`~torch.utils.data.DataLoader`].

        Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
        training if necessary) otherwise.

        Subclass and override this method if you want to inject some custom behavior.
        z+Trainer: training requires a train_dataset.TrainingT)datasetdescription
batch_size
sampler_fnis_training)r   r   _get_dataloaderr4  _get_train_samplerrE  s    r   get_train_dataloaderzTrainer.get_train_dataloaderS  sU     %JKK##&&"--.. $ 
 	
rp  c                    || j                   t        d      t        |t              r|nd}t	        | d      r3|| j
                  v r%| j                  j                  r| j
                  |   S t        |t              r| j                   |   n||n| j                   }| j                  |d| j                  j                  | j                  |      S )a   
        Returns the evaluation [`~torch.utils.data.DataLoader`].

        Subclass and override this method if you want to inject some custom behavior.

        Args:
            eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*):
                If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. If a `Dataset`, will override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed.
        z-Trainer: evaluation requires an eval_dataset.eval_eval_dataloaders
Evaluation)r  r  r  r  dataloader_key)r   r   r  r  r	  r  r   dataloader_persistent_workersr  eval_batch_size_get_eval_sampler)rE  r   r  s      r   get_eval_dataloaderzTrainer.get_eval_dataloaderg  s     D$5$5$=LMM *4L#)FFD-.$"8"88		77)).99 ,, l+ ' "" 	 ## $yy00--) $ 
 	
rp  test_datasetc                 h    | j                  |d| j                  j                  | j                        S )a  
        Returns the test [`~torch.utils.data.DataLoader`].

        Subclass and override this method if you want to inject some custom behavior.

        Args:
            test_dataset (`torch.utils.data.Dataset`, *optional*):
                The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the
                `model.forward()` method are automatically removed. It must implement `__len__`.
        test)r  r  r  r  )r  r   r  r  )rE  r  s     r   get_test_dataloaderzTrainer.get_test_dataloader  s8     ## yy00--	 $ 
 	
rp  
dataloaderc                    	 |j                   }t        |t              rt        |j                   j                         S t        |j                         S # t        t
        t        f$ r% t        |      | j                  j                  z  cY S w xY w)z
        Helper to get number of samples in a [`~torch.utils.data.DataLoader`] by accessing its dataset. When
        dataloader.dataset does not exist or has no length, estimates as best it can
        )	r  r  rG   r   	NameErrorAttributeErrorrf  r   per_device_train_batch_size)rE  r  r  s      r   num_exampleszTrainer.num_examples  sw    
	K ((G'#78:--5566z))**>95 	Kz?TYY%J%JJJ	Ks   :A A 6B
Br  r  r  r  r  r  c                    | j                   }t               r.t        |t        j                        r| j                  ||      }n| j                  | j                   |      }t        j                  j                  j                         xr | j                  j                  dkD  }||| j                  j                  | j                  j                  | j                  j                  |rdndd}	t        |t        j                  j                   j"                        s~| ||      |	d<   | j                  j$                  |	d<   | j                  j&                  |	d<   |r=t)        t*        | j                  j                  | j                  j,                  	      |	d
<   | j.                  j1                  t3        |fi |	      }
|<| j                  j                  r&t5        | d      r|
| j6                  |<   |
S ||
i| _        |
S )zACreate a [`~torch.utils.data.DataLoader`] from the given dataset.)r  r   forkN)r  
collate_fnnum_workers
pin_memorypersistent_workersmultiprocessing_contextsampler	drop_lastprefetch_factor)r  rankworker_init_fnr  )r   r   r  datasetsr   _remove_unused_columns"_get_collator_with_removed_columnsr   r   mpsis_availabler   dataloader_num_workersr  r  ri  rj  r   dataloader_drop_lastdataloader_prefetch_factorr   rl   process_indexr  preparer   r	  r  )rE  r  r  r  r  r  r  r   should_forkr  r  s              r   r  zTrainer._get_dataloader  s    ** "z'8;K;K'L11'{1SG CCDDVDVdoCpM nn((557`DII<\<\_`<` %'99;;))99"&))"I"I1<v$
 '5;;#3#3#C#CD%/9'/B!),-1YY-K-Kk*37993W3W/06=TYY-M-MTXT]T]TkTk7!"23 %%--j.VDU.VW
 %$))*Q*Qt019C&&~6  +9*)E&rp  c                 T   || j                   }|t        |      sy| j                  j                  dk(  rt	               rXt        |t        j                        r>| j                  j                  |j                  v r|| j                  j                     nd}nd}| j                  | j                  j                  d   nd}t        | j                  j                  | j                  j                  z  |||      S | j                  j                  dk(  rt        |      S t!        |      S )z?Return the training sampler based on `train_sampling_strategy`.Ngroup_by_lengthr   r  lengthsmodel_input_name
sequential)r   rh   r   rk  r   r  r  r   length_column_namecolumn_namesr   model_input_namesrI   r3  r  r   r   )rE  r   r  r  s       r   r  zTrainer._get_train_sampler  s     ..M 
=(A 99,,0AA$&:mXEUEU+V yy33}7Q7QQ "$))">">?  >B>S>S>_%%77:ei  (		**TYY-R-RR%!1	  YY..,>$]33 //rp  c                    |t        |      sy| j                  j                  dk(  rt               rXt	        |t
        j                        r>| j                  j                  |j                  v r|| j                  j                     nd}nd}| j                  | j                  j                  d   nd}t        | j                  j                  |||      S | j                  j                  dk  rt        |      S y)zNReturn the evaluation sampler, using sequential ordering when not distributed.Nr  r   r  r   )rh   r   rk  r   r  r  r   r  r  r   r  rI   r  
world_sizer   )rE  r   r  r  s       r   r  zTrainer._get_eval_sampler  s    z,'?99,,0AA$&:lHDTDT+U yy33|7P7PP !!=!=>  >B>S>S>_%%77:ei  (		))$!1	  991$$\22rp  c                    | j                   | j                  }t        | j                        rQt        | j                  d      r| j                  j	                         }n | j                  j
                  j                  }t        j                  |j                        }t        |j                  j                               | _         | xj                   t        t        ddg| j                  z               z  c_         yy)zTPopulate `_signature_columns` from the model's forward signature if not already set.Nget_base_modellabel	label_ids)r2  r   r`   r	  r  
base_modelr   r  r  listr  keysr   r  )rE  rK  r  s      r    _set_signature_columns_if_neededz(Trainer._set_signature_columns_if_needed  s    ""*#zzdjj)4::'78'+zz'@'@'B$ (,zz'<'<'B'B$))*:*B*BCI&*9+?+?+D+D+F&GD###tC+0FIYIY0Y,Z'[[# +rp  zdatasets.Datasetc                    | j                   j                  s|S | j                          | j                  }t	        t        |j                        t        |      z
        }t        |      dkD  r|dnd| d}t        j                  d| d| j                  j                  j                   ddj                  |       d	dj                  |       d
| j                  j                  j                   d       |D cg c]  }||j                  v s| }}t        |      dk(  r0t        ddj                  |       ddj                  |       d      t        j                   t"        j$                        t        j                   d      k  r0|j'                  |j(                  d   ||j(                  d          |S |j+                  |      S c c}w )zBRemove dataset columns not accepted by the model's forward method.r    zin the z setzThe following columns z) don't have a corresponding argument in `z!.forward` and have been ignored: , z. If z are not expected by `z/.forward`,  you can safely ignore this message.zGNo columns in the dataset match the model's forward method signature: (z-). The following columns have been ignored: [zp]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.z1.4.0typeformat_kwargs)r  columnsr  )r   remove_unused_columnsr  r2  r  r   r  r   r   r   r   r   r   joinr   r   parser  r   
set_formatformatremove_columns)rE  r  r  signature_columnsignored_columnsdset_descriptionr   r  s           r   r  zTrainer._remove_unused_columns,  s    yy..N--/ 33s7#7#783?P;QQR!#%0%8r}TX>YKK()9(: ;JJ((1122STXT]T]^mTnSo pyy122HI]I]IfIfHg h77 0M18L8L3L1MMw<1YZ^ZcZcduZvYw x==AYY=W<X Y@@  ==--.w1GG^^F+WGNN[jLk   N))/:: Ns   :GGc                     | j                   j                  s|S | j                          | j                  }t	        ||t
        || j                  j                  j                        }|S )z=Wrap the data collator in a callable removing unused columns.)r   r  r   r  
model_name)	r   r  r  r2  r\   r   r   r   r   )rE  r   r  r  remove_columns_collators        r   r  z*Trainer._get_collator_with_removed_columnsO  s_    yy..  --/ 33"7'/#zz++44#
 '&rp  num_training_stepsc                 H    | j                          | j                  |       y)aZ  
        Setup the optimizer and the learning rate scheduler.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or
        `create_scheduler`) in a subclass.
        r  N)create_optimizercreate_scheduler)rE  r  s     r   create_optimizer_and_schedulerz&Trainer.create_optimizer_and_schedulera  s"     	1CDrp  c           
         t               r| j                  n| j                  }| j                  x| j	                  |      }|j                         D cg c]  \  }}||v s|j                  s| c}}| j                  j                  d|j                         D cg c]  \  }}||vs|j                  s| c}}ddg}| j                  | j                  \  }}n| j                  | j                  |      \  }}t        |      r  |       |fi || _        nMd|v r|j                  d      }d|v r|j                  d      }d|v r|j                  d      } ||fi || _        dt        |      v r.|j                  dd      d	k(  rd
dl}|j                   j"                  j%                         }	d
}
|j'                         D ]  }t)        |t*        j,                        s|
t/        |j1                         D ci c]!  }|j3                         |j5                         # c}j7                               z  }
t8        j;                  d| d|
dz   d       |	j=                  |dddi       t8        j?                  d| d        t8        j;                  d|
dz   d       t               r$tA        jB                  | j                        | _        | j                  S c c}}w c c}}w c c}w )aO  
        Setup the optimizer.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through `optimizers`, or subclass and override this method in a subclass.

        Returns:
            `torch.optim.Optimizer`: The optimizer instance.
        N)rW  weight_decay        rW  r   optimizer_dictbitsandbytes
optim_bits   r   zskipped z: i   zM paramsweight    zbitsandbytes: will optimize z in fp32z	skipped: )"r   r  r   r  get_decay_parameter_namesnamed_parametersrequires_gradr   r  r   get_optimizer_cls_and_kwargsrE   r  r  r8  r  optimGlobalOptimManagerget_instancemodulesr  r   	Embeddingsumr  data_ptrnumelr   r   r   register_module_overridedebugr[  DistributedOptimizer)rE  	opt_modeldecay_parametersnpoptimizer_grouped_parametersoptimizer_clsoptimizer_kwargsr  managerskippedmodules               r   r
  zTrainer.create_optimizerl  s    +B*CD&&	>>!#==iH '0&@&@&B"aqL\G\abapap %)II$:$:	 '0&@&@&B"aqP`G`efetet %(	,( ,,8262O2O//262S2STXT]T]_h2i// $M2!0!O>N!O
 //3C3G3G3Q0 ..3C3G3G3P0 $'773C3G3GHX3Y0!./K!`O_!`]!338H8L8L\[_8`de8e#&,,??LLN'//1 VF!&",,73IZIZI\']A

aggi(?']'d'd'f#gghvhb58I$RS88LZ\K]^'CF88%TUV i%'8AB"$ 55dnnEDN~~uV (^s*   K"!K".K"K(,K(9K( &K.r  c                    | j                   |Qt               r;t        j                  j                  j
                  r| j                  j                  }n| j                  }t        | j                  j                  || j                  j                  |      || j                  j                        | _         d| _        | j                   S )ad  
        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
        passed as an argument.

        Args:
            num_training_steps (int): The number of training steps to do.

        Returns:
            `torch.optim.lr_scheduler.LRScheduler`: The learning rate scheduler instance.
        )r  num_warmup_stepsr  scheduler_specific_kwargsT)r  r   r[  r,  r\  rZ  r  r7   r   lr_scheduler_typeget_warmup_stepslr_scheduler_kwargsr5  )rE  r  r  s      r   r  zTrainer.create_scheduler  s     $ *,1C1C $ 8 8I $I -		++#!%!;!;<N!O#5*.))*G*G!D *.D&   rp  c           	      (   t        | |d| j                  i| j                  | j                  f| j                  dt        | j                              }t        j                  | j                        }|t        d| j                          ||      S )a  
        Returns the optimizer class and optimizer parameters based on the training arguments.

        Args:
            args (`transformers.training_args.TrainingArguments`):
                The training arguments for the training session.
            model (`PreTrainedModel`, *optional*):
                The model being trained. Required for some optimizers (GaLore, Apollo, LOMO).

        Returns:
            A tuple containing the optimizer class and a dictionary of optimizer keyword arguments.
        lr)betaseps)r   r   r+  adam_kwargs
optim_argsz2Trainer cannot instantiate unsupported optimizer: )rC   learning_rate
adam_beta1
adam_beta2adam_epsilonrD   r:  rB   r8  r  r   )r   r   ctxhandlers       r   r  z$Trainer.get_optimizer_cls_and_kwargs  s     "D$6$67//4??;(( )9	
 &))$**5?QRVR\R\Q]^__s|rp  c                 F    g d}t        |t        j                  g|      }|S )a-  
        Get all parameter names that weight decay will be applied to.

        This function filters out parameters in two ways:
        1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)
        2. By parameter name patterns (containing 'bias', or variation of 'norm')
        )bias	layernormrmsnormz(?:^|\.)norm(?:$|\.)z_norm(?:$|\.))rM   r   	LayerNorm)rE  r   forbidden_name_patternsr&  s       r   r  z!Trainer.get_decay_parameter_names  s'     #q.ur||nF]^rp  c                    | j                   r	 | j                  j                         d   }nrt        | j                  t        j                  j                  j                        r| j                  j                  d   d   }n| j                  j                         d   }t        j                  |      r|j                         }|S # t        $ r0}dt	        |      v rt
        j                  d       d}n Y d}~[d}~ww xY w)z
        Returns the current learning rate from the scheduler.

        Handles DeepSpeed's dynamic loss scaling warmup period where `get_last_lr` may fail.
        r   zneed to call stepzQtried to get lr value before scheduler/optimizer started stepping, returning lr=0Nr6  )r   r  get_last_lrAssertionErrorr  r   r]  r  r   r  ReduceLROnPlateaur  rd  	is_tensoritem)rE  last_lres      r   _get_learning_ratezTrainer._get_learning_rate  s     $$++779!< $++U[[-E-E-W-WX..55a8>++779!<??7#llnG " &#a&0NN#vwG s   C 	C>&C99C>resume_from_checkpointtrial$optuna.Trial | dict[str, Any] | Noneignore_keys_for_evalc                    |du rd}| j                   j                          | j                  }d| _        t	        | j
                  t        t        f      r6t        | j                  d      r t        | j                  | j
                         | j                  0t        | j                  | j                  | j                        | _        |j                  s|j                   r>| j"                  s2| j$                  &| j'                  | j                  |j(                         | j+                  |       | j                  j,                  | _        d}| j$                  {| j                  j0                  rt3        | j                  j4                        nt7        | j                  j4                         | j9                  |      | _	        d}d\  | _        | _        t	        |t>              r2|r0tA        |jB                        }|tE        d|jB                   d      |tG               s)| jH                  s| jJ                  s| jM                  |       tO        jP                  tR        jT                  jW                  |tX                    }|j,                  |jZ                  r|j,                  | _        |rC| j\                  r&| j'                  | j                  |j(                         | j                  | _/        ta        | jb                  | j.                  |jZ                        }|jd                  r5	 tg        jh                           |||||      tg        jj                          S  |||||      S # tg        jj                          w xY w)	a  
        Main training entry point.

        Args:
            resume_from_checkpoint (`str` or `bool`, *optional*):
                If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a
                `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance
                of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here.
            trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
                The trial run or the hyperparameter dictionary for hyperparameter search.
            ignore_keys_for_eval (`list[str]`, *optional*)
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions for evaluation during the training.

        Returns:
            [`~trainer_utils.TrainOutput`]: Object containing the global step count, training loss, and metrics.
        FNTr   r   z/No valid checkpoint found in output directory ())r   rP  rQ  rS  )6r   r   r   r-  r  r   r9   r8   r	  r   ra   r  r,   r  neftune_hook_handler  r  r   r   r  r   _hp_search_setupr3  r4  r   re   r   rm   r   r  r  boolrg   r   r   r   r   r  _load_from_checkpointrA   load_from_jsonr)  pathr  TRAINER_STATE_NAMEr  r   r  rf   _inner_training_loopr&  hf_hub_utilsdisable_progress_barsenable_progress_bars)rE  rP  rQ  rS  r   model_reloadedr,  inner_training_loops           r   trainzTrainer.train  s   . "U*%)" 	""$yy d++.E~-VW\cJJ]
 !T-B-BC ##/'7

DD\D\^b^n^n'oD$ 4#6#6@V@V[_[j[j[r&&tzz4;;? 	e$!%!;!; ??&7;yy7Q7Q#DIINN3W_`d`i`i`n`nWo--e4DJ!N0:-DND- ,d38N%8%I"%- #RSWSbSbRccd!eff!-*,T5N5NW[WkWk**+AB //=SUg0hiE %%1d6O6O).)?)?& ))**4::t{{C!%D8%%t'='=t?X?X
 
4224*+A)=	 113&'=%9	  113s   %M% %M;c                 ^1   | j                   j                          || _        | j                  j                  r| j
                  j                  | j                  k7  rt        | j                         | j                  | _        | j                  r| j                  j                  }| j                  t        d| j                  j                        z  | j                  _        t        | j                   | j                  d       || j                  _        | j                  | j
                  _        t        j!                  d| j                          | j#                         }| j$                  rt'        |      }| j)                  |      }| j+                  |||      \  }	}
}}}}}t,        j.                  | j                  j                   v r9| j                  j                  dkD  rt1        d      t3        | j                         t5               xs | j6                  xs | j8                  }| j8                  xr. t;        | j                   j
                  j<                  dd      dk(  }|rd}| j>                  rd	| _         d| _        | j                  rtC        | |
      \  | _"        | _         |s| jG                          tI        | jJ                  jL                  | jN                  gz   D cg c]  }tQ        |tR              s| c}      | _        |d	u| j
                  _*        | j                  | j
                  _        | j
                  jW                  ||       |jX                  r&| j                  j[                  |j\                         | j_                  | j                        }|| j                  u }|r(| j8                  rta        | j                  d      | _	        |r|rz| j8                  r+tc        |      r te        | j                  | j                          | j                   jf                  dk7  r*| j                   ji                  | j                        | _	        | jG                          |r| j                  jk                          | j                  rddl6m7} tQ        | j@                  |      rJ| j                   ji                  | j                  | jD                  | j@                        \  }| _"        | _         n| j                   ji                  | j                  | jD                        \  }| _"        nc| j                   ji                  | j                  | jD                        \  }| _"        n*| j                   ji                  | jD                        | _"        | jq                  |
       t;        | j                   dd	      }|7|jr                  dk(  r(|jt                  r| j                   jw                  ||      }| j8                  rN|x| _	        | _        ty        | j                  d      r*tz        j|                  j                  | j                  d       || j                  ur|| _        | j                  r| j                  | _@        |k| j                  r-t        | j                  |tc        | j                                n2t5               s| j8                  r| j                  || j                         | j                  |       | j                  |       t        j                  d       t        j                  d|d       t        j                  d|	d       t        j                  d| j                  j                  d       | j                  j                  | j                  k7  r#t        j                  d| j                  d       t        j                  d|d       t        j                  d|j                          t        j                  d|d       t        j                  dt        |d      d       d| j
                  _H        t        j                         }| j
                  j                  | _K        d}d}|t        j                  j                  t        j                  j                  |t                    rFtI        j                  t        j                  j                  |t                    | _        t        | j                  | j
                         | j                          t        | j
                  j                  |
z        }|j                  s)| j
                  j                  |
z  }||j                  z  }nd}t        j                  d        t        j                  d!|        t        j                  d"| j
                  j                          |j                  st        j                  d#| d$| d%       d&D ]#  }t        | jJ                  |t;        | |             % || jJ                  _X        | j
                  j                  | ||	|       t        j                  d'|j                  (      }d'| _]        | j
                  j                  | _^        |j                          d	}d	}| jJ                  j                  || j
                  | jN                        | _'        |j                  r| j                  ||d)       t        ||	      D 
]  }|}|t        |      n|j                  |j                  z  } | jJ                  j                  || j
                  | jN                        | _'        d*}!d}"||k(  r=|;|dkD  r |j                  st        ||      }|dz
  }!d}"n|dk(  r| j                  |       ty        |d+      r|j                  |       t        |      }#| |j                  z  }$|$dk(  r|j                  }$d*}%| |j                  z  t        |$|j                  k        z   }&t        |&      D ]]  }'|%dz  }%|%|&dz
  k7  r|j                  n|$}(| j                  |#|(|j                        \  })}*t        |)      | _l        t        |)      D ]  \  }+},|!dz  }!|!dz   |j                  z  dk(  xs |!dz   | k(  }-| j                   j                  j                  |-       | j                  j                  d,k7  rt;        | j                  d-d.      }.|.|,vrt        j                  d/       n^| j                  j                  d0k(  rd1|,v r|,d1   j                         }/n| j                  Wty        | j                  d2      rA| j                  j                  +|,|.   | j                  j                  k7  j                         }/n<t        j                  d3       |,|.   j                         }/n|,|.   j                         }/t        j                  |/| j                  j                  t        j                  4      }/| j
                  xj                  | j                   j                  |/      j                         j                         z  c_J        |"r| j                  |       d}"|!|j                  z  dk(  r6| jJ                  j                  || j
                  | jN                        | _'        | j                   j                  j                  j                  d5d      s8| j                   j                  t        j                  k(  s|+t        |)      dz
  k(  rt        j                   }0n.t        j                  | j                   j                  |6      }0 |0       5  | j	                  ||,|*      }1d	d	d	       |j
                  rdt               sYt        j                  1      st        j                  |1      r-||d| j
                  j                  z   | j                  z
  z  z   }nC|j                  1j                  k7  r%t1        d7|j                   d8|1j                         ||1z   }| xj                  t        | j                  |,            z  c_        |-r| j                   j                  j                  d       |j                  |j                  dkD  rt5               r5|j                  r(| jD                  j                  |j                        }2nmt        j                   }3| j                  rdd9lm}4 |4}3 |3       5  | j                   j%                  |j'                         |j                        }2d	d	d	       | j                   j                  t        j                  k(  r.|j)                         }ty        |d:      r|j                         }n2}| jJ                  j+                  || j
                  | jN                        | _'        t        j                   }5| j                  rdd9lm}4 |4}5 |5       5  | jD                  j-                          d	d	d	       | jJ                  j/                  || j
                  | jN                        | _'        | j1                         }| j                   j2                  sUtQ        | j@                  t        j4                  j@                  j6                        s| j@                  j-                          |j                          | j
                  xj                  dz  c_U        ||!dz   | z  z   | j
                  _H        | jJ                  j9                  || j
                  | jN                        | _'        | j;                  ||||||||;       n7| jJ                  j=                  || j
                  | jN                        | _'        | jN                  j>                  s| jN                  j@                  st               rtC        jD                           n | jN                  j>                  s| jN                  j@                  s=t               rtC        jD                           n |!dk  rAt        j                  d<| j
                  j                   d=| d>       d| jN                  _        | jJ                  jG                  || j
                  | jN                        | _'        | j;                  ||||||||;       t,        jH                  | j                  j                   v rLt               r,tC        jJ                  tM        jN                                nt        j                  d?       | jN                  j@                  s
 n t        j                  d@       |jP                  r(| j
                  jR                  | jU                          | xj                  |j                         z  c_]        t        | j
                  j                  dA      }6| j                  |6z  }7tW        dB||| j
                  j                  C      }8| jY                          | j
                  jZ                  |8dD<   |7|8dE<   d| _        | j^                  ja                  |8       | jc                  |8       | je                  |      }9tg        |9| j
                  jR                  F      }:| j                  jh                  r| j
                  jR                  | j                  jj                  dk(  rp|:D ]k  };t        j                  jm                  |;| j
                  jR                        r:t        j                  dG|; dH       to        jp                  |;dI       m | jJ                  js                  || j
                  | jN                        | _'        | ju                          | jv                  -ty        | j                  | jz                  | j                          t}        | j
                  j                  |7|8      S c c}w # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   4xY w)Jz\Run the actual training loop: forward, backward, optimizer step, logging, and checkpointing.r   T)r  z)Currently training with a batch size of: z\Currently --debug underflow_overflow is not supported under DP. Please use DDP with torchrunfsdp_version   FNr	  )r   )gradient_checkpointing_kwargs)	recursivefp8r   )DummySchedulerr   r   generateload_module_strictz***** Running training *****  Num examples = ,z  Num Epochs = z(  Instantaneous batch size per device = zA  Training with DataParallel so batch size has been adjusted to: zE  Total train batch size (w. parallel, distributed & accumulation) = z   Gradient Accumulation steps = z  Total optimization steps = z#  Number of trainable parameters = )trainable_onlyzE  Continuing training from checkpoint, will skip to saved global_stepz!  Continuing training from epoch z'  Continuing training from global step z  Will skip the first z epochs then the first z batches in the first epoch.)r   r  r  r  r   )skip_scheduler	set_epochrV  main_input_name	input_idszTried to track the number of tokens seen, however the current model is not configured properly to know what item is the input. To fix this, add a `main_input_name` attribute to the model class you are using.non_paddingattention_maskpad_token_idz\Could not determine method to count non-padding tokens, falling back to counting all tokens.)r   dtypesync_each_batch)r   z0Calculated loss must be on the original device: z but device in use is )implicit_replicationrL  )r;  zXThere seems not to be a single sample in your epoch_iterator, stopping training at step zI! This is expected if you're using an IterableDataset and set num_steps (z.) higher than the number of available samples.zYou enabled PyTorch/XLA debug metrics but you don't have a TPU configured. Check your training configuration if this is unexpected.zU

Training completed. Do not forget to share your model on huggingface.co/models =)

gMbP?rc  num_samplesr  
total_flos
train_loss)r   best_model_checkpointzDeleting older checkpoint [z] due to args.save_total_limit)ignore_errors)r  free_memoryr4  r   r  r,  r3  r   r  r   r   r  maxn_gpur(   r   r#  r  r9  r0   get_total_train_batch_sizeset_initial_training_valuesr   UNDERFLOW_OVERFLOWr   r   r   r   r  r   r  r5  r  r$   r  r
  rA   r!  r   r+  r  r<   is_hyper_param_searchcompute_stepsr  gradient_checkpointing_enablerg  _wrap_modelr4   r`   r*   rr  r  rc  r}  rj  r  r  r  deepspeed_ulysses_dl_adapterr	  distr   register_fsdp_forward_methodr   r%   rY  _load_optimizer_and_scheduler_load_scalerr   r  rL   epochtimenum_input_tokens_seen)initial_num_input_tokens_seen_for_sessionr)  r[  isfiler  r\  rZ  rb   _load_callback_stateintglobal_stepignore_data_skipr  train_dataloaderinit_training_referencesr   r   r   _total_loss_scalar_globalstep_last_logged	zero_gradon_train_begineval_on_start	_evaluaterB  r   rg  on_epoch_beginr   _load_rng_statert  iterget_batch_samples#current_gradient_accumulation_steps	enumerategradient_state_set_sync_gradientsinclude_num_input_tokens_seenr]  r  r   ry  r!  int64gatherrL  on_step_beginplugin_kwargsr8  distributed_typer   	DEEPSPEED
contextlibnullcontextr  r   no_synctraining_steplogging_nan_inf_filterr   isnanisinfr0  floatfloating_point_opsmax_grad_normrZ  clip_master_gradsr|  &torch.distributed._tensor.experimentalr|  clip_grad_norm_r  get_global_grad_normon_pre_optimizer_stepstepon_optimizer_steprO  optimizer_step_was_skippedr  rJ  on_step_end_maybe_log_save_evaluateon_substep_endshould_epoch_stopshould_training_stopxm	mark_stepon_epoch_endTPU_METRICS_DEBUGmaster_printmetmetrics_reportrb  r  _load_best_modelro   
store_flosr  r-  r   rD  log_get_output_dirrn   r(  save_total_limitsamefileshutilrmtreeon_train_end_finish_current_pushr  r-   rV  r_   )<rE  r  r   rP  rQ  rS  original_bsr  total_train_batch_sizerh  num_update_steps_per_epochr  num_train_samplesepoch_basedlen_dataloaderrg  delay_optimizer_creationis_fsdp2rP  r   use_accelerator_preparerj  rJ  
start_timeepochs_trainedsteps_trained_in_current_epochattrtr_loss	grad_normr;  r  epoch_dataloadersteps_in_epochr  rng_to_syncepoch_iterator	remainderupdate_steptotal_updates_num_batchesbatch_samplesnum_items_in_batchiinputsdo_sync_stepru  input_tokenssync_contexttr_loss_step
_grad_normgrad_norm_contextr|  contexteffective_global_stepr  metricsrun_dircheckpoints_sorted
checkpoints<                                                               r   r]  zTrainer._inner_training_loop  s[    	$$&!+99))zz**d.D.DDt112%)ZZ" ,,"&))"G"GK<@<R<RVYZ[]a]f]f]l]lVm<mDII9/0@0@$))bfg<GDII9*.*@*@DJJ'@AWAW@XYZ446&&23CD "&!@!@!F ,,T3CE[\	
& ))TYY__<yy" !r  'tzz2#:#<#p@X@X#p\`\p\p  ''qWT5E5E5K5K5W5WYgij-kop-p',$ %% $D).D&$$0>tXa0b-DND-'!!#!!22<<~M Q[\^`oQp 


 ,1+<

(&*&<&<

# 	

  y1 &&JJ44SWSuSu4v  !3!34
 #(4::"5"t';'; &djjDADJ#&''N5,A+DJJ8H8HI##33u<!%!1!1!9!9$**!EDJ!!# #JJ((;d//@?C?O?O?W?W

DNND4E4E@<E4>4+< -1,<,<,D,DTZZQUQ_Q_,`)E4>(,(8(8(@(@T^^(\%t~!--55dnnEDN 	; T%%';TB>bmm{:r}}#//LLM]_de.33DJ+tzz:.		66tzz:N 

"!&D $$!//DN "-(()&&(>WefjfpfpWqSq )*d.B.B**+A4CUCUV 	**+AB01 	23'Q'789o&6q%9:;>tyy?d?def>ghi9900D4J4JJKK[\`\r\rst[uvw[\rst[uvw6t7W7W6XYZ3Ia=AB9:OPUfj:klm9nop

YY[
9=9Y9Y6)*& "-"''..GGLL/1CD3
 &44RWW\\BXZl5mnDJ/		4::F%%' !7!7;U!UVN((151G1GKe1f..$2R2RR.12.KK_`KK;N;KLMKKA$**BXBXAYZ[((,^,< =677SU ; 	FDD))4t1DE	F1A.

++D)=MuU ,,s4;;7"%'+zz'='=$"&	,,;;D$**dll[NN5"6tNL>+;< V	E/ "- $%^^d&F&FF 
  00??djjRVR^R^_DLDK &+A+M1A5d>S>S'9:JLj'k$9A=D"&K3q8(()?@'5 **51!"23N&)I)IIIA~ <<	K*d.N.NNQTD<<<R M =) \q BMR_bcRcBdd>>js484J4J>[fhlhshs4t11 <?};M8!*=!9 PIAvAID$(1H0P0P#PTU#U#uZ^abZbguYuL$$33GGUyy>>$F*1$**>OQ\*]*&8"NN!b  $yyFF-W#3v#=39:J3K3O3O3QL$($9$9$E(/0E0E~(V(,(=(=(J(J(V )/(?4CXCXCeCe(e&)ce %1 %+NN )G%& 4:/3J3P3P3RL/5o/F/L/L/N+0<<TYYM]M]ejepep+qL JJ<<@P@P@W@WXd@e@i@i@k@p@p@rr<",,-CD&+d>>>!C'+'<'<'J'J4QUQ[Q[]a]i]i'j ((77EEIIJ[]bc++<<@Y@YYM 2Q 66'1'='='0'8'89I9I9Q9QY^'_% ]'+'9'9%I['\] 33 6 8"[[6%++l:S #*Gq4::;Q;Q7QTXTpTp7p,q"q">>\-@-@@","RSZSaSaRbbx  zF  zM  zM  yN  !O#  #*L"8%%t/F/Fv/N)OO%#((77KKDQ  --9d>P>PST>T68TYY-1^^-M-MdN`N`-a
4>4J4J 1#'#5#5$k8L$5%6%8 !&151A1A1Q1Q(-(8(8(:(,(:(:2&J!&  $//@@OD]D]],1,F,F,H	#*9f#=090@I,6	'+'<'<'R'RSWY]YcYceieqeq'r","8"8--c&:G$Y 2 NN//12 (,'<'<'N'NtUYU_U_aeamam'n )-(?(?(A#//JJ#-d.?.?AYAYAkAk#l $ 1 1 6 6 8)

..!3.+0D1H3N+N

('+'<'<'H'Htzz[_[g[g'h55#%!!!0&*7 6 	 (,'<'<'K'KDRVR\R\^b^j^j'k
 ||559Z9Z13LLNaPd <<11T\\5V5V-/y\z ax

../ 0##,+-[]
 59100==dDJJPTP\P\]DL))E5%9Mziv *  ,,		?)+OOC$6$6$89NN_ ||00mV	p 	op&&4::+K+K+W!!# 	7<<>1 #DJJ$:$:E B,,/DD
)jj**	
 	 $

 5 5 * 44W=&&u--djj6V6V

 99  TZZ%E%E%QVZV_V_VpVptuVu0 B
ww''
DJJ4T4TUKK"=j\Ig hiMM*DAB
 ,,99$

DLLY 	!!# ##/tzz4+C+CTEUEUV4::11:wGGA n] ]B!& !&,2 2s=   $Ab:Ab%Ab	E8Ab	IAb"	bAbbAbb"Ab,r  r  c                 2   | j                  ||      \  }} |       5  |j                          t        | j                  d      r9t	        | j                  j                        r| j                  j                          | j                  |      }t               rkt        ||| j                  j                        }|j                         j                         j                  | j                  j                        cddd       S | j                         5  | j                  |||      }ddd       ~| j                  j                   :| j"                  j$                  | j                  j                   z  dk(  r
t'                i }| j                  j(                  t*        j,                  t*        j.                  fv r| j1                         |d<   | j                  j2                  dkD  rj5                         }| j6                  r|| j8                  | j:                  z  }| j<                  j>                  t@        jB                  k(  rd|d<    | j<                  jD                  fi | |j                         cddd       S # 1 sw Y   ZxY w# 1 sw Y   yxY w)	ak  
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to train.
            inputs (`dict[str, torch.Tensor | Any]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.

        Return:
            `torch.Tensor`: The tensor with training loss on this batch.
        rc  Nr  r   r;  r   Fscale_wrt_gas)# _prepare_context_parallel_inputsrc  r	  r  re  _prepare_inputsr   r   r   r  reduce_meandetachtor   compute_loss_context_managercompute_losstorch_empty_cache_stepsr,  r  r   r  rr   LOMOADALOMOrO  r  meanr
  r   r  r  r  r   r  backward)rE  r   r  r  
cp_contextloss_mblossr  s           r   r  zTrainer.training_stepx  s    2 "BB5&Q
F \ )	!KKMt~~w/HT^^=Q=Q4R$$&))&1F&(.ufdii>c>cd**,335889I9IJ)	! )	! 224 _((K](^_ 		11=JJ**TYY-N-NNRSS"$F yy>#6#68N8N"OO*.*A*A*C'yy"yy{ 226H6PVZVlVlVtdFFF 00O4M4MM*/'%D%%d5f5;;=S)	! )	!_ _)	! )	!s+   CJJJ (EJ J
	JJreturn_outputsc           	         t        | j                  dd      }|J|j                  dk(  r;|j                  r/| j                  j
                  rt        | j                  ||||      S | j                  | j                  d|v r|j                  d      }nd}| j                  ri }|||d<   i ||} |di |}| j                  -|t        j                  d       | j                  |||      }	n|| j                  j                  |      }
t        |
      r$|
j                  j                  j!                         n|
j!                         }|t#        j$                         v r| j                  ||d	      }	n| j                  ||      }	nzt'        |t(              rPd
|vrLt+        ddj-                  |j/                                ddj-                  |j/                                d      t'        |t(              r|d
   n|d   }	| j0                  j2                  rb| j                  s| j                  rJ|H|	| j0                  j4                  dk  r| j                  j6                  n| j0                  j4                  z  }	|r|	|fS |	S )a  
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Args:
            model (`nn.Module`):
                The model to compute the loss for.
            inputs (`dict[str, torch.Tensor | Any]`):
                The input data for the model.
            return_outputs (`bool`, *optional*, defaults to `False`):
                Whether to return the model outputs along with the loss.
            num_items_in_batch (Optional[torch.Tensor], *optional*):
                The number of items in the batch. If num_items_in_batch is not passed,

        Returns:
            The loss of the model along with its output if return_outputs was set to True

        Subclass and override for custom behavior. If you are not using `num_items_in_batch` when computing your loss,
        make sure to overwrite `self.model_accepts_loss_kwargs` to `False`. Otherwise, the loss calculating might be slightly inaccurate when performing gradient accumulation.
        r   Nr   labelsr  z|Trainer: `compute_loss_func` is defined but `labels=None`. Your custom loss function will still be called with labels=None. r  T)shift_labelsr  zJThe model did not return a loss from the inputs, only the following keys: ro  z,. For reference, the inputs it received are .r   r   r  )r   r  r  r  r   trainingr&   r  r   r  r
  r   r]  r4   r`   r  	_get_namer5   r   r  dictr   r  r  r   average_tokens_across_devicesr  num_processes)rE  r   r  r  r  rJ  r  r  outputsr  rH  r  s               r   r  zTrainer.compute_loss  s{   4 T%%';TB>bmm{:r}}QUQ[Q[QdQd,T-=-=ufn^`aa+t/E/E/QW_ciWiZZ)FF))F!-/A+,))&)F/&/ !!-~X ))#5 * D "..;;EBO "/2  **00::<$..0 
 >EEGG**7F*N**7F;'4(V7-B `xx/00\]`]e]eflfqfqfs]t\uuvx 
 '1$&?76?WQZD II33//43I3I".diioo6JD$$22PTPYPYP_P__D"0g:d:rp  c                     t        j                         }| j                         }t        |t         j                        s|j                  |       |S )zF
        A helper wrapper to group together context managers.
        )r  	ExitStackautocast_smart_context_managerr  r  enter_context)rE  	ctx_stackautocast_ctxs      r   r
  z$Trainer.compute_loss_context_manager  sD     ((*	::<,
(>(>?##L1rp  cache_enabledc                 *    t        j                         S )z
        A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
        arguments, depending on the situation. We rely on accelerate for autocast, hence we do nothing here.
        )r  r  )rE  r%  s     r   r!  z&Trainer.autocast_smart_context_manager  s    
 %%''rp  r  r  r  r  r;  c	                    | j                   j                  rL| j                  j                  | j                  kD  r(t               rt        j                          i }	t        || j                  j                        j                         j                         }
||z  }|
| j                  j                  | j                  z
  z  |	d<   |/t        |t        j                        r|j                         n||	d<   |||	d<   n| j!                         |	d<   | xj"                  |
z  c_        | j                  j                  | _        | j%                          | j'                  |	|       d}| j                   j(                  r]| j+                  ||      }| j-                  ||      }| j                  j.                  t0        j2                  k(  r|| j                   _        | j                   j4                  rS| j7                  ||       | j8                  j;                  | j                  | j                  | j                         | _         yy)z\Log metrics, run evaluation, and save checkpoints if the current training state requires it.r  Nr  r;  )r  rQ  )r+  
should_logr,  r  r  r   r  r  rP   r   r   r  rL  r  r   TensorrO  r  r  r  should_evaluater  _determine_best_metricr`  r]   ra  r(  _save_checkpointr!  on_save)rE  r  r  r   rQ  r  rS  r  r;  logstr_loss_scalarr  is_new_best_metrics                r   r  z Trainer._maybe_log_save_evaluate%  s    <<""tzz'='=@\@\'\%'%'D +7DII4K4KLQQSXXZN wG)TZZ-C-CdFbFb-bcDL$8B9ell8[INN$4aj[!((5_%(,(?(?(A_%##~5#+/::+A+AD(OOHHT:&<<''nnU,@AG!%!<!<WTY!<!Zyy&&,*;*;;+=(<<##!!%/0088DJJPTP\P\]DL $rp  r  r  r   c                     g }t        |      D ]  }	 |j                  t        |              | j	                  ||      }||fS # t        $ r Y  "w xY w)z
        Collects a specified number of batches from the epoch iterator and optionally counts the number of items in the batches to properly scale the loss.
        )rB  appendnextStopIteration_get_num_items_in_batch)rE  r  r  r   r  r  r  s          r   r  zTrainer.get_batch_samplesX  sj     {# 	A$$T.%9:	 "99-P000	 ! s   A	AAr  c                 b   d}t        |      dkD  xr% d|d   v xr | j                  xs | j                  du}|r	 t        d |D              }|I| j                  j                  rR| j                  j                  dkD  rk| j                  j                  |j                  |            j                         }n2| j                  j                  dkD  r|| j                  j                  z  }t        j                  |      r|j                  |      }| j                  j                  dkD  rH|j                         dk(  r5|j!                  d      j#                  | j                  j                  d      }t%        | j                  dd      x}r||j&                  z  }|S # t        t
        f$ r Y `w xY w)a  
        Counts the number of items in the batches to properly scale the loss.
        Args:
            batch_samples (`list`): List of batches
            device (`torch.device`): The device on which the number of items in the batch should be.
        Returns:
            None if the number of items in the batch doesn't need to be computed else the number of items in the batch
        Nr   r  c              3   `   K   | ]&  }|d    j                  d      j                          ( yw)r  N)ner  )r   batchs     r   r   z2Trainer._get_num_items_in_batch.<locals>.<genexpr>  s(     (ee%/*<*<T*B)G)G)I(es   ,.r   rs  r   )r   r
  r   r  rf  r  r   r  r  r  r  r	  r  r   rK  dim	unsqueezeexpandr   non_data_parallel_size)rE  r  r   r  count_num_items_in_batchrJ  s         r   r5  zTrainer._get_num_items_in_batchi  s    "" M!,,
 .. 6 ))5 	! $%((eWd(e%e" )yy6699''!+)-)9)9)@)@ASAVAVW]A^)_)c)c)e&1$ &8499??%J"12%7%:%:6%B"99??Q&+=+A+A+Cq+H);)E)Ea)H)O)OPTPYPYP_P_ac)d& !1!13GNN2N);r?X?X)X&!!- ~. s   F F.-F.rj  c           
          t        |t              rB t        |      |j                         D ci c]  \  }}| j	                  |       c}}      S t        |t
        t        f      r t        |       fd|D              S t        |t        j                        rd j                  j                  i} j                  rst        j                  |      st        j                  |      rI|j                  d j                  j                   j"                  j$                  j'                         i        |j(                  di |S |S c c}}w )z|
        Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
        c              3   @   K   | ]  }j                  |        y wr   )_prepare_input)r   vrE  s     r   r   z)Trainer._prepare_input.<locals>.<genexpr>  s     Cd11!4C   r   rz  r  )r  r   r  itemsrB  tupler  r   r)  r   r   r   is_floating_point
is_complexrz  r  r,  rs  hf_ds_configrz  r	  )rE  rj  r   rC  r  s   `    r   rB  zTrainer._prepare_input  s    dG$4:TZZ\RTQq$"5"5a"88RSSudm,4:CdCCCell+		 0 01F((e.E.Ed.KuO_O_`dOe w(8(8(>(>(O(O(\(\(b(b(def477$V$$ Ss   E	
c                     | j                  |      }t        |      dk(  r(t        ddj                  | j                         d      |S )z
        Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
        handling potential state.
        r   zThe batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: ro  r  )rB  r   r   r  r2  )rE  r  s     r   r  zTrainer._prepare_inputs  sY    
 $$V,v;!IILRVRiRiIjHkkln 
 rp  c                    t        | j                  dd      | j                  j                  j                  r| j                  j                  j                  dk(  rt        |d      r<|j                  j                  dk7  r#t        d|j                  j                   d      d|vr\t        j                  d	       d
|v rCd}t        j                  j                  |d
   d|      }|ddddf   j                         |d<   d|vrpt        j                  d       t        j                   |d   j#                  d      |d   j$                        j'                  |d   j#                  d      d      |d<   g }g }d|v r%|j)                  |d          |j)                  d       d
|v r%|j)                  |d
          |j)                  d       d|v r%|j)                  |d          |j)                  d       d|v rwt        | dd      s"|d   }t+        |      st        d      d| _        | j,                  r<|d   }|j/                         dk(  r#|j)                  |       |j)                  d       n	 d|v r*|d   %|j)                  |d          |j)                  d       t1        | j                  j2                  ||t5        |            |fS t6        j8                  |fS )as  
        Prepare inputs for context parallelism by setting up buffers and validation.

        Args:
            model: The model being trained
            inputs: Input tensors to prepare

        Returns:
            tuple: (context_manager, prepared_inputs) where context_manager is either
                   the context parallelism wrapper or a no-op context
        r   Nr   r   sdpazIContext parallelism is supported only with SDPA attention, you are using r  r  z7Shift labels not found in the inputs, shifting manuallyr  r8  )r   r   )valuer   position_idsz9Position IDs not found in the inputs, generating manuallyrv  rq  r   rs  rx  _attn_mask_causal_checkedFzContext parallelism only supports causal attention masks. The provided attention_mask is not causal. Please ensure your data uses causal masking (lower triangular) or remove the attention_mask to use the model's default causal masking.Trf  )buffersbuffer_seq_dimsno_restore_buffers)r   r  r   
cp_enabled
cp_backendr	  r   _attn_implementationr   r   warning_oncer   
functionalpad
contiguousr   arangesizer   r=  r2  rN   rO  r;  r   maybe_context_parallelr   r  r  )rE  r   r  _ignore_indexr  rP  rQ  rx  s           r   r  z(Trainer._prepare_context_parallel_inputs  s    D$$&:DAM  33>>22==H5(+||88FB(ghmhtht  iJ  iJ  hK  KL  M  "/''(ab6)(,!#!2!26(3CVS`!2!a17121I1I1K~. V+##$_`).;',,Q/{8K8R8R*&,11!4b9 ~& G Of$vk23&&q)6!vh/0&&q)'vn56&&q)6)t%@%H &,,<%=N3NC(f  6:D211%+,<%=N%))+q0~6'..q1 'F>,B,Nvn56&&q)  77 /#&w<	
   %%v--rp  r  c                    |j                   }|dk  }t        |      rt        |      nd}| j                         }|dkD  r|||z  }|Yt	        ||j
                  z  t        ||j
                  z  dkD        z   d      }|r"t        j                  |j                  |z        }|r{| j                  |      }	|j                   dkD  r|z  t        ||z  dkD        z   }
||z  }nt        j                  |j                        }
| j                  |      |j                  z  }nX|j                   dkD  r1t        j                  }
|}||j                   z  }	|j                   |z  }nt        d|j                          |
|	||||fS )a  
        Calculates and returns the following values:
        - `num_train_epochs`
        - `num_update_steps_per_epoch`
        - `num_examples`
        - `num_train_samples`
        - `epoch_based`
        - `len_dataloader`
        - `max_steps`
        r   Nr   zYargs.max_steps must be set to a positive value if dataloader does not have a length, was )rg  rh   r   get_sp_sizer  r  r  mathceilrh  r  sysmaxsizer   )rE  r   r  r  rg  r  r  sp_sizer  r  rh  r  s               r   r  z#Trainer.set_initial_training_values	  s    NN	!m,6z,BZ ""$Q;>5+g5N %),$"B"BBnt'G'GG!KLM*&  IId&;&;>X&XY	 ,,Z8L~~!#,0J#JS ::Q>N $ 
 %.0F$F!#'99T-B-B#C $($5$5j$ADDYDY$Y!^^a"{{)2&1DNNBL $1G GNN#% 
 &
 	
rp  c                     |j                   | j                         z  | j                         z  | j                         z  }| j                  |j
                  z  |z  S )aC  Calculates total batch size (micro_batch * grad_accum * dp_world_size).

        Accounts for all parallelism dimensions: TP, CP, and SP.

        Formula: dp_world_size = world_size // (tp_size * cp_size * sp_size)

        Where:
        - TP (Tensor Parallelism): Model layers split across GPUs
        - CP (Context Parallelism): Sequences split using Ring Attention (FSDP2)
        - SP (Sequence Parallelism): Sequences split using ALST/Ulysses (DeepSpeed)

        All dimensions are separate and multiplicative: world_size = dp_size * tp_size * cp_size * sp_size
        )r  get_tp_sizeget_cp_sizer_  r4  r  )rE  r   dp_world_sizes      r   r  z"Trainer.get_total_train_batch_size`	  sT     4+;+;+==AQAQASSW[WgWgWii%%(H(HH=XXrp  c                 v    t        | j                  dd      y| j                  j                  }|j                  S )zGet the sequence parallel sizer   Nr   )r   r  r   rd  rE  rJ  s     r   r_  zTrainer.get_sp_sizer	  6    4##%94@H!!44B::rp  c                 v    t        | j                  dd      y| j                  j                  }|j                  S )zGet the context parallel sizer   Nr   )r   r  r   cp_sizerj  s     r   rg  zTrainer.get_cp_sizez	  rk  rp  c                     t        | j                  dd      x}|S | j                  rEt        | j                  dd      x}r,|j                  j                  di       j                  dd      S y)zGGet the tensor parallel size from either the model or DeepSpeed config._tp_sizeNr  tensor_parallelautotp_sizer   )r   r   r   r   r   r8  )rE  model_tpdeepspeed_configs      r   rf  zTrainer.get_tp_size	  ss      

J==HJO $$gdiiQfhl>m*m*:*m#**../@"EII-YZ[[ rp  r  c                    t               ret        | j                  t        j                  j
                        r| j                  S t        j
                  || j                  j                        S | j                  j                  |d      |ur|S | j                  j                  dkD  r"t        |dd      st        j                  |      }|s|S | j                  r*t        || j                  | j                         x| _        }|S t#               r@t        j$                  j'                  |t)        t+        j,                  d            g      }|S | j                  j.                  t0        j2                  k(  rt5               r|S i }| j                  j6                  | j                  j6                  |d<   n&t        |t8              r|j:                   |d<   nd	|d<   | j                  j<                  | j                  j<                  |d
<   | j                  j>                  | j                  j>                  |d<   tA        di || j                  _!        |S )zMWrap `model` for distributed training if needed (DDP, FSDP, SageMaker, etc.).)backward_passes_per_stepFkeep_torch_compiler   is_loaded_in_8bitSMDATAPARALLEL_LOCAL_RANK)
device_idsfind_unused_parametersTbucket_cap_mbbroadcast_buffersr  )"r   r  r  r[  r   DistributedModelr   r  r  r4   r  r   r   DataParallelr   r1   r9  r   parallelDistributedDataParallelr  r)  getenvr   rs   r   r   ddp_find_unused_parametersr3   is_gradient_checkpointingddp_bucket_cap_mbddp_broadcast_buffersr   ddp_handler)rE  r   r  r  r  s        r   r  zTrainer._wrap_model	  s   "$$,,cii.H.HI)))''		HmHmnn ((5(IQVVL 99??Qwu6I5'QOOE*E L ##!4UDIItGbGb!ccDJ4 3 %&KK773ryy1L'M#N"O 8 E0 + YY$$(@(@@,.Fyy33?37993W3W/0E?3 8=7V7V3V/037/0yy**6*.))*E*E'yy..:.2ii.M.M*++H+R6+RD(rp  ignore_keysmetric_key_prefixc                    |du}|r|n| j                   }t        |t              rJi }|j                         D ]3  \  }}| j	                  |r|n||| d|       }|j                  |       5 |S | j                  j                          | j                  |      }	| j                  rt        |	      }	t        j                         }
| j                  |	d| j                  dnd||      }| j                  j                  | j                  j                   z  }| d|j"                  v r|
|j"                  | d   z  }
|j"                  j                  t%        ||
|j&                  t)        j*                  |j&                  |z                     | j-                  |j"                         t.        j0                  | j                  j2                  v r't5        j6                  t9        j:                                | j<                  j?                  | j                  | j@                  | jB                  |j"                        | _!        | j                  jE                  |j"                         |j"                  S )	a  
        Run evaluation and returns metrics.

        The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
        (pass it to the init `compute_metrics` argument).

        You can also subclass and override this method to inject custom behavior.

        Args:
            eval_dataset (`Dataset` | dict[str, `Dataset`], *optional*):
                Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns
                not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will
                evaluate on each dataset, prepending the dictionary key to the metric name. Datasets must implement the
                `__len__` method.

                <Tip>

                If you pass a dictionary with names of datasets as keys and datasets as values, evaluate will run
                separate evaluations on each dataset. This can be useful to monitor how training affects other
                datasets or simply to get a more fine-grained evaluation.
                When used with `load_best_model_at_end`, make sure `metric_for_best_model` references exactly one
                of the datasets. If you, for example, pass in `{"data1": data1, "data2": data2}` for two datasets
                `data1` and `data2`, you could specify `metric_for_best_model="eval_data1_loss"` for using the
                loss on `data1` and `metric_for_best_model="eval_data2_loss"` for the loss on `data2`.

                </Tip>

            ignore_keys (`list[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.
            metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
                An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
                "eval_bleu" if the prefix is "eval" (default)

        Returns:
            A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
            dictionary also contains the epoch number which comes from the training state.
        Nr  )r   r  r  r  T)r  prediction_loss_onlyr  r  _model_preparation_timer}  )#r   r  r  rE  evaluaterz  r   r   r  r9  r0   r  evaluation_loopr   r   r  r  r  ro   r~  r`  ra  r  r   r  r#  r  r  r  r  r!  on_evaluater,  r+  rD  )rE  r   r  r  overrider  eval_dataset_name_eval_datasetdataset_metricseval_dataloaderr  outputtotal_batch_sizes                r   r  zTrainer.evaluate	  s+   Z  t+'/|T5F5FlD)G4@4F4F4H 00!="&--2:@Q +):(;1=N<O&P #0 #
 /0 N 	""$22<@&&1/BOYY[
%%$ *.)=)=)E4#/ & 
  9944tyy7K7KK  78FNNJ&..,=+>>U)VWWJ!"..))F$6$69I$IJ		
 	 ((DIIOO;OOC..01,,88DJJPTP\P\^d^l^lm44V^^D~~rp  r  c           	         | j                   }||n|j                  }| j                  r| j                  t	        | dd      \  }}| j                  | j                  d|      }t        | j                  j                        dk(  r|| j                  u rt        j                         }	| j                  s;| j                  rJ| j                  j                  dk7  r1| j                   j                  s| j                  j                  |      n| j                  j                  |d      }t!        t        j                         |	z
  d	      | _        | j                  r|| _        || j                  ur|| _        | j                  r| j$                  | _        | j&                  so|j(                  r,|j+                  t,        j.                  |j0                  
      }n7|j2                  r+|j+                  t,        j4                  |j0                  
      }| j                   j6                  }
t8        j;                  d| d       t=        |      r(t8        j;                  d| j?                  |              nt8        j;                  d       t8        j;                  d|
        tA        |d      r%tC        |jD                        r|jE                          tA        | jF                  d      r9tC        | jF                  jD                        r| jF                  jE                          || jH                  _%        tM        |dd      }tO        | j                   jP                  d      }tO        | j                   jP                  d      }tO        | j                   jP                  d      }tO        | j                   jP                  d      }d}i }d}tS        |      D ]j  \  }}tU        |      }|	||z  }|
|}
| jW                  ||||      \  }}}tM        | j                  dd      }d|jX                  v r| j[                  ||         nd}t]               rt_        j`                          |1| jc                  |je                  |
            }|jg                  |       |[| j                  ji                  |dd      }| jc                  |      }| j                   jj                  r|dk(  r|jg                  |       || j                  ji                  |dd      }|y| j                  ji                  |dd      }| jl                  | jm                  ||      }| jc                  |      }| j                   jj                  r|dk(  r|jg                  |       |=| jc                  |      }| j                   jj                  r|dk(  r|jg                  |       | jH                  jo                  || jp                  | jr                        | _9        | j                   jj                  r| jt                  o|m|k| j                  jv                  jx                  }i }d|jX                  v r|nd|d<   d|jX                  v r|nd|d<   | ju                  t{        d%||d||      }~~~~t,        j|                  j                          |j                  |dz   |j                  z  dk(  s	|j                          |j                          |j                          |j                          ~~~~t,        j|                  j                          m | j                  j                  | _1        |j                         }|j                         }|j                         }|j                         }t=        |      rt        |      }nLt        |t              rtM        |dd      dkD  r|j>                  }nt=        |      r| j?                  |      }n|}|dk(  r|dkD  r|}| jt                  b|`|^| j                   jj                  sHd|jX                  v r|nd|d<   d|jX                  v r|nd|d<   | ju                  t{        d%||d|      }n|i }t        |      }t        |t              r:|r8t        j                  |      j                         j                         || d <   n>t        |t        j                        r$|j                         j                         || d <   tA        | d!      r| j"                  || d"<   t        |j                               D ]0  }|j                  | d#      r|j                  |      || d#| <   2 t        ||||$      S )&z
        Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.

        Works both with or without labels.
        Nr   T)r  	inferenceF)r  r  ri  )evaluation_mode   )rz  r   z
***** Running z *****rn  z  Num examples: Unknownz  Batch size = r  r  r8  )padding_indexr  ru  rv  r  r   )r;  	pad_index
Predictionr  losses)predictionsr  )rU  r  _lossmodel_preparation_timer  r  )r  r  r  r~  r  )Qr   r  r   r   r$   r  r   r   r  _modelsr  r  rr  torch_compiler  prepare_modelroundr  r  r-  r  r	  r   float16r   r  bfloat16r  r   r   rh   r  r	  re  r  r  r!  r  r   rF   eval_do_concat_batchesr  rK   prediction_stepinclude_for_metricsrB  r   r  r  r  repeataddpad_across_processesr^  r   on_prediction_stepr,  r+  r   r  end_of_dataloaderrX   cudaempty_cacheeval_accumulation_stepsto_cpu_and_numpyr  
get_arraysr  rG   rd   r  r@  concatenater  rL  ndarrayr  
startswithr  rW   )rE  r  r  r  r  r  r   r  r   r  r  r   
all_losses	all_preds
all_labels
all_inputsr  eval_set_kwargsobserved_num_examplesr  r  observed_batch_sizer  logitsr  ru  inputs_decodeis_last_stepbatch_kwargsr~  keys                                  r   r  zTrainer.evaluation_loop*
  s    yy7K7W3]a]v]v $$)?!$1MDAq  e
 St''(A-%4::2EJ ,,((T-=-=-M-MQV-V_c_h_h_v_v   ((/ %%33E43P	  +0		j0H!*LD'##"
 DJJ&%*" ((!%!3!3 ""u}}T[[I$$u~~dkkJYY..
&{m6:;j!KK+D,=,=j,I+JKLKK12oj\235&!huzz&:JJL4>>6*x8K8K/LNN!0:-z9d; 'tyy'G'GW[\
%dii&F&FVZ[	&tyy'G'GW[\
&tyy'G'GW[\
 !" &j1 C	)LD&"1&"9".%)<<%%!4J &*%9%9%I]kv%9%w"FFF%djj2C[QO@HDLdLd@d##F?$;<jn  &' !--fmmJ.GHv&( $ 0 0 E EmYZfj E k $ 4 4] Cyy33{l7RNN=1!))>>v1X\>]!))>>v1X\>]55A!??OF--f5yy33{l7RMM&)!--f5yy33{l7RNN6*00CCD$**VZVbVbcDLyy++''38JvOa#'#3#3#B#B#T#TL#%L7=AYAY7YV_cL*7?4C[C[7[VaeL*"22&\6V\|\'3 3 G
 FFF

&&( --9tax4KgKg>gkl>l++-**,++-++-FFF

&&(GC	)L  $//BB  **,
((*	**,
**,
 l#l+K &:;Vdfg@hkl@l&33K*%"//
;3! 5 9/K   ,%&II006<@X@X6X
^bOH%6>$BZBZ6Z
`dOH%**^9
^o^G _G (0j$'J35>>*3M3R3R3T3Y3Y3[G()/0
BJJ/3=??3D3I3I3KG()/0412EIE`E`G())@AB ' 	IC>>%6$7q"9:8?C8H,-Qse45	I )zSZhsttrp  c                 ,   | j                   j                          | j                  |      }t        j                         }| j	                  |d||      }| j
                  j                  | j
                  j                  z  }| d|j                  v r||j                  | d   z  }|j                  j                  t        |||j                  t        j                  |j                  |z                     | j                  j                  | j
                  | j                   | j"                  |j                        | _        | j                   j%                  |j                         t'        |j(                  |j*                  |j                        S )a  
        Run prediction and returns predictions and potential metrics.

        Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
        will also return metrics, like in `evaluate()`.

        Args:
            test_dataset (`Dataset`):
                Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the
                `model.forward()` method are automatically removed. Has to implement the method `__len__`
            ignore_keys (`list[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.
            metric_key_prefix (`str`, *optional*, defaults to `"test"`):
                An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
                "test_bleu" if the prefix is "test" (default)

        <Tip>

        If your predictions or labels have different sequence length (for instance because you're doing dynamic padding
        in a token classification task) the predictions will be padded (on the right) to allow for concatenation into
        one array. The padding index is -100.

        </Tip>

        Returns: *NamedTuple* A namedtuple with the following keys:

            - predictions (`np.ndarray`): The predictions on `test_dataset`.
            - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
            - metrics (`dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
              labels).
        r  )r  r  r  r  r}  )r  r  r  )r   r   r  r  r  r   r  r  r  rz  ro   r~  r`  ra  r!  
on_predictr,  r+  rD  r[   r  r  )rE  r  r  r  test_dataloaderr  r  r  s           r   predictzTrainer.predict
  sR   H 	""$22<@YY[
%%;bs & 
  9944tyy7K7KK  78FNNJ&..,=+>>U)VWWJ!"..))F$6$69I$IJ		
 ,,77		4::t||]c]k]kl44V^^DF,>,>&JZJZdjdrdrssrp  c                    t        | j                        dk(  rdnt        fd| j                  D              }j                  d      }|| j                  }t        | j                        dk(  xr |}| j                        ;t        | j                  d      r#t        | j                  j                  ddg      ng |s|r;t        t        fd	| j                  D                    }t        |      d
k(  r|d   }nd}t        j                         5  t               rt        |      }	|s|r{t!        |	t"              r(|	d   }
t        fd|	j%                         D              }n
|	d   }
|	d
d }|
j'                         j)                         j+                         }t-        |      }nOd}t!        |	t"              r#t        fd|	j%                         D              }n|	}t-        |      }n|s|r| j/                         5  | j1                  g| j2                  j4                        }| j7                  |d|      \  }}ddd       j)                         j9                         }t!        t"              r#t        fd|j%                         D              }n^|d
d }nXd}| j/                         5   |di }ddd       t!        t"              r#t        fd|j%                         D              }n|}ddd       |rddfS t              }t        |      d
k(  r|d   }||fS # 1 sw Y   xY w# 1 sw Y   |xY w# 1 sw Y   KxY w)a   
        Perform an evaluation step on `model` using `inputs`.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to evaluate.
            inputs (`dict[str, torch.Tensor | Any]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (`bool`):
                Whether or not to return the loss only.
            ignore_keys (`list[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.

        Return:
            tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
            logits and labels (each being optional).
        r   Fc              3   D   K   | ]  }j                  |      d u  y wr   r8  )r   r   r  s     r   r   z*Trainer.prediction_step.<locals>.<genexpr>U  s      Av`a&**Q-W[B[Avs    return_lossNr   keys_to_ignore_at_inferencepast_key_valuesc              3   @   K   | ]  }j                  |        y wr   r  )r   namer  s     r   r   z*Trainer.prediction_step.<locals>.<genexpr>g  s     (WdD)9(WrD  r   r  c              3   :   K   | ]  \  }}|d gz   vs|  ywr  Nr  r   r   rC  r  s      r   r   z*Trainer.prediction_step.<locals>.<genexpr>s  s&     )o1qXcgmfnXnOn!)o   c              3   2   K   | ]  \  }}|vs|  y wr   r  r  s      r   r   z*Trainer.prediction_step.<locals>.<genexpr>}  s     )d1qXcOc!)d   T)r  r  c              3   :   K   | ]  \  }}|d gz   vs|  ywr  r  r  s      r   r   z*Trainer.prediction_step.<locals>.<genexpr>  s&     &hTQQ\`f_gQgHgq&hr  c              3   2   K   | ]  \  }}|vs|  y wr   r  r  s      r   r   z*Trainer.prediction_step.<locals>.<genexpr>  s     &]TQQ\H\q&]r  r  )r   r  allr8  r   r  r	  r   r   r   rO   rF  r   no_gradr   r   r  r  rE  r  r  r   r   r
  r5  r   r   r  r  )rE  r   r  r  r  
has_labelsr  loss_without_labelsr  raw_outputsr  	logits_mbr  r  r  r  s     ` `           r   r  zTrainer.prediction_step7  s   < "$"2"23q8UcAveieueuAv>v
 jj/..K!$"2"23q8H[%%f-tzz8,%djj&7&79VYjXkl  ,"5(WdFVFV(W#WXF6{aF]]_ (	)&(.uf=!4!+t4"-f"5$))o8I8I8K)o$o	"-a.$/O	"..0779==?D.y9FD!+t4$))d8I8I8K)d$d	$/	.y9F!4::< -1-I-I6(TXT]T]TdTd-e*(,(9(9!6$Se ): )g
  ;;=--/D!'40!&&hW]]_&h!h!(D::< 2"'/&/2!'40!&&]W]]_&]!]!(Q(	)T  $%%v&v;!AYFff%%7 2 2G(	) (	)s?   C.MA L0A1M8	L<=M0L9	5M<M	MMrr  c           
         | j                  |      }| j                  || j                  j                  |       t	        | j
                  t        j                  j
                  j                        rO|sM| j                  j                  }|j                  d      sd| }	 | j
                  j                  ||          |S |S # t        $ r2}t        d| dt        |j                                d| d      |d}~ww xY w)zJRun evaluation, report to HP search, and step ReduceLROnPlateau if needed.r  eval_9The `metric_for_best_model` training argument is set to 'W', which is not found in the evaluation metrics. The available evaluation metrics are: zX. Please ensure that the `compute_metrics` function returns a dictionary that includes 'zM' or consider changing the `metric_for_best_model` via the TrainingArguments.N)r  _report_to_hp_searchr,  r  r  r  r   r  rJ  r   rc  r  r  KeyErrorr  r  )rE  rQ  rS  rr  r  metric_to_checkexcs          r   r  zTrainer._evaluate  s    --,@-A!!%)?)?I d'')A)A)S)ST]k"ii==O"--g6$)/):";	!!&&w'?@ w  OP_O` a==A',,.=Q<R Smm|l} ~_` s   !C 	C>-C99C>c                 X   | j                   || j                   t        j                  k(  r|j                  }n| j                   t        j                  k(  r-ddl}|j                  j                         j                         }n7| j                   t        j                  k(  rddl
}|j                  j                  }| j                  | j                  |      nd }t        j                  j!                  | j"                  j$                  |      }|S | j"                  j$                  }|S )zIReturn the output directory, accounting for hyperparameter search trials.Nr   zrun-)r/  rY   OPTUNAnumberRAYray.tunetuneget_contextget_trial_idWANDBwandbrunidr.  r)  r[  r  r   r   )rE  rQ  run_idrayr  run_namer  s          r   r  zTrainer._get_output_dir  s    !!-%2C%%)?)??''?+>+>>--/<<>''?+@+@@.2ll.Ft||E*dSYRZOHggll499#7#7BG  ii**Grp  c                 N   t          d| j                  j                   }| j                  || j	                          | j                  |      }t        j                  j                  ||      }| j                  |d       | j                  j                  t        j                  t        j                  fv r| j                  j                  rt!               rt#        j$                  d       nZ| j                  j&                  t(        j*                  k(  rt-        j.                          nt1               rt3        j.                          t          d| j                  j                   }t        j                  j                  ||      }t        j                  j5                  |      r|| j                  _        | j                  j8                  s3| j;                  |       | j=                  |       | j?                  |       | j                  j@                  r| jB                  jD                  | jF                  gz   D cg c]  }tI        |tJ              s| c}D ]  }|jL                  jN                  }	|j                         }
tI        | j                  jP                  |	   tR              r)| j                  jP                  |	   jU                  |
       y|
| j                  jP                  |	<    | j                  jW                  t        j                  j                  |tX                     | j                  jZ                  r| j]                  |       | j                  j@                  r8t_        || j                  j`                  | j                  j6                  d       yyc c}w )zSSave model checkpoint, optimizer, scheduler, scaler, RNG states, and trainer state.-N)rQ  T_internal_callrb  )r   r  r  	use_mtime)1rU   r,  r  r/  r  r  r)  r[  r  
save_modelr   r`  r]   STEPSEPOCHbest_global_stepr   r  
rendezvousr   rs   r   r  barrierr   r[  existsr  r  _save_optimizer_and_scheduler_save_scaler_save_rng_stater(  r!  r   r+  r  r<   r   r   r   r  r2  save_to_jsonr\  r&  _push_from_checkpointrk   r  )rE  r   rQ  checkpoint_folderr  r   best_checkpoint_folderbest_checkpoint_dirrP  cb_namecb_states              r   r,  zTrainer._save_checkpoint  s     55Qtzz7M7M6NO!!)emOO&&U&3WW\\'+<=

4899""|'9'9<;M;M&NNSWS]S]SnSn &'67((L,D,DD(*(='>a

@[@[?\%]""$'',,w8N"Oww~~123F

0yy((..z:j)  , 99   "22<<~MQ[\^`oQp F ,,//88:djj;;GDdKJJ11':AA(K=EDJJ11':F JJ##BGGLL=O$PQ99  &&z2 99  "!%!;!;&*jj&F&F	 !s   >N"N"r  c           
      6   d}| j                   j                  D| j                   j                  }|j                  d      sd| }	 ||   }| j                   j                  rt        j                  nt        j                  }| j                  j                  ;| j                   j                  rt        d      n
t        d      | j                  _         ||| j                  j                        rn|| j                  _        | j                   j                  t        j                  t        j                   fv r%| j                  j"                  | j                  _        d	}|S # t        $ r/}t        d| dt	        |j                                d      |d}~ww xY w)
z
        Determine if the model should be saved based on the evaluation metrics.

        Returns:
            bool: True if a new best metric was found, else False
        FNr  r  r  zJ. Consider changing the `metric_for_best_model` via the TrainingArguments.z-infinfT)r   rc  r  r  r  r  greater_is_betterr@  greaterlessr,  best_metricr  r`  r]   r  r  r  r  )rE  r  rQ  r0  r  metric_valuer  operators           r   r+  zTrainer._determine_best_metric  s[    #99**6"ii==O"--g6$)/):";&7 &*YY%@%@rzzbggHzz%%-:>)):U:Uv[`af[g

&djj&<&<=)5

&99**|/A/A<CUCU.VV26**2H2HDJJ/%)"!!'  OP_O` a==A',,.=Q<R  S]^ s   E   	F)*FFr   c           	         t        j                         t        j                   j                         t        j                   j                         d}t        j                  j                         r~| j                  j                  t        j                  k(  r,t        j                  j                   j                         |d<   n+t        j                  j                   j                         |d<   t               rt        j
                         |d<   t               r~| j                  j                  t        j                  k(  r,t        j                   j                   j                         |d<   n+t        j                   j                   j                         |d<   t#               r~| j                  j                  t        j                  k(  r,t        j$                  j                   j                         |d<   n+t        j$                  j                   j                         |d<   t'               r~| j                  j                  t        j                  k(  r,t        j(                  j                   j                         |d<   n+t        j(                  j                   j                         |d<   t+               rj| j                  j                  t        j                  k(  r"t        j,                  j                         |d<   n!t        j,                  j                         |d<   t/        j0                  |d	       | j                  j2                  d
k  r5t	        j4                  |t.        j6                  j9                  |d             yt	        j4                  |t.        j6                  j9                  |d| j                  j:                   d             y)z@Save random number generator states for reproducible resumption.)pythonnumpyr   r  r   npuhpumlumusaTr   r   rng_state.pth
rng_state_.pthN)randomgetstater@  	get_stater   get_rng_stater  r  r   r   rs   r   get_rng_state_allr   r  r   r  r   r  r   r  r   r  r)  r*  r  saver[  r  r  )rE  r   
rng_statess      r   r  zTrainer._save_rng_state8  sw    oo'YY((*<<--/


 ::""$yy&&,*B*BB%*ZZ%6%6%H%H%J
6"%*ZZ%6%6%D%D%F
6"!# " 0 0 2Ju!#yy&&,*B*BB$)II$4$4$F$F$H
5!$)II$4$4$B$B$D
5!!#yy&&,*B*BB$)II$4$4$F$F$H
5!$)II$4$4$B$B$D
5!!#yy&&,*B*BB$)II$4$4$F$F$H
5!$)II$4$4$B$B$D
5!"$yy&&,*B*BB%*ZZ%A%A%C
6"%*ZZ%=%=%?
6" 	J.991$JJz277<<
O#LMJJz277<<
jI`I`Haae<f#ghrp  c                 
   t               rt        j                  d       | j                  r| j                  j                         | j                  j                         d}t        j                  |t        j                  j                  |d| j                  j                   d| j                  j                   dt               d       nPt        j                  | j                  j                         t        j                  j                  |t                     t!        j"                  d	      5 }t        j                  | j$                  j                         t        j                  j                  |t&                     t)        |       d
d
d
       nt+               r| j                  j-                  d      }t/        j0                          t/        j2                         dk(  s%t.        j4                  j6                  j8                  rt/        j                  |t        j                  j                  |t              dt.        j4                  j6                  j8                         n| j:                  rdt=        t?        j@                  | jB                  jD                        jF                  jI                               v }|r4tK        | j                        r| jB                  jE                  |d       n$| jB                  jE                  |       n| jL                  rtO        | jP                  j4                  jR                  | jP                  | j                  |fi tU                tW        | jP                  j4                  jR                  | jP                  | j                  | j                  |       nf| j                  jX                  rPt[        j                  | j                  j                         t        j                  j                  |t                     | j:                  xr t]        | j$                  t^               }| j                  jX                  r| j:                  r|rt               s{t!        j"                  d	      5 }t[        j                  | j$                  j                         t        j                  j                  |t&                     d
d
d
       t)               y
y
y
y
# 1 sw Y   xY w# 1 sw Y   $xY w)zBSave optimizer and learning rate scheduler states to `output_dir`.saving_optimizer_states)r  shard_metadatar  -of-r  F)master_onlyTrecordN)gather_if_shardr   )r   v3exclude_frozen_parametersr.  )0r   r  r  rC  r  
state_dictr   get_shard_metadatar#  r)  r[  r  r   r  r  OPTIMIZER_NAMEr  catch_warningsr  SCHEDULER_NAMErQ   r   local_state_dictr[  r   rdp_rankr,  r\  shard_optimizer_stater   r   r   r  r  save_checkpointr  r  r`   r  r   r  r  r)   r   r(  r   r  r   )rE  r   optmcaught_warningsopt_state_dict accept_exclude_frozen_parametersis_deepspeed_custom_schedulers          r   r  z%Trainer._save_optimizer_and_schedulerk  s   !#MM34**!%!:!:!<&*jj&C&C&E GGLL"d499+B+B*C4		H\H\G]]^_m^n$o !& 113RWW\\*n5]^((5 5))446ZQ_8`a#O45 5 %&!^^<<U<SNKKM||~"ciimm&I&I"GGLL^< yy}}::	 && 0Kc!!$"4"4"D"DEPPUUWO 0, 0N4::4N""22:Y]2^""22:>!!  &&22D4D4DdjjR\`t`v    &&22D4D4DdnnVZV`V`bl YY""JJt~~002BGGLL^4\] )-(A(A )
*8K
 G
% II!!..2O*,((5 e

4,,779277<<
Tb;cde0	 - 3P "Q5 5Xe es   4AS6AT6S?Tc                    	 | j                   j                  }|yt               rt	        j
                  d       t        j                  d      5 }t	        j                  | j                   j                  j                         t        j                  j                  |t                     t        |       ddd       | j                  j                   rt               st        j                  d      5 }t#        j                  | j                   j                  j                         t        j                  j                  |t                     ddd       t               yyy# t        $ r Y yw xY w# 1 sw Y   xY w# 1 sw Y   2xY w)z-Save the gradient scaler state if one exists.Nsaving_scaler_stateTr*  )r  scalerr  r   r  r  r  r3  r#  r0  r)  r[  r  SCALER_NAMErQ   r   r(  r   )rE  r   r@  r:  s       r   r  zTrainer._save_scaler  s+   	%%,,F >!#MM/0((5 5((//::<bggll:Wb>cd#O45
 99  )?)A((5 h

4++22==?jZeAfgh0 *B   		5 5h hs+   E% A&E44AF %	E10E14E= F	c           
         || j                   }t        j                  j                  t              }t        j                  j                  t
              }t        j                  j                  t              }t        j                  j                  t              }t        j                  j                  t              }t        j                  j                  t              }t        j                  j                  t              }	t        j                  j                        xrm t        fdt        j                        D              xsD t        j                  j                  t        j                  j                  t         d            }
t        j                  j                        rt        j                        D cg c]  }t        j                  j                  t        j                  j                  |            rt        j                  j                  t        j                  j                  |t
                    sBt        j                  j                  t        j                  j                  |t                    r| c}ng }|
r| j                   st#        d d      t        d ||||	||fD              s|
s|st#        d       t$        j'                  d d	       t        j                  j                  |      rLt)        j*                  |      }|j,                  }|)|t.        k7  r t$        j1                  d
| dt.         d       t        j                  j                  |      s!t        j                  j                  |      s|
rt3               rt5        j6                  t        dd       y| j                   r@t9        | j:                  j<                  j>                  | j:                  |fi tA                yt        j                  j                  |      r"tB        jD                  jG                  |d      }n"tI                tE        jJ                  |dd      }|jM                  |d      }~| jO                  |       ytQ        |      rtS        |d      rtS        |d      rt        j                  jU                        r|jV                  }tY        |      dkD  rt$        j1                  d       |d   }|rP|D ]9  }t        j                  j                  |      }|j[                  ||||k(         ; |j]                  |       y|j[                  |d       yt$        j1                  dt
         d       yt$        j1                  dt^         d       yta        |t3                     }t3               s| jO                  |       yyc c}w )z/Load model weights from a checkpoint directory.Nc              3      K   | ]I  }t         j                  j                  t         j                  j                  |            r
t        |v  K y wr   )r)  r[  isdirr  FSDP_MODEL_NAME)r   folder_namerP  s     r   r   z0Trainer._load_from_checkpoint.<locals>.<genexpr>  s>      77==.Dk!RS  ;.s   AAz.binzCheckpoint found at z* is only supported when using PyTorch FSDPc              3   Z   K   | ]#  }t         j                  j                  |       % y wr   )r)  r[  r  )r   fs     r   r   z0Trainer._load_from_checkpoint.<locals>.<genexpr>  s%      
 q!
s   )+z!Can't find a valid checkpoint at zLoading model from r  z9You are resuming training from a checkpoint trained with z- of Transformers but your current version is zJ. This is not recommended and could yield to errors or unwanted behaviors.Fr[  tagr   load_optimizerr   rq  Tmap_locationweights_onlyactive_adaptersload_adapterr   zFMultiple active adapters detected will only consider the first adapterr   )is_trainablejThe intermediate checkpoints of PEFT may not be saved correctly, consider using a custom callback to save i in corresponding saving folders. Check some examples here: https://github.com/huggingface/peft/issues/968Could not load adapter model, make sure to have PEFT >= 
 installedstrict)1r   r)  r[  r  rx   rw   rv   r}   r|   r{   rz   rD  r  listdirr  rE  r  r   r   r   r   from_json_filetransformers_versionr   r]  r   r[  rP  r   r  r,  r  r)   safetensorsr   	load_filer   loadload_state_dict_issue_warnings_after_loadr`   r	  r  rO  r   rP  set_adapterr.   ri   )rE  rP  r   config_fileadapter_weights_fileadapter_safe_weights_fileweights_fileweights_index_filesafe_weights_filesafe_weights_index_fileis_fsdp_ckptrF  adapter_subdirsr   checkpoint_versionr0  load_resultrO  active_adaptersubdir_namepeft_ids    `                   r   rY  zTrainer._load_from_checkpoint  s   =JJEggll#9;G!ww||,BDXY$&GGLL1GIb$c!ww||$:LIWW\\*@BTUGGLL)?ARS"$'',,/EG^"_ww}}%;< 	
 #%::.D#E  ^ ww~~bggll+AoEVVZC[\] 	* ww}}34 $&::.D#E77==.Dk!RSGGNN277<<0FUi#jkww~~bggll3I;Xq&rs   	  4 434J3KKuvww  
 !%&+(-
 
 @AW@XYZZ)*@)ACD77>>+&%44[AF!'!<!<!-2D2SOPbOc d@@K} M== 77>>,'277>>:K+LP\&(**/\5af %%$$**66$$*	
 +, 77>>"34!,!2!2!<!<=NW\!<!]J,.!&Lu[_!`J $33JF//< E"u/0WUN5S77>>"89&+&;&;O?+a/'op%4Q%7N&+: sK&(ggll3I;&WG!..wS^bpSp.rs )).9**+A>`d*eNNDDXCY Zbb !YZjYkkuvw 2%9OXoXqrK*,//< -Ms   C
W3c                    t         j                  d| j                  j                   d| j                  j                   d       t
        j                  j                  | j                  j                  t              }t
        j                  j                  | j                  j                  t              }t
        j                  j                  | j                  j                  t              }t
        j                  j                  | j                  j                  t              }t               r| j                  n| j                  }| j                  rAt!        | j                  | j                  j                  t#        | j                                y| j$                  rTt'        | j(                  j                  j*                  | j(                  || j                  j                  fi t-               }yt
        j                  j/                  |      s^t
        j                  j/                  |      s?t
        j                  j/                  |      s t
        j                  j/                  |      rd}t               r2t1        j2                  | j                  j                  t        dd       yt#        |      r	t5        |d      rt5        |d	      r|j6                  d
   }t9        |j6                        dkD  rt         j;                  d       t
        j                  j/                  |      st
        j                  j/                  |      r7	 |j=                  | j                  j                  |       d
dl$m%}  |g g       }nt         j;                  dt         d       d}nt         j;                  dtL         d       d}nut
        j                  jO                  |      r"tP        jR                  jU                  |d      }n"tW                tS        jX                  |dd      }|j[                  |d      }t               s|r| j]                         yyyt
        j                  j/                  t
        j                  j                  | j                  j                  t^                    sUt
        j                  j/                  t
        j                  j                  | j                  j                  t`                    rGtc        || j                  j                  t                     }t               s| j]                  |       yyt         j;                  d| d       y# t>        $ rR}	|j@                  |   jB                  r3d|j@                  |   jD                  jF                   d}
t?        |
      |	 d}	~	ww xY w)zFLoad the best model found during training based on the tracked metric.zLoading best model from z	 (score: z).rl  TFrI  rO  rP  r   r   zCDetected multiple active adapters, will only consider the first onez0When using prompt learning PEFT methods such as z, setting load_best_model_at_end=True can lead to errors, it is recommended to set this to False and to load the model manually from the checkpoint directory using PeftModel.from_pretrained(base_model, <path>) after training has finished.N)_IncompatibleKeysrR  rS  rT  rU  r   rq  rL  rV  z#Could not locate the best model at zi, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.)2r   r   r,  r  r  r)  r[  r  r}   r{   rw   rv   r   r  r   r   r%   r`   r  r   r  r  r)   r  r[  rP  r	  rO  r   r]  rP  r   peft_configis_prompt_learning	peft_typerM  torch.nn.modules.modulerp  r.   r  r[  r   r\  r   r]  r^  r_  rz   r|   ri   )rE  best_model_pathbest_safe_model_pathbest_adapter_model_pathbest_safe_adapter_model_pathr   rk  has_been_loadedrl  r  msgrp  r0  s                r   r  zTrainer._load_best_modelE  sY   .tzz/O/O.PPYZ^ZdZdZpZpYqqstu'',,tzz'G'GV!ww||DJJ,L,LN_`"$'',,tzz/O/OQe"f')ww||DJJ4T4TVo'p$&=&?""TZZ$$%""

00'5djj'A#A
 !!)  &&22  

00	
 '(K GGNN?+ww~~23ww~~56ww~~:;"O&(**99$!#(	 "%(u&78WUN=[).)>)>q)Au4459"NN+pq77>>*ABbggnnUqFr* % 2 24::3S3SUc d  R*;B*CK"NN!LL`Ka bj!j
 /4OVWgVhhrs +0 ww~~&:;%0%6%6%@%@AU^c%@%d
02%*ZZebf%g

 #("7"7
E"JK.0_33K@ 6E0WW^^BGGLL)I)IKbcdhjhohohvhvGGLL99;MNi
 2tzz77@W@YK +,//< - NN5o5F GP Pk $0 *#(#4#4^#D#W#W )[+0+<+<^+L+V+V+\+\*] ^8)8 %( +7s*;$D$)*s   &U$ $	V?-AV::V?r  c                 >   |y| j                   j                  dkD  rs| j                   j                  }t        j                  j                  |d| d      }t        j                  j                  |      sot        j                  d| d       yt        j                  j                  |d      }t        j                  j                  |      st        j                  d       yt               5  t                t        j                  |d	
      }ddd       t        j                  d          t        j                  j!                  |d          t        j                  j#                  |d          t%               rt'        j"                  |d          | j                   j(                  t*        j,                  k(  }t        j.                  j1                         rt3        dt        j.                  ||       t5               rt3        dt        j6                  ||       t9               rt3        dt        j:                  ||       t=               rt3        dt        j>                  ||       tA               rt3        dt        jB                  ||       yy# 1 sw Y   xY w)z9Restore random number generator states from a checkpoint.Nr   r  r  z$Didn't find an RNG file for process zr, if you are resuming a training that wasn't launched in a distributed fashion, reproducibility is not guaranteed.r  zDidn't find an RNG file, if you are resuming a training that was launched in a distributed fashion, reproducibility is not guaranteed.TrN  r  r  r   r   CUDANPUHPUMLUMUSA)"r   r  r  r)  r[  r  r  r   r   rS   r   r   r]  r  setstater@  	set_stateset_rng_stater   r  r   rs   r   r  r  rT   r   r  r   r  r   r  r   r  )rE  r  r  rng_filecheckpoint_rng_stateis_distributeds         r   r  zTrainer._load_rng_state  s    99!# II33Mww||J*]O40PQH77>>(+:=/ Jc c ww||J@H77>>(+B ^ 	K$&#(::hT#J 	K 	,X67
		09:""#7#>?!#1%8900L4L4LL::""$$VUZZ9M~^!#$UEII7K^\!#$UEII7K^\!#$UEII7K^\"$$VUZZ9M~^ %%	K 	Ks   /"JJc                     y| j                   rt        | j                  t              st	        j
                  d      5 }t                | j                  j                  t        j                  t        j                  j                  t              d             ddd       t               yt               r:t!        j                   t        j                  j                  t"              dz         nt        j                  j%                  t        j                  j                  t"                    xs t        j                  j%                  t        j                  j                  t&                    xsH t        j                  j)                        xr' t+        fdt        j,                        D              }| j.                  rQt!        j                   t        j                  j                  d| j0                  j2                   dt"                     n|}|rt        j                  j%                  t        j                  j                  t                    rpt5               r| j.                  r{t                t        j                  t        j                  j                  d	| j0                  j6                   d
| j0                  j2                   dt"               dd      }|d   }nDt                t        j                  t        j                  j                  t"              dd      }t	        j
                  d      5 }t                t        j                  t        j                  j                  t              dd      }ddd       t               t9        j:                  || j0                  j<                         t9        j:                  | j0                  j<                         | j>                  j                  |       | j                  j                  |       yt               r!fd}| j@                  jC                  |       n| j0                  j2                  dkD  r| j0                  j<                  nd}| jD                  rUtG        | jH                  jJ                  jL                  | jH                  | j>                  | jN                  fi tQ                n]t                | j>                  j                  t        j                  t        j                  j                  t"              |d             t	        j
                  d      5 }t                | j                  j                  t        j                  t        j                  j                  t              d             ddd       t               yyy# 1 sw Y   >xY w# 1 sw Y   PxY w# 1 sw Y   1xY w)z3If optimizer and scheduler states exist, load them.NTr*  r|  _*c              3      K   | ][  }t         j                  j                  t         j                  j                  |            rt        j                  d       d   |v  ] yw)r  r   N)r)  r[  rD  r  OPTIMIZER_NAME_BINsplit)r   rF  r  s     r   r   z8Trainer._load_optimizer_and_scheduler.<locals>.<genexpr>  sM      '77==j+)NO +005a8KGs   A!A$z	rank*-of-r  r  r(  r   rL  r  c                     |j                  t        j                  t        j                  j                  t              d             y )NTr   )r^  r[  r]  r)  r[  r  r2  )modoptr  s     r   opt_load_hookz<Trainer._load_optimizer_and_scheduler.<locals>.opt_load_hook$  s,    ++CHHRWW\\*n5]gk,lmrp  r   ))r   r  r  r   r  r3  r   r^  r   r]  r)  r[  r  r4  rQ   r   globr2  r  r  rD  r  rX  rC  r   r  r   r  r  send_cpu_data_to_devicer   r  r  register_post_step_hookr  r   r  r,  r  r   r)   )rE  r  r:  checkpoint_file_existsoptimizer_statelr_scheduler_stater  rM  s    `      r   r  z%Trainer._load_optimizer_and_scheduler  sP   $$d//1JK,,D9 _,.%%55

277<<
N#KZ^_
 $O4 '( IIbggll:~>EF rww||JGH 	77>>"'',,z;M"NO	 GGMM*-  +-::j+A  	$ ** IIbggll:499;O;O:PPQR`Qa/bcd' 	
 "bggnnRWW\\*n5]&^%'..,.&+jj&$tyy/F/F.GtDIIL`L`Kaabcqbr(s &+%)'O '6k&BO,.&+jjZ@ucg'O ,,D9 _,.).Z@ucg*&
 $O4**?DII<L<LM**+=tyy?O?OP..?!!112DE*,n &&>>}M
 8<yy7K7Ka7O499#3#3UZL+++ ,,22>> ,, NN JJ& 34 1266!JJ "Z HWcrv
 ,,D9 _,.%%55

277<<
N#KZ^_
 $O4C '_!; b P s'   AWAW''AW4W$'W14W=c           	      t   |yt         j                  j                  t         j                  j                  |t                    }|rYt               rt        j                  d      5 }t                t        j                  t         j                  j                  |t              dd      }ddd       t               t        j                  | j                  j                         | j                   j"                  j%                  |       yt        j                  d      5 }t                | j                   j"                  j%                  t        j                  t         j                  j                  |t              d             ddd       t               yy# 1 sw Y   xY w# 1 sw Y   "xY w)z If scaler state exists, load it.NTr*  r   rL  r|  )r)  r[  r  r  rA  r   r  r3  r   r   r]  rQ   r  r  r   r   r  r@  r^  )rE  r  r  r:  scaler_states        r   r  zTrainer._load_scalerD  s8   !#Z0U!V! &',,D9 _,.#(::Z=E`d$L
 $O4**<9I9IJ  ''77E,,D9 _,.$$++;;

277<<
K#HW[\
 $O4% "  s   (AF"&A'F."F+.F7c                     | j                   j                  syg }g }| j                  j                  | j                  gz   }| j
                  j                  j                         D ]9  \  }t        |t              s|g}t        fd|D              r|D cg c]  }|j                  j                  k(  s|  }}t        ||      D ]  \  }}|j                  di       }|j                  di       }	 t        |      d
i |}
|	j                         D ]  \  }}t!        |
||        t        |t"              r|
| _        n|j%                  |
       | j                  j'                  t        |
              t(        j+                  d       )|j%                         < t-        |      dkD  r(t(        j/                  ddj1                  |       d	       |D ]  }| j                  j3                  |        yc c}w )zLIf callback states exist and were passed in, restore their states if enabledNc              3   P   K   | ]  }|j                   j                  k(    y wr   )r   r   )r   callbackstored_callbacks     r   r   z/Trainer._load_callback_state.<locals>.<genexpr>j  s"     eh8%%../Aes   #&r   
attributeszPContinuing training from checkpoint, restoring any callbacks that were passed inr   zPCheckpoint included callbacks not included in current configuration. Ignoring. (r  rU  r  )r   'restore_callback_states_from_checkpointr!  r   r+  r,  r   rE  r  r  r  r   r   zipr8  r  r  r@   r2  remove_callbackr   r   r   r]  r  r"  )rE  	not_foundnew_callbacksoriginal_callbacksrj  r  
duplicatescallback_datar   r  new_callback	attributerM  r  s                @r   r  zTrainer._load_callback_state_  s   yy@@	!22<<~M%)ZZ%B%B%H%H%J 	2!OTdD)veRdee .@!)8CUCUC^C^bqCqH
  03:t/D N+Hm(,,VR8D!.!2!2<!DJ#14>#9D#9L,6,<,<,> @(	5i?@!(N;'3%,,\:))99$|:LMN no  1/	20 y>ANNbcgclclmvcwbxxyz & 	9H!!..x8	9/s   G;6G;rk  c                    t        |j                        dk7  r| j                  j                  Pt	        |j                        t	        | j                  j                        k(  r| j                  j                          n#t        j                  d|j                   d       t        |j                        dk7  r$t        j                  d|j                   d       yy)zGLog warnings for missing or unexpected keys after loading a checkpoint.r   Nz8There were missing keys in the checkpoint model loaded: r  z;There were unexpected keys in the checkpoint model loaded: )	r   missing_keysr   _keys_to_ignore_on_saver   tie_weightsr   r]  unexpected_keys)rE  rk  s     r   r_  z"Trainer._issue_warnings_after_load  s    {''(A-zz11=#kF^F^B_cf

22d C 

&&(!YZeZrZrYsstuv{**+q0NNMkNiNiMjjkl 1rp  r  c                 V   || j                   j                  }t               rDt        | j                  | j                   | j
                  | j                  | j                  |       not               rt        j                  |d       | j                  j                         }| j                   j                  r| j                  ||       t        t        j                   j#                  |d            j%                          n| j&                  rdt)        | j
                  j*                  j,                  j.                        v r| j
                  j1                  | j                        }| j                   j                  rT| j                  ||       n?| j2                  r	 dt5        t7        j8                  | j                  j:                        j<                  j?                               v }| j@                  jB                  jE                  di       jE                  d	d      d
k(  }|r4tG        | j                        r|r| j@                  jI                  d      }n%| j
                  j1                  | j@                        }| j                   j                  r| j                  ||       n'| j                   j                  r| j                  |       | j                   jV                  r+|s(| jW                  d| j                   jX                         yyy# tJ        $ r tL        jO                  d       | j                   j                  r| j                  |i        tQ        | j                   j                  |tR        tT        g       | j                  j;                  |       Y w xY w)z
        Will save the model, so you can reload it using `from_pretrained()`.

        Will only save from the main process.
        NTr   r0  zuser_content.ptFULL_STATE_DICTr.  zero_optimizationstager  r/  z| stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead, use zero_to_fp32.py to recover weightsz
Model save)commit_messagerevision)-r   r   r   r/   r   r  r   rC  r   r)  r*  r  r0  r(  _saver   r[  r  touchr  r  r,  r  r  get_state_dictr   r   r   r  r8  r  r  r   r   r8  r`   $_zero3_consolidated_16bit_state_dictr   r   r]  rR   r}   r{   r&  hub_revision)rE  r   r  r0  r<  zero3_shardings         r   r  zTrainer.save_model  s    --J!#

DIIt'7'79N9NPTPkPkmw %&KK
T2++668Jyy$$

:*
=j*;<=CCE!! C(8(8(>(>(J(J(Z(Z$[[!--<<TZZH
99((JJzjJA&&?3NRU%%d&8&8&H&HITTYY[S 40 "&!6!6!:!:;NPR!S!W!WX_ae!fjk!k3tzz8RWe "&!T!Tos!T!tJ!%!1!1!@!@!PJ99((JJzjJA YY""JJz" 99  L499CYCYZ *8   	?: 99((JJzbJ9'		(=(=zLZkKlm""22:>	?s   %D	L BN('N(r0  c                    ||n| j                   j                  }t        j                  |d       t        j                  d|        t               st        fnt        t        f}t        | j                  |      s|| j                  j                         }t        | j                  j                  | j                  d      |      r9| j                  j                  | j                  d      j                  ||       nyt        j                  d       t        j                   j#                  |t        j$                  j'                  |t(              d	d
i       n| j                  j                  ||       | j*                  | j*                  j                  |       nr| j,                  ft/        | j,                  d      rP| j,                  j0                  :t        j                  d       | j,                  j0                  j                  |       t!        j2                  | j                   t        j$                  j'                  |t4                     y)zHSave model weights, configuration, and processing class to `output_dir`.NTr   zSaving model checkpoint to Frv  r  zETrainer.model is not a `PreTrainedModel`, only saving its state dict.r  pt)metadata	tokenizerzWSaving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`)r   r   r)  r*  r   r   r   r3   r   r  r   r0  r  r4   save_pretrainedr[  r   	save_filer[  r  r{   r   r   r	  r  r#  TRAINING_ARGS_NAME)rE  r   r0  supported_classess       r   r  zTrainer._save  s    $.#9Ztyy?S?S

J.1*>?6G6I_.P_ajOk $**&78!!ZZ224
$**77

W\7]_pq  --djjU-Scc: d  cd!!++Z9J KW_aeVf ,  JJ&&zj&I  ,!!11*=***K8"",,8KKqr((88D 	

499bggll:7IJKrp  r.  c                 N   | j                   j                  | j                   j                  |d<   | j                  j                  dk7  r[| j                   j                  |d<   |@| j                   j                  | j
                  z
  }|j                  t        d||             i |d| j                   j                  i}| j                   j                  j                  |       | j                  j                  | j                  | j                   | j                  |      | _        y)a8  
        Log `logs` on the various objects watching training.

        Subclass and override this method to inject custom behavior.

        Args:
            logs (`dict[str, float]`):
                The values to log.
            start_time (`Optional[float]`):
                The start of training.
        Nr  rV  r  rc  )
num_tokensr  )r,  r  r   r  r  r  rz  ro   r  log_historyr2  r!  on_logr+  )rE  r.  r  current_session_num_tokensr  s        r   r  zTrainer.log  s     ::' JJ,,DM9922d:,0JJ,L,LD()%JJ44t7e7ee + M':Jdef9D9&$**"8"89

%%f-,,33DIItzz4<<Y]^rp  c                    | j                   j                  t        j                  k(  rm| j                  xj
                  t        | j                  g| j                   j                        j                         j                         z  c_        d| _        y| j                  xj
                  | j                  z  c_        d| _        y)zGStore the number of floating-point operations that went into the model.rq  r   N)r   r   rs   r   r,  r  rJ   r0  r   r  rL  r  s    r   r  zTrainer.store_flos  s    99""l&>&>>JJ!!-t/@/@.A$))JZJZ[__affh! !"DJJ!!T%6%66! !Drp  c                     t        | j                  dd      x}|v rIt        | j                  d      r3d||   j                         z  | j                  j	                  d      z  S y)a  
        For models that inherit from [`PreTrainedModel`], uses that method to compute the number of floating point
        operations for every backward + forward pass. If using another model, either implement such a method in the
        model or subclass and override this method.

        Args:
            inputs (`dict[str, torch.Tensor | Any]`):
                The inputs and targets of the model.

        Returns:
            `int`: The number of floating-point operations.
        ru  rv  num_parameters   T)exclude_embeddingsr   )r   r   r	  r!  r  )rE  r  
main_inputs      r   r  zTrainer.floating_point_ops  si     "$**.?MMJRXX]dJJ(^
 vj)//11DJJ4M4Mae4M4fffrp  tokenc                    | j                         sy| j                  j                  8t        | j                  j                        j                         j                  }n| j                  j                  }||n| j                  j                  }t        ||| j                  j                  d      }|j                  | _        d| _        y)zE
        Initializes a git repo in `self.args.hub_model_id`.
        NT)r  privater   )r   r   r%  r   r   absoluter  	hub_tokenr   hub_private_reporepo_idpush_in_progress)rE  r  	repo_namerepo_urls       r   r'  zTrainer.init_hf_repo3  s    
 ))+99!!)TYY112;;=BBI		..I*		0C0Cytyy?Y?Ydhi$,, $rp  languagelicensetagsr  finetuned_fromtasksdataset_tagsdataset_argsc
                 H   | j                         syt        j                  j                  | j                  j
                  d      }
d}t        j                  j                  |
      rt        j                  |
      j                  j                  d      }|dk(  }t        j                  |
      j                  j                  }|2|0t        |t              r|g}|D ]  }||vs|j                  |        t        j                   | |||||||||	
      }|j#                         }t%        |
d      5 }|j'                  |       ddd       |rI| j(                  j+                  | j,                        j/                  | j                  j
                         yy# 1 sw Y   UxY w)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            language (`str`, *optional*):
                The language of the model (if applicable)
            license (`str`, *optional*):
                The license of the model. Will default to the license of the pretrained model used, if the original
                model given to the `Trainer` comes from a repo on the Hub.
            tags (`str` or `list[str]`, *optional*):
                Some tags to be included in the metadata of the model card.
            model_name (`str`, *optional*):
                The name of the model.
            finetuned_from (`str`, *optional*):
                The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
                of the original model given to the `Trainer` (if it comes from the Hub).
            tasks (`str` or `list[str]`, *optional*):
                One or several task identifiers, to be included in the metadata of the model card.
            dataset_tags (`str` or `list[str]`, *optional*):
                One or several dataset tags, to be included in the metadata of the model card.
            dataset (`str` or `list[str]`, *optional*):
                One or several dataset identifiers, to be included in the metadata of the model card.
            dataset_args (`str` or `list[str]`, *optional*):
               One or several dataset arguments, to be included in the metadata of the model card.
        Nz	README.mdFlibrary_namepeft)	r  r  r  r  r  r  r  r  r  w)r   r)  r[  r  r   r   r  r   r]  rj  r8  r  r  r  r2  r2   from_trainerto_model_cardopenwriter  r4   r   create_or_update_model_card)rE  r  r  r  r  r  r  r  r  r  model_card_filepathis_peft_libraryr  existing_tagsrJ  training_summary
model_cardrH  s                     r   create_model_cardzTrainer.create_model_cardE  sw   J ))+ ggll499+?+?M77>>-.$>>*=>CCGGWL*f4O &NN+>?DDIIMM$=dC( 6D( )C$C() +77!)%%
 &335
%s+ 	 qGGJ	  ))$**5QQRVR[R[RfRfg 	  	 s   2FF!r  blockingr  c           
      $   | j                   j                  | j                  | j                  | j                         |j                  dd      }|~| j                  j                  rh| j                  j                  *t        | j                  j                        j                  }n(| j                  j                  j                  d      d   }||n| j                  j                  }| j                  | j                  |       | j                  d       | j                         syt!        | j"                  dd      \d	|vrg |d	<   t%        |d	   t&              r	|d	   g|d	<   | j"                  j(                  D ]  }||d	   vs|d	   j+                  |          | j,                  dd|i| || j                  j.                  }| j1                          t3        | j                  | j                  j                  ||| d
t4         dg|      S )u  
        Upload `self.model` and `self.processing_class` to the 🤗 model hub on the repo `self.args.hub_model_id`.

        Parameters:
            commit_message (`str`, *optional*, defaults to `"End of training"`):
                Message to commit while pushing.
            blocking (`bool`, *optional*, defaults to `True`):
                Whether the function should return only when the `git push` has finished.
            token (`str`, *optional*, defaults to `None`):
                Token with write permission to overwrite Trainer's original args.
            revision (`str`, *optional*):
                The git revision to commit from. Defaults to the head of the "main" branch.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to [`~Trainer.create_model_card`].

        Returns:
            The URL of the repository where the model was pushed if `blocking=False`, or a `Future` object tracking the
            progress of the commit if `blocking=True`.
        r  N/rs  )r  Tr  
model_tagsr  r  -*r  folder_pathr  r  run_as_futureignore_patternsr  r  )r!  on_push_beginr   r,  r+  r  r(  r%  r   r   r  r  r  r'  r  r   r   r   r  r  r  r2  r  r  r  r   rU   )rE  r  r  r  r  r  r  	model_tags           r   r&  zTrainer.push_to_hub  s   6 	++DIItzz4<<PZZd3
$))"7"7yy%%-!$))"6"67<<
!YY3399#>rB
*		0C0C $E* 	t, ))+
 4::|T2>V#!#v &.#."(.!1v!ZZ22 5	F6N26N)))45 	?*??yy--H 	!!#%%		,,)&,!&;%<B#?@
 	
rp  r  c           
      ~	   | j                         r'| j                  j                  t        j                  k(  ry| j                  j
                  s'| j                  | j                  j                         sy| j                  j                  | j                  | j                  | j                         | j                  j                  }t        t        t        t         g}t"        t$        fD ]  }t&        j(                  j+                  ||      }t&        j(                  j-                  |      sC|j/                  |       t1        |      5 }t3        j4                  |j7                               }ddd       t9        t;        d   j=                                     }|j?                  |        tA               r |j?                  tB        tD        tF        g       |D ]  }	t&        j(                  j-                  t&        j(                  j+                  ||	            sAtI        jJ                  t&        j(                  j+                  ||	      t&        j(                  j+                  ||	              | jL                  | jL                  jO                  |       tQ        jR                  | j                  t&        j(                  j+                  |tT                     | j                  jV                  tX        jZ                  k(  rd| j                  j\                   }
n"dt_        | j                  j`                         }
tc        | jd                  ||
| j                  jf                  ddth         dg| j                  jj                        }|g}| j                  j                  t        jl                  t        jn                  fv r| j                  j                  t        jl                  k(  rd	ntq        |      jr                  }tc        | jd                  |||
d
z   | j                  jf                  d| j                  jj                        }|j/                  |       | j                  | j                  j                         rtu        |      | _        y| j                  jv                  j?                  |       y# 1 sw Y   HxY w)zDPush model and checkpoint files to the Hub from a checkpoint folder.N
weight_mapzTraining in progress, step zTraining in progress, epoch Tr  r  r  zlast-checkpointz, checkpoint)r  r  path_in_repor  r  r  r  )<r   r   hub_strategyrZ   ENDhub_always_pushr  is_doner!  r  r,  r+  r   rx   ry   r}   r{   r|   rz   r)  r[  r  r  r2  r  jsonloadsreadr  r   r   extendr   ru   rw   rv   r  copyr   r  r   r#  r  r`  r]   r  r  r  r  r   r%  r  rU   r  
CHECKPOINTALL_CHECKPOINTSr   r  r   jobs)rE  r  r   modeling_files
index_file
index_pathrH  indexshard_filesmodeling_filer  model_push_job	push_jobsr  checkpoint_pushs                  r   r  zTrainer._push_from_checkpoint  s   ))+tyy/E/E/Xyy((T-B-B-NW[WlWlWtWtWv++DIItzz4<<PYY))
%'=|M^_-/FG 	3J&7DJww~~j)%%j1*% 1 JJqvvx0E1"3u\':'A'A'C#DE%%k2	3 !!#68LNg"hi+ 	uMww~~bggll+<mLMBGGLL):MJBGGLLYcerLst	u   ,!!11*=

499bggll:7IJK99""l&8&88:4::;Q;Q:RSN;C

@P@P<Q;RSN&%%")))%%!&;%<B#?@YY++
 $$	99!!k&<&<k>Y>Y%ZZ%)YY%;%;{?U?U%U![_`q[r[w[w  ,))-)->ii))"//O _-  (D,A,A,I,I,K$29$=D!!!&&--i8e1 1s   $R22R<	c                     t        | d      sy| j                  K| j                  j                         s0t        j	                  d       | j                  j                          yyy)z5Wait for any in-progress push to the Hub to complete.r  Nz\Waiting for the current checkpoint push to be finished, this might take a couple of minutes.)r	  r  r  r   r   wait_until_doner  s    r   r  zTrainer._finish_current_push&  sU    t/0  ,T5J5J5R5R5TKKvw!!113 6U,rp  hp_spacezoptuna.Trialcompute_objectiven_trials	directionrw  r.  c                 8   |
t               }t        |      }t        |          }|j                          || _        | j
                  t        d      ||j                  n|| _        || _	        |t        n|| _         |j                  | ||fi |}	d| _        |	S )a  
        Launch an hyperparameter search using `optuna` or `Ray Tune`. The optimized quantity is determined
        by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided,
        the sum of all metrics otherwise.

        <Tip warning={true}>

        To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to
        reinitialize the model at each new run. This is incompatible with the `optimizers` argument, so you need to
        subclass [`Trainer`] and override the method [`~Trainer.create_optimizer_and_scheduler`] for custom
        optimizer/scheduler.

        </Tip>

        Args:
            hp_space (`Callable[["optuna.Trial"], dict[str, float]]`, *optional*):
                A function that defines the hyperparameter search space. Will default to
                [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`]
                depending on your backend.
            compute_objective (`Callable[[dict[str, float]], float]`, *optional*):
                A function computing the objective to minimize or maximize from the metrics returned by the `evaluate`
                method. Will default to [`~trainer_utils.default_compute_objective`].
            n_trials (`int`, *optional*, defaults to 100):
                The number of trial runs to test.
            direction (`str` or `list[str]`, *optional*, defaults to `"minimize"`):
                If it's single objective optimization, direction is `str`, can be `"minimize"` or `"maximize"`, you
                should pick `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or
                several metrics. If it's multi objectives optimization, direction is `list[str]`, can be List of
                `"minimize"` and `"maximize"`, you should pick `"minimize"` when optimizing the validation loss,
                `"maximize"` when optimizing one or several metrics.
            backend (`str` or [`~training_utils.HPSearchBackend`], *optional*):
                The backend to use for hyperparameter search. Will default to optuna or Ray Tune, depending
                on which one is installed. If all are installed, will default to optuna.
            hp_name (`Callable[["optuna.Trial"], str]]`, *optional*):
                A function that defines the trial/run name. Will default to None.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments for each backend:

                - `optuna`: parameters from
                  [optuna.study.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html)
                  and also the parameters `timeout`, `n_jobs` and `gc_after_trial` from
                  [optuna.study.Study.optimize](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize)
                - `ray`: parameters from [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run).
                  If `resources_per_trial` is not set in the `kwargs`, it defaults to 1 CPU core and 1 GPU (if available).
                  If `progress_reporter` is not set in the `kwargs`,
                  [ray.tune.CLIReporter](https://docs.ray.io/en/latest/tune/api/doc/ray.tune.CLIReporter.html) is used.
        Returns:
            [`trainer_utils.BestRun` or `list[trainer_utils.BestRun]`]: All the information about the best run or best
            runs for multi-objective optimization. Experiment summary can be found in `run_summary` attribute for Ray
            backend.
        NzXTo use hyperparameter search, you need to pass your model through a model_init function.)r"   rY   r!   ensure_availabler/  r   r   default_hp_spacer  r.  rc   r  r  )
rE  r  r  r  r  rw  r.  r  backend_objbest_runs
             r   hyperparameter_searchzTrainer.hyperparameter_search0  s    z ?/1G!'*8AC$$&!(??"j  9A8H44h>O>W!:]n";??49GG!%rp  c                     t        | j                        }|dk(  r| j                         }n"|dk(  r| j                  |      }nt        d      |t        d      |S )zdInvoke `model_init` to get a fresh model instance, optionally conditioned on a hyperparameter trial.r   r   z'model_init should have 0 or 1 argument.z"model_init should not return None.)rj   r   r   )rE  rQ  model_init_argcountr   s       r   r   zTrainer.call_model_init  s]    1$//B!#OO%E A%OOE*EHII=CDDrp  c                 ,   || _         | j                  |y| j                  t        j                  k(  r| j	                  |      }nQ| j                  t        j
                  k(  r|}|j                  dd       n| j                  t        j                  k(  r|}j                         D ]v  \  }}t        | j                  |      st        j                  d| d       6t        | j                  |d      }| t        |      |      }t        | j                  ||       x | j                  t        j                  k(  r"t        j!                  d|j"                          | j                  t        j                  k(  rt        j!                  d|        | j$                  r| j                  j&                  t)        d      | j*                  j-                          dd	lm} dd
lm}  || j                  j&                        | j                  _        | j                  j6                  j9                  | j                          || j                  j6                        | j                  _        t=               j?                          | jA                          y)zRSet up training arguments and accelerator state for a hyperparameter search trial.Nr  zTrying to set zY in the hyperparameter search but there is no corresponding field in `TrainingArguments`.zTrial: zW&B Sweep parameters: z7For sweeps with deepspeed, `args.deepspeed` must be setr   )DeepSpeedPlugin)HfTrainerDeepSpeedConfig)rI  )!_trialr/  rY   r  r  r  r  r  rE  r	  r   r   r]  r   r  r  r   rW  r   r   r   r  r  r}  r  #transformers.integrations.deepspeedr   r  trainer_config_processrs  r   _reset_stater   )rE  rQ  rW  r  rM  old_attrr  r   s           r   rW  zTrainer._hp_search_setup  s   !!)U]!!_%;%;;]]5)F##':'::FJJw%##'<'<<F ,,. 	+JC499c*$SE *, , tyy#t4H#&Xu-DIIsE*	+ !!_%;%;;KK'%,,01!!_%:%::KK089$$yy""* !Z[[((* 9T,DTYYEXEX,YDII)II))@@K)8diiFcFc)dDII& ++-//1rp  r  c                 ~   | j                   |y|j                         }| j                  |      | _        | j                   t        j
                  k(  rddl}t        |d      r|j                  j                         sy|j                  | j                  |       |j                         rL| j                  j                  | j                  | j                  | j                           |j"                         yyy| j                   t        j$                  k(  rddl}t)        j*                         5 }d}| j                   j,                  r7| j/                  |       |j0                  j2                  j5                  |      }| j                  |d<   |j0                  j                  ||       ddd       yy# 1 sw Y   yxY w)zHReport intermediate metrics to the active hyperparameter search backend.Nr   study)checkpoint_dir	objective)r  )r/  r  r  r)  rY   r  optunar	  r'  _is_multi_objectivereportshould_pruner!  r  r   r,  r+  TrialPrunedr  r  tempfileTemporaryDirectoryr(  _tune_save_checkpointr  
Checkpointfrom_directory)rE  rQ  r  r  r*  r  temp_checkpoint_dirr  s           r   r  zTrainer._report_to_hp_search  sr    !!)U],,.//8!!_%;%;;ug&u{{/N/N/PT^^T2%%'))66tyy$**dll[,&,,.. ( 0Q&
 ##':'::,,. @2E!
<<++..>Q.R!$!4!4!C!CDW!XJ'+~~$J?@ @ ;@ @s   -A<F33F<r(  c                    t         j                  j                  |t         d| j                  j
                         }| j                  |d       | j                  j                  r| j                  j	                         | j                  j                  d<   | j                  j                  t         j                  j                  |t                     t        j                  | j                  j!                         t         j                  j                  |t"                     t        j                  | j$                  j!                         t         j                  j                  |t&                     yy)z@Save a checkpoint during a Ray Tune hyperparameter search trial.r  Tr  r@   N)r)  r[  r  rU   r,  r  r  r   r(  r+  r   r  r\  r   r#  r  r0  r2  r  r4  )rE  r(  r   s      r   r1  zTrainer._tune_save_checkpoint  s    WW\\.5J4K1TZZMcMcLd2ef

4899  >Bll>P>P>RDJJ))*:;JJ##BGGLL=O$PQJJt~~002BGGLL^4\]JJt((335rww||JP^7_` !rp  r  c                 :    | j                   j                  |       y)ag  
        Add a callback to the current list of [`~transformers.TrainerCallback`].

        Args:
           callback (`type` or [`~transformers.TrainerCallback]`):
               A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
               first case, will instantiate a member of that class.
        N)r!  r"  rE  r  s     r   r"  zTrainer.add_callback  s     	**84rp  c                 8    | j                   j                  |      S )aK  
        Remove a callback from the current list of [`~transformers.TrainerCallback`] and returns it.

        If the callback is not found, returns `None` (and no error is raised).

        Args:
           callback (`type` or [`~transformers.TrainerCallback]`):
               A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
               first case, will pop the first member of that class found in the list of callbacks.

        Returns:
            [`~transformers.TrainerCallback`]: The callback removed, if found.
        )r!  pop_callbackr7  s     r   r9  zTrainer.pop_callback  s     $$11(;;rp  c                 :    | j                   j                  |       y)a  
        Remove a callback from the current list of [`~transformers.TrainerCallback`].

        Args:
           callback (`type` or [`~transformers.TrainerCallback]`):
               A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
               first case, will remove the first member of that class found in the list of callbacks.
        N)r!  r  r7  s     r   r  zTrainer.remove_callback  s     	--h7rp  c                 4    | j                   j                  dk(  S )z
        Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several
        machines) main process.
        r   )r   local_process_indexr  s    r   r   zTrainer.is_local_process_zero  s    
 yy,,11rp  c                 v    t               rt        j                         dk(  S | j                  j                  dk(  S )z
        Whether or not this process is the global main process (when training in a distributed fashion on several
        machines, this is only going to be `True` for one process).
        r   )r   r[  r  r   r  r  s    r   r   zTrainer.is_world_process_zero  s/     #$88:?"yy&&!++rp  c                     t        |dd      t        j                  d       y|j                  |      }| j                  j
                  t        j                  k(  rt        |d      r|j                          yyy)zJMove the model to the specified device, re-tying weights on XLA if needed.r   NzZThe model is already on multiple devices. Skipping the move to device specified in `args`.r  )
r   r   r]  r	  r   r   rs   TPUr	  r  )rE  r   r   s      r   r  zTrainer._move_model_to_device&  si    5/40<NNl  99""l&6&6675-;X <Y6rp  )NNNNNNNNNNr   NN)rS  Nr   )NFN)NNN)NNNNN)FN)T)TN)NNr  )Nr  )F)NF)	NNNNNNNNN)zEnd of trainingTNN)NN   minimizeNN)rQ  rR  rS  N)r   
__module____qualname____doc__trainer_pt_utilsr   r   r   r   r   r   r   r3   r   Modulert   r   r9   r#   r    r8   r   rX   r  r  r?   rF  r   r  	Optimizerr  LambdaLRr  r  r
   r)  rR  r  r  r   r   r  r   r  r  r  r  ri  rj  SamplerrX  r  r  r  r  r  r  r  r
  LRSchedulerr  staticmethodr  r  r  rO  r_   rc  r]  r  r  r  r   r
  AbstractContextManagerr!  r  r   r   r  r5  rB  r  r  r  r  r_  rg  rf  r  r  rW   r  r[   r  r  r  r  r,  r+  r  r  r  rY  r  r  r  r  r  r_  r  r  r  r  r  r'  r  r   r&  r  r  rY   rV   r  r   rW  r  r1  r"  r9  r  r   r   r  r  rp  r   r   r      sX   [|   59)--1OSQU
 <@-1CG26dp^bei%z7*T1z7  $&z7 $d*	z7
 Mz7 Oz7 2

 !  	z7 S/12T9z7 $d?z7 ">"2D"89D@z7 (4/z7  %++//$68P8P8Y8Y\`8``a!z7" #(U[[-B-B(CT#s(^(S"TW["[#z7$ (0u||0Lell0Z'[^b'b%z7xPd(4S> (Tlx`
j 
(%
g0D %
PZ %
N
 
J 
$Kz Kc K& LP!%)00 0 	0
 gY(8(8(@(@@ADH0 0 d
0 
0d0$ 0%++JZJZJbJbeiJi 0>g %++:J:J:R:RUY:Y :\" FJ!;)!;8;d
!;	!;F' 'WZ]aWa 'mu '$	E 	E 	EJ%++"7"7 JZ RV!"%!27++2G2G$2N!		!	!	-	-!< +< _W[E[ glmprumugv  <
 ryy 
 T#Y 
 E @ 598<15	j #d
T 1j 6j #3i$.	j
 
j\ "&)--18<15kH$JkH  $&kH !$d
	kH
 6kH #3i$.kH 
kHb 9=	E!yyE! S%,,,,-E! "LL3.5	E!
 
E!V  %8<Q;yyQ; S%,,,,-Q; 	Q;
 "LL3.5Q; 
ellC/0	0Q;f
j.B.B 
(D4K (S]StSt (  '+0^0^ <<%'$.0^ yy	0^
 60^ 0^ #3i$.0^ 0^ t|0^ 
0^f1&1581BG,,1	tU\\C'$..	/1"2"T 2"5<< 2"TYT`T`cfTfimTm 2"h5<<##5 %,,:L $d3s0B+B&C SRWR^R^adRdMdHe \.YY\.(,S%,,2D-D(E\.	xc5<<##5566	7\.|C
%C
3=C
WZC
	sCc4tS8	9C
JY/@ YS Y$S S S 1 1d 1z\`O` 1lnlulu 1n =A(,!'	cS'\ 22T9c #Y%c 	c
 
c5j	cR -1(,!'NuNu Nu #Tk	Nu
 #Y%Nu Nu 
Nub ek;t#;t26s)d2B;t^a;t	;tD )-g&yyg& S%,,,,-g& #	g&
 #Y%g& 
u||d"ELL4$79LL	Mg&Z  %	5 #3i$. 	
 
c5j	:%K PS &@bii @8^ @cg @D$"d3:.> $"Gm $"rv $"L1i# 1i$ 1if@1 @1 @1D1s 1t 1.=C =		TXHX =dh =BhT+_#* +_ +_Zf5d
 f5t f5P5sTz 5d 56%9Nc d  8[S4Z 8[ 8[Y] 8[t%Ld
 %Ltd{ %LVZ %LR_S%Z( _edl _d _4	"c5<<#3E.E)F 3 *%#* % %(  $"'+!%%)(,/3*./3Hh*Hh tHh DIo$	Hh
 $JHh d
Hh T#Y%Hh DIo,Hh tCy4'Hh DIo,Hh 
HhX &7 #P
d
P
 P
 Tz	P

 *P
 
P
dC9s C9t C9J4 IMHL%/04:>ON+T#u*-==>EO $T#u*%5$6$=>EO 	O
 c?O &-O >*C/047O 
4=	 Ob%K WYW`W` 22h@;@CF@QUVY[`V`Qa@	@6	aC 	aD 	a	5T/%:_%L 	5QU 	5<T/%:_%L <Q`cgQg < 	8_(=(O 	8TX 	82t 2	,t 	,
 299 
 ell 
 t 
 rp  r   (  rD  r  r  r  r   r  r`  r)  r  r  rb  r/  r  r  collections.abcr   r   r   r   pathlibr   typingr	   r
   integrationsr   huggingface_hub.utilsri  r^  r  r@  safetensors.torchr[  r   torch.distributeddistributedr  huggingface_hubr   r   r   r   	packagingr   r   torch.utils.datar   r   r   r   r   r  r   configuration_utilsr   data.data_collatorr   r   r   debug_utilsr   r   !feature_extraction_sequence_utilsr   feature_extraction_utilsr    r  r!   r"   image_processing_utilsr#   integrations.deepspeedr$   r%   r&   r'   r(   integrations.fsdpr)   r*   integrations.ligerr+   integrations.neftuner,   r-   integrations.peftr.   integrations.tpur/   r0   r1   	modelcardr2   modeling_utilsr3   r4   models.auto.modeling_autor5   r6   optimizationr7   processing_utilsr8   tokenization_utils_baser9   trainer_callbackr:   r;   r<   r=   r>   r?   r@   rA   trainer_optimizerrB   rC   rD   rE   rE  rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   trainer_utilsrU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   training_argsrr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   utils.import_utilsr   utils.quantization_configr   r  r$  utils.notebookr   r  torch_xla.core.xla_modelcore	xla_modelr  torch_xla.debug.metricsr#  r  r  torch_xla.runtimeruntimer;  	torch_xlaXLA_VERSIONr  r:  torch_xla.distributed.spmdspmdr=  !smdistributed.modelparallel.torchmodelparallelr[  r   r   r   r  r   r   r   r   accelerate.stater   r}  r   r   r   r   r   r   r   r   r   accelerate.utils.memoryr   r   r*  
get_loggerr   r   r  r\  r2  rA  r  r4  rE  r   r  rp  r   <module>r     sp         	   
    7 7   %
 -      M M   c c  1 \ \ < G < ` 6  M 2 F / [ [ & 9 ( , <	 	 	     "       > K J       : ) 9 )) , 8 8))))"4*W]];7=7==I_;``//" 33[[:1
 
 
 ;> 			H	% ) ) $ & 
p@  p@ p@ rp  