
    謜i                         d dl Z d dlZd dlZddlmZ ddlmZ ddlmZ  ej                  e
      Z G d d      Z G d d	e      Zy)
    N   )TrainerCallback)PREFIX_CHECKPOINT_DIR)loggingc                   2    e Zd ZddefdZd Zd Zd Zd Zy)	CheckpointManager	kill_waitc                 <    || _         d| _        d| _        || _        y)aD  
        Initialize the CheckpointManager for Just-In-Time checkpoint handling.

        Args:
            trainer: The Trainer instance that will be used to save checkpoints when SIGTERM is received.
            kill_wait (`int`, *optional*, defaults to 3): Grace period to distinguish between SIGTERM and SIGKILL.
        FN)traineris_checkpoint_requested_original_sigterm_handlerr	   )selfr   r	   s      _/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/trainer_jit_checkpoint.py__init__zCheckpointManager.__init__   s"     ',$)-&"    c                     t        j                   t         j                  | j                        | _        t        j                  d       y )Nz4JIT checkpoint signal handler registered for SIGTERM)signalSIGTERM_sigterm_handlerr   loggerinfor   s    r   setup_signal_handlerz&CheckpointManager.setup_signal_handler   s,    )/v~~tG\G\)]&JKr   c                     | j                   ry t        j                  d| j                   d       t	        j
                  | j                  | j                        j                          y )Nz4SIGTERM received, will request JIT checkpoint after s)r   r   r   r	   	threadingTimer_enable_checkpointstart)r   signumframes      r   r   z"CheckpointManager._sigterm_handler   sK    ''J4>>JZZ[\](?(?@FFHr   c                 <    t         j                  d       d| _        y )Nz/Kill wait period elapsed, requesting checkpointT)r   r   r   r   s    r   r   z$CheckpointManager._enable_checkpoint&   s    EF'+$r   c                    	 d| _         t        j                  d       | j                  j                  j
                  }t        j                  d|        | j                  j                  d       }t         d| }t        j                  j                  ||      }t        j                  |d       t        j                  j                  ||d      }t        |d	      5 }|j                  d
| d       d d d        t        j                  d|        | j                  j                  | j                  j                  d        t        j                  j!                  |      r*t        j"                  |       t        j                  d       t        j                  d       y # 1 sw Y   xY w# t$        $ r}t        j'                  d|         d }~ww xY w)NFzStarting JIT checkpointing...zSaving JIT checkpoint at step )trial-T)exist_okzcheckpoint-is-incomplete.txtwzCheckpoint started at step z and in progress...z2Created checkpoint progress sentinel marker file: zSentinel marker file removedz/Immediate JIT checkpoint completed successfullyzFailed to save JIT checkpoint: )r   r   r   r   stateglobal_step_get_output_dirr   ospathjoinmakedirsopenwrite_save_checkpointmodelexistsremove	Exceptionerror)r   current_step
output_dircheckpoint_foldercheckpoint_pathsentinel_filefes           r   execute_jit_checkpointz(CheckpointManager.execute_jit_checkpoint*   s   !	+0D(KK78<<--99LKK8GH55D5AJ#8"9<. I ggll:7HIO KK$7 GGLL5FHfgMmS) YQ5l^CVWXYKKL]O\] LL))$,,*<*<D)I ww~~m,		-(:;KKIJY Y  	LL:1#>?	s1   CF3 !F'7B/F3 'F0,F3 3	G<GGN)   )	__name__
__module____qualname__intr   r   r   r   r>    r   r   r   r      s$    #3 #LI,"r   r   c                   :    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
y	)
JITCheckpointCallbackaN  
    Callback for Just-In-Time checkpointing on SIGTERM signals.

    When SIGTERM is received, the checkpoint manager sets `is_checkpoint_requested=True`.
    The callbacks detect this flag and set `control.should_training_stop=True`, which signals
    the Trainer's training loop to exit gracefully after saving the checkpoint.
    c                      d | _         d | _        y )N)r   jit_managerr   s    r   r   zJITCheckpointCallback.__init__X   s    59r   c                     || _         |j                  j                  rAt        |      | _        | j                  j                          t        j                  d       y y )N)r   zJIT checkpointing enabled)r   argsenable_jit_checkpointr   rH   r   r   r   )r   r   s     r   set_trainerz!JITCheckpointCallback.set_trainer\   sG    <<--0AD113KK34 .r   c                     | j                   r9| j                   j                  r"d|_        | j                   j                          y y y NTrH   r   should_training_stopr>   r   rJ   r(   controlkwargss        r   on_pre_optimizer_stepz+JITCheckpointCallback.on_pre_optimizer_stepc   <     0 0 H H+/G(335 !Ir   c                     | j                   r9| j                   j                  r"d|_        | j                   j                          y y y rN   rO   rQ   s        r   on_step_beginz#JITCheckpointCallback.on_step_beginh   rU   r   c                     | j                   r@| j                   j                  r)d|_        d|_        | j                   j	                          y y y NFTrH   r   should_saverP   r>   rQ   s        r   on_step_endz!JITCheckpointCallback.on_step_endm   D     0 0 H H"'G+/G(335 !Ir   c                     | j                   r@| j                   j                  r)d|_        d|_        | j                   j	                          y y y rY   rZ   rQ   s        r   on_epoch_endz"JITCheckpointCallback.on_epoch_ends   r]   r   c                     | j                   re| j                   j                  Nt        j                  t        j                  | j                   j                         t        j                  d       y y y )Nz;Restored original SIGTERM handler after training completion)rH   r   r   r   r   r   rQ   s        r   on_train_endz"JITCheckpointCallback.on_train_endy   sP     0 0 J J VMM&..$*:*:*T*TUKKUV !Wr   N)r@   rA   rB   __doc__r   rL   rT   rW   r\   r_   ra   rD   r   r   rF   rF   O   s+    :56
6
66Wr   rF   )r+   r   r   trainer_callbackr   trainer_utilsr   utilsr   
get_loggerr@   r   r   rF   rD   r   r   <module>rg      sD    	   - 0  
		H	%? ?D.WO .Wr   