
    鬜i&                     ~    d dl Z d dlZd dlZd dlmZ ddlmZmZmZm	Z	  e	j                  e      ZdefdZd Zd	dZy)
    N)
DataLoader   )WEIGHTS_NAMEPushToHubMixinis_torch_xla_availablelogging
dataloaderc                     t               r`dd lmc m} t	        | |j
                        sJ d       dd lmc m} |j                  |j                         d      }|| j                  d<   | S | S )Nr   zPThe dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`.)fsdpNinput_sharding)r   %torch_xla.distributed.parallel_loaderdistributedparallel_loader
isinstanceMpDeviceLoadertorch_xla.distributed.spmdspmdShardingSpecget_global_mesh_parallel_loader_kwargs)r	   plxssharding_specs       Y/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/integrations/tpu.pytpu_spmd_dataloaderr      sl    ::*b&7&78 	
^	
8 	0/(:(:(<nM?L
**+;<    c                 2   ddl mc m ddlmc m ddlm} 	 ddlm	 ddlm
 ddlm}m} rddlm d}d}t#        | d
d      }|j$                  j'                  d|      }	|j$                  d   dkD  r%t)        j*                  ||j$                  d         }nQ|	Ot-               }
|	D ])  } || |      }|t/        d      |
j1                  |       + t)        j*                  ||
      }|j2                  }|j$                  d   rD| j4                  j6                  r&t8        j;                  d       d| j4                  _        fd}rfd} | |||      } n | f||d|} di ffd	}|_        | S # t         $ r t!        d	      w xY w)a.  
    Wraps a model with XLA Fully Sharded Data Parallelism (FSDP).

    Handles both FSDP v1 (`XlaFullyShardedDataParallel`) and v2 (`SpmdFullyShardedDataParallel`),
    including auto-wrap policies, gradient checkpointing, and patching `xm.optimizer_step`.

    Args:
        model (`torch.nn.Module`): The model to wrap.
        args (`TrainingArguments`): The training arguments containing FSDP configuration.
        is_fsdp_xla_v2_enabled (`bool`): Whether FSDP v2 (SPMD) is enabled.

    Returns:
        `torch.nn.Module`: The FSDP-wrapped model.
    """
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.spmd as xs

    from ..trainer_pt_utils import get_module_class_from_name

    try:
        from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
        from torch_xla.distributed.fsdp import checkpoint_module
        from torch_xla.distributed.fsdp.wrap import (
            size_based_auto_wrap_policy,
            transformer_auto_wrap_policy,
        )

        if is_fsdp_xla_v2_enabled:
            from torch_xla.experimental.spmd_fully_sharded_data_parallel import (
                SpmdFullyShardedDataParallel as FSDPv2,
            )
    except ImportError:
        raise ImportError("Missing XLA FSDP related module; please make sure to use torch-xla >= 2.0.")

    auto_wrap_policy = None
    auto_wrapper_callable = None

    default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None)
    fsdp_transformer_layer_cls_to_wrap = args.fsdp_config.get(
        "transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap
    )

    if args.fsdp_config["min_num_params"] > 0:
        auto_wrap_policy = functools.partial(
            size_based_auto_wrap_policy, min_num_params=args.fsdp_config["min_num_params"]
        )
    elif fsdp_transformer_layer_cls_to_wrap is not None:
        transformer_cls_to_wrap = set()
        for layer_class in fsdp_transformer_layer_cls_to_wrap:
            transformer_cls = get_module_class_from_name(model, layer_class)
            if transformer_cls is None:
                raise Exception("Could not find the transformer layer class to wrap in the model.")
            transformer_cls_to_wrap.add(transformer_cls)

        auto_wrap_policy = functools.partial(
            transformer_auto_wrap_policy,
            # Transformer layer class to wrap
            transformer_layer_cls=transformer_cls_to_wrap,
        )

    fsdp_kwargs = args.xla_fsdp_config
    if args.fsdp_config["xla_fsdp_grad_ckpt"]:
        if model.config.use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            model.config.use_cache = False

        # Apply gradient checkpointing to auto-wrapped sub-modules if specified.
        def auto_wrapper_callable(m, *args, **kwargs):
            target_cls = FSDP if not is_fsdp_xla_v2_enabled else FSDPv2
            return target_cls(checkpoint_module(m), *args, **kwargs)

    if is_fsdp_xla_v2_enabled:

        def shard_output(output, mesh):
            from ..modeling_outputs import CausalLMOutputWithPast

            real_output = None
            if isinstance(output, torch.Tensor):
                real_output = output
            elif isinstance(output, tuple):
                real_output = output[0]
            elif isinstance(output, CausalLMOutputWithPast):
                real_output = output.logits

            if real_output is None:
                raise ValueError("Something went wrong, the output of the model shouldn't be `None`")
            xs.mark_sharding(real_output, mesh, ("fsdp", None, None))

        model = FSDPv2(
            model,
            shard_output=shard_output,
            auto_wrap_policy=auto_wrap_policy,
            auto_wrapper_callable=auto_wrapper_callable,
        )
    else:
        model = FSDP(
            model,
            auto_wrap_policy=auto_wrap_policy,
            auto_wrapper_callable=auto_wrapper_callable,
            **fsdp_kwargs,
        )

    # Patch `xm.optimizer_step` so it does not reduce gradients here, as FSDP does
    # not need gradient reduction over sharded parameters.
    def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}):
        loss = optimizer.step(**optimizer_args)
        if barrier:
            xm.mark_step()
        return loss

    xm.optimizer_step = patched_optimizer_step

    return model
}t        j                  j                  |d|j$                   d|j&                   dt(               }	|j                  ||	d       |j                  d       |j*                  rddlm}
  |
t        j                  j                  |d      dt(         d      \  }}| j0                  j0                  } |j3                  |       }t5        ||      r|j7                  ||       nat        j                  d       |j                  |t        j                  j                  |t(                     nt5        | |      st5        |j3                  |       |      rK|j3                  |       j7                  ||j*                  |j9                  | j!                                      nt        j                  d       |j9                  | j!                               }|j                  |t        j                  j                  |t(                     n;| j7                  ||j*                  |j9                  | j!                                      ||j*                  r|j7                  |       yyy)a  
    Saves a model checkpoint on TPU/XLA devices.

    Handles FSDP v1 sharded checkpoints (with consolidation on master), as well as
    standard XLA model saving via `save_pretrained` or `xm.save`.

    Args:
        model (`torch.nn.Module`): The model to save.
        args (`TrainingArguments`): The training arguments.
        accelerator (`Accelerator`): The accelerator instance.
        processing_class: The processing class (tokenizer/processor) to save alongside the model.
        is_fsdp_xla_v1_enabled (`bool`): Whether FSDP XLA v1 is enabled.
        output_dir (`str`, *optional*): The directory to save to. Defaults to `args.output_dir`.
    """
    import torch_xla.core.xla_model as xm

    output_dir = output_dir if output_dir is not None else args.output_dir
    logger.info(f"Saving model checkpoint to {output_dir}")
    xm.mark_step()

    if xm.is_master_ordinal(local=False):
        os.makedirs(output_dir, exist_ok=True)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))

    # Save a trained model and configuration using `save_pretrained()`;
    # it can then be reloaded with `from_pretrained()`.
    supported_classes = (PushToHubMixin,)
    xm.rendezvous("saving_checkpoint")
    if is_fsdp_xla_v1_enabled:
        ckpt = {
            "model": model.state_dict(),
            "shard_metadata": model.get_shard_metadata(),
        }
        ckpt_path = os.path.join(output_dir, f"rank{args.process_index}-of-{args.world_size}-{WEIGHTS_NAME}")
        # All ranks save their sharded checkpoint.
        xm.save(ckpt, ckpt_path, master_only=False)
        # Make sure all ranks have saved their checkpoints.
        xm.rendezvous("save_full_checkpoints")
        # The master rank consolidates the shards into a full checkpoint.
        if args.should_save:
            from torch_xla.distributed.fsdp import consolidate_sharded_model_checkpoints

            full_state_dict, _ = consolidate_sharded_model_checkpoints(
                ckpt_prefix=os.path.join(output_dir, ""),
                ckpt_suffix=f"rank*-of-*-{WEIGHTS_NAME}",
                save_model=False,
            )
            model = model.module.module
            unwrapped_model = accelerator.unwrap_model(model)
            if isinstance(unwrapped_model, supported_classes):
                unwrapped_model.save_pretrained(output_dir, state_dict=full_state_dict)
            else:
                logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
                xm.save(full_state_dict, os.path.join(output_dir, WEIGHTS_NAME))
    elif not isinstance(model, supported_classes):
        if isinstance(accelerator.unwrap_model(model), supported_classes):
            accelerator.unwrap_model(model).save_pretrained(
                output_dir,
                is_main_process=args.should_save,
                state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
            )
        else:
            logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
            state_dict = xm._maybe_convert_to_cpu(model.state_dict())
            xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
    else:
        model.save_pretrained(
            output_dir,
            is_main_process=args.should_save,
            state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
        )

    if processing_class is not None and args.should_save:
        processing_class.save_pretrained(output_dir)
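

# A hedged end-to-end sketch of the save path (`wrapped_model`, `training_args`,
# `accelerator`, and `tokenizer` are assumptions): under FSDP v1 every process must call
# `save_tpu_checkpoint`, since each rank first writes its own
# `rank{i}-of-{n}-pytorch_model.bin` shard and only then does the master consolidate them:
#
#     save_tpu_checkpoint(
#         model=wrapped_model,
#         args=training_args,
#         accelerator=accelerator,
#         processing_class=tokenizer,
#         is_fsdp_xla_v1_enabled=True,
#         output_dir="checkpoint-500",  # falls back to `args.output_dir` when omitted
#     )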