
    6i-                         d dl mZ ddlmZ erddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZ  e	       r
d dlZdd	lmZ  ej"                  e      ZdZ G d
 de      Zy)    )TYPE_CHECKING   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameN)WeightConverterc                        e Zd ZdZdZ fdZd Zd Zddded	e	fd
Z
ddZ	 dddde	fdZd Zd Zd Zd Zed	e	fd       Zd Zd Z xZS )Mxfp4HfQuantizerz/
    FP4 quantization using fbgemm kernels
    Fc                 4    t        |   |fi | d | _        y N)super__init__triton_kernels_hub)selfquantization_configkwargs	__class__s      c/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   zMxfp4HfQuantizer.__init__0   s    ,77"&    c                     | j                    	 ddlm}  |d      | _         | j                   S | j                   S # t        $ r t        d      w xY w)z3Lazy import and initialize kernels only when neededr   )
get_kernelz(kernels-community/gpt-oss-triton-kernelsz2kernels package is required for MXFP4 quantization)r   integrations.hub_kernelsr   ImportError)r   r   s     r   _lazy_import_kernelsz%Mxfp4HfQuantizer._lazy_import_kernels4   s]    ""*XA*45_*`' &&&t&&&  X!"VWWXs	   9 Ac                 r   t               st        d      | j                  j                  ry t        j
                  j                         s\t        j                  j                         s>| j                  r't        j                  d       d| j                  _        y t        d      t               st        d      t        j                  j                         rd}t        d      xr
 t               }n:t        j
                  j                         }|dk\  }t        d      xr
 t               }| j                  rR|s't        j                  d	       d| j                  _        y |sAt        j                  d
       d| j                  _        y |st!        d      |st!        d      | j                  s| j#                          |j%                  d      }|t        j                  d       y t'        |t(              r=| j                  s0d|j+                         v sd|j+                         v rt!        d      y y y )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z^Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16Tz-Quantizing a model using MXFP4 requires a GPUz9Using mxfp4 requires Accelerate: `pip install accelerate`z3.5.0)      z3.4.0u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) We will default to dequantizing the model to bf16.zMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) zuMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0
device_mapzYou have loaded an FP4 model on CPU and have a CUDA/XPU device available, make sure to set your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or device_map = 'xpu'. cpudiskzYou are attempting to load an FP4 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r
   r   r   
dequantizetorchcudais_availablexpupre_quantizedloggerwarning_onceRuntimeErrorr   r   r	   get_device_capability
ValueErrorr    get
isinstancedictvalues)r   argsr   gpu_is_supportedkernels_availablecompute_capabilityr$   s          r   validate_environmentz%Mxfp4HfQuantizer.validate_environment?   s   !#] 
 ##..zz&&(1G1G1I!!##t 7;((3"#RSS&(YZZ99!!## 3G < WAUAW!&!A!A!C1V; 3G < WAUAW###I 7;((3$##  7;((3! r  # H  !!%%'ZZ-
V 
D)%%5J4E4E4G+G6U_UfUfUhKh n  Li% *r   modelr   
param_namereturnc                 R    ddl m} t        ||      \  }}t        ||      r|dv ryyy)Nr   Mxfp4GptOssExperts)down_proj_biasgate_up_proj_biasFT)integrationsr@   r   r3   )r   r;   r<   r   r@   moduletensor_names          r   param_needs_quantizationz)Mxfp4HfQuantizer.param_needs_quantization   s3    525*Ef01EEr   c                     t         j                  j                         rt         j                  j                          y t         j                  j                         rt         j                  j                          y y r   )r(   r)   r*   empty_cacher+   )r   r;   r   s      r   #_process_model_after_weight_loadingz4Mxfp4HfQuantizer._process_model_after_weight_loading   sG    ::""$JJ""$YY##%II!!# &r   use_kernelsc                 
   ddl m} |r&t        j                  d       d| j                  _        | j                  || j                  j                  |j                        | _         ||| j                  | j                        }y )Nr   )replace_with_mxfp4_linearzYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseT)modules_to_not_convertr   )	rC   rL   r-   r.   r   r'   get_modules_to_not_convertrM   _keep_in_fp32_modules)r   r;   rJ   r   rL   s        r   $_process_model_before_weight_loadingz5Mxfp4HfQuantizer._process_model_before_weight_loading   s|     	= e 37D$$/&*&E&E4++BBED_D_'
# *$*E*E[_[s[s
r   c                     d|j                   j                  v r-t        |dd        |j                  j	                  ddddd       |S )NGptOssConfigbase_model_tp_plangrouped_gemmz(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrrS   updater   configs     r   update_tp_planzMxfp4HfQuantizer.update_tp_plan   R    V--666v3T:F))00DRDRAOAO	 r   c                     d|j                   j                  v r-t        |dd        |j                  j	                  ddddd       |S )NrR   base_model_ep_planrT   rU   )r   rV   rW   r^   rX   rY   s     r   update_ep_planzMxfp4HfQuantizer.update_ep_plan   r\   r   c                    ddl m} |j                         }t        |j                  dd      }t        |j                  dd      }|j                         D ]  \  }}t        ||      st        |d      s!t        |d      s.|j                  j                  j                  j                  |j                  j                  j                        j                  d	d
      j                  |d	dd      || d<   |j                  j                   j                  j                  j                  |j                  j                   j                  j                        j                  d	d
      || d<   |j"                  j                  j                  j                  |j"                  j                  j                        j                  d	d
      j                  ||dd	      || d<   |j$                  j                   j                  j                  j                  |j$                  j                   j                  j                        j                  d	d
      || d<    i }||fS )Nr   r?   num_local_experts    hidden_sizei@  gate_up_proj	down_projZ      z.gate_up_proj_blocksz.gate_up_proj_scalesz.down_proj_blocksz.down_proj_scales)rC   r@   
state_dictrW   rZ   named_modulesr3   hasattrrd   storagelayoutunswizzle_datadata	transposereshapegate_up_proj_precision_configweight_scalere   down_proj_precision_config)	r   r;   r@   rj   ra   rc   namerD   metadatas	            r   get_state_dict_and_metadataz,Mxfp4HfQuantizer.get_state_dict_and_metadata   s   5%%'
 $ELL2ErJellM4@!//1 	LD&6#56FN3FK0 ''//66EEfFYFYFaFaFfFfgYr2&W.B; dV#789 88EEMMTTcc<<IIQQVViB' dV#789 $$,,33BB6CSCSC[C[C`C`aYr2&W.RD dV#456 55BBJJQQ``99FFNNSSiB' dV#456+	6 8##r   c                      y)NT r   s    r   is_serializablez Mxfp4HfQuantizer.is_serializable   s    r   c                 .    t         j                  d       y)NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r-   r.   r{   s    r   is_trainablezMxfp4HfQuantizer.is_trainable   s     x	
 r   c                     ddl m}  ||       S )Nr   )Mxfp4Quantize)integrations.mxfp4r   )r   r   s     r   get_quantize_opsz!Mxfp4HfQuantizer.get_quantize_ops   s    6T""r   c                     ddl m}m} | j                  rF| j                  j
                  rt        ddgd ||       g      gS t        ddgd ||       g      gS g S )Nr   )Mxfp4DequantizeMxfp4Deserialize_blocks_scales )source_patternstarget_patterns
operations)r   r   r   r,   r   r'   r   )r   r   r   s      r   get_weight_conversionsz'Mxfp4HfQuantizer.get_weight_conversions  sv    J''22#)2I(>(*$3D$9#:  $)2I(>(*$4T$:#;  	r   )r;   r   )F)rV   
__module____qualname____doc__requires_calibrationr   r    r:   strboolrF   rI   rP   r[   r_   rx   r|   propertyr~   r   r   __classcell__)r   s   @r   r   r   )   s     !'	'IV.? S _c $ "
 
 
0%$N d  #
r   r   )typingr   baser   modeling_utilsr   utilsr   r	   r
   r   r   quantizers_utilsr   r(   core_model_loadingr   
get_loggerrV   r-   r   r   rz   r   r   <module>r      sX    !  0  3 4			H	% n{ nr   