
    6i                         d dl mZ ddlmZmZmZmZ ddlmZ ddl	m
Z
  e       rd dlZerddlmZ  ej                  e      Z G d	 d
e      Zy)    )TYPE_CHECKING   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                        e Zd ZdZdZ fdZd Zdddedefd	Z	ddded
dde
f fdZ	 	 ddZd Zd Zedefd       Zd Zd Z xZS )FineGrainedFP8HfQuantizerz
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    Fc                 &    t        |   |fi | y )N)super__init__)selfquantization_configkwargs	__class__s      m/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr   z"FineGrainedFP8HfQuantizer.__init__   s    ,77    c                 *   t               st        d      | j                  j                  ry t        j
                  j                         sHt               s>| j                  r't        j                  d       d| j                  _        y t        d      t        j
                  j                         r`t        j
                  j                         }|\  }}|dk  s
|dk(  r3|dk  r.t        j                  d| d| d	       d| j                  _        y |j                  d
      }|t        j                  d       y t        |t              rJ| j                  s t!        |      dkD  rd|j#                         v sd|j#                         v rt%        d      y y )NzMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)zUsing FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is availableTzANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.z`. We will default to dequantizing the model to bf16. Feel free to use a different quantization method like bitsandbytes or torchao
device_mapzYou have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set your model on a GPU or XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu'. r	   cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   
dequantizetorchcudais_availabler   pre_quantizedloggerwarning_onceRuntimeErrorget_device_capabilityget
isinstancedictlenvalues
ValueError)r   argsr   compute_capabilitymajorminorr   s          r   validate_environmentz.FineGrainedFP8HfQuantizer.validate_environment   s{   &(mnn##..zz&&(1G1I!!## [ 7;((3"#fgg::""$!&!A!A!C-LE5	uzeai####('5' 2Z[
 7;((3ZZ-
6
 
D)&&
Oa'Z..00Z..00 k  1 *r   modelr   
param_namereturnc                 t    ddl m}m} t        ||      \  }}t	        |||f      r| j
                  s|dk(  ryyy)Nr   )	FP8Expert	FP8LinearbiasFT)integrations.finegrained_fp8r8   r9   r   r*   r$   )r   r4   r5   r   r8   r9   moduletensor_names           r   param_needs_quantizationz2FineGrainedFP8HfQuantizer.param_needs_quantizationM   s>    G25*Efy)45!![F%:r   paramztorch.Tensorc                 L    | j                  ||      ryt        | 	  |||      S )z4Return the element size (in bytes) for `param_name`.r	   )r>   r   param_element_size)r   r4   r5   r?   r   s       r   rA   z,FineGrainedFP8HfQuantizer.param_element_sizeX   s*    ((
;w)%UCCr   c                     ddl m} | j                  || j                  j                  |j
                        | _         ||| j                  | j                  | j                        }y )Nr   )replace_with_fp8_linear)modules_to_not_convertr   r$   )r;   rC   get_modules_to_not_convertr   rD   _keep_in_fp32_modulesr$   )r   r4   r   rC   s       r   $_process_model_before_weight_loadingz>FineGrainedFP8HfQuantizer._process_model_before_weight_loading_   s^    
 	K&*&E&E4++BBED_D_'
# (#'#>#> $ 8 8,,	
r   c                 f    d|j                   j                  v rddddddddddddddd}||_        |S )NQwen3colwiserowwise)z layers.*.self_attn.q_proj.weightz*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightz*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_inv)r   __name__base_model_tp_plan)r   config	text_plans      r   update_tp_planz(FineGrainedFP8HfQuantizer.update_tp_planq   sV    f&&///4=>G4=>G4=>G4=>G1:;D/89B1:;DI" )2F%r   c                      y)NT r   s    r   is_serializablez)FineGrainedFP8HfQuantizer.is_serializable   s    r   c                      y)NFrR   rS   s    r   is_trainablez&FineGrainedFP8HfQuantizer.is_trainable   s    r   c                     ddl m}  ||       S )Nr   )Fp8Quantize)r;   rX   )r   rX   s     r   get_quantize_opsz*FineGrainedFP8HfQuantizer.get_quantize_ops   s    >4  r   c                     ddl m} ddlm} | j                  r+| j
                  j                  r |g dd ||       g      gS g S )Nr   )WeightConverter)Fp8Dequantize)zweight$weight_scale_invactivation_scaleweight)source_patternstarget_patterns
operations)core_model_loadingr[   r;   r\   r$   r   r    )r   r[   r\   s      r   get_weight_conversionsz0FineGrainedFP8HfQuantizer.get_weight_conversions   sK    8@$":":"E"E  $W$, -d 34  	r   )r4   r   )rL   
__module____qualname____doc__requires_calibrationr   r3   strboolr>   floatrA   rG   rP   rT   propertyrV   rY   rd   __classcell__)r   s   @r   r   r      s    
 !8/b	.? 	S 	_c 	D(9 Ds DSa Dfk D
 
$. d  !
r   r   )typingr   utilsr   r   r   r   baser
   quantizers_utilsr   r!   modeling_utilsr   
get_loggerrL   r%   r   rR   r   r   <module>rt      sE      ` `  2 0			H	%Q Qr   