
    鬜ieB                        d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	  e	       rd dl
Z
 ej                  e      Z	 	 d d	e
j                  d
e
j                  de
j                  dz  dede
j                  f
dZde
j"                  j$                  de
j                  de
j                  de
j                  de
j                  f
dZ	 d!d	e
j                  d
e
j                  de
j                  dz  de
j                  fdZ	 	 	 d"d	e
j                  d
e
j                  de
j                  dz  de
j                  dz  dede
j                  fdZde
j"                  j$                  de
j                  de
j                  de
j                  de
j                  f
dZ G d de      Z e       Zde
j                  de
j                  fdZ	 d!ddddee
j"                  j$                     dz  dededee
j"                  j$                     fdZy)#    )Callable)wraps   )logging)GeneralInterface)is_torch_availableNFinputweightbiasis_transposedreturnc                     |r5t        j                  | j                  d      |      j                  d      }n4t        j                  || j                  d            j                  d      }|||z   }|S )a  Batched linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (batch_size, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
            else of shape (batch_size, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (batch_size, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
       )torchbmm	unsqueezesqueeze)r	   r
   r   r   outs        Y/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/integrations/moe.py_batched_linearr   D   se    * ii*F3;;A> ii 34<<R@DjJ    selfhidden_statestop_k_indextop_k_weightsc                 >   |j                   }|j                  d      }|j                  d      }|j                  d      }t        j                  ||      j	                  d      j                  d|      j                  d      }|j                  d      }	|j                  d      }
|
| j                  k  }|
j                  d| j                  dz
        }||   }| j                  |   }| j                  |   }| j                  r| j                  |   nd }| j                  r| j                  |   nd }t        |||| j                        }| j!                  |      }t        |||| j                        }|	j"                  |j"                  k7  r|	j%                  d|      }	||	j	                  d      z  }||j	                  d      j'                  |j(                        z  }|j+                  |||      j-                  d      }|j'                  |j(                        S )Nr   r   devicer   r   dim)r   sizer   aranger   expandreshapenum_expertsclampgate_up_proj	down_projhas_biasgate_up_proj_biasdown_proj_biasr   r   _apply_gateshapegathertodtypeviewsum)r   r   r   r   r   	num_top_k
num_tokens
hidden_dim	token_idxsample_weights
expert_ids
valid_maskexpert_ids_clampedselected_hidden_statesselected_gate_upselected_downselected_gate_up_biasselected_down_biasgate_up_out	gated_outout_per_samplefinal_hidden_statess                         r   batched_mm_experts_forwardrF   f   s    !!F  $I##A&J##B'J Z7AA!DKKBPYZbbcefI"**2.N$$R(J d...J#))!T-=-=-AB +95 (();<NN#56MJN--D223EF]aDHMM,,-?@W[ " 02GW[WiWiK
   -I %="4DDVDVN
 1777'..q2DE#n&>&>r&BBN#j&:&:2&>&A&A.BVBV&WWN )--j)ZPTTYZT[!!-"5"566r   offsc                 l   t        t        j                  j                  d      rEt        j                  j                  j	                  | j                  |j                        ||      S t        t        d      r1t        j                  | j                  |j                        ||      S t        d      )a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.
    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, output_dim, input_dim).
        offs (`torch.Tensor`, *optional*):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    Raises:
        ImportError: If neither `torch.nn.functional.grouped_mm` nor `torch._grouped_mm` is available, indicating that the PyTorch version is incompatible.
    
grouped_mmrG   _grouped_mmzNeither torch.nn.functional.grouped_mm nor torch._grouped_mm is available. Please make sure you are using a PyTorch version that includes grouped_mm (2.9+).)	hasattrr   nn
functionalrI   r1   r2   rK   ImportError)r	   r
   rG   s      r   rK   rK      s    , uxx""L1xx""--ehhv||.DfSW-XX		&  &,,!7dKK`
 	
r   c                 r    |rt        | ||      }nt        | |j                  dd      |      }|||z   }|S )a*  Grouped linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, output_dim, input_dim) if transposed is `False`,
            else of shape (num_experts, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (num_experts, output_dim). Default is `None`.
        offs (`torch.Tensor`, *optional*):
            Offsets tensor indicating the boundaries of each group in the input tensor.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    rJ   r   )rK   	transpose)r	   r
   r   rG   r   r   s         r   _grouped_linearrS      sF    0 %d3 %!1!1"b!9EDjJr   c                    |j                   }|j                  d      }|j                  d      }|j                  d      }t        j                  ||      j	                  d      j                  d|      j                  d      }|j                  d      }	|j                  d      }
||   }t        j                  |
      }t        j                  |      }|
|   }|	|   }||   }| j                  }| j                  }| j                  r| j                  |   nd }| j                  r| j                  |   nd }|j                  dk(  r|j                         n|j                         }t        j                   || j"                  d| j"                  dz
        }t        j$                  |dt        j&                        }t)        ||||| j*                        }| j-                  |      }t)        ||||| j*                        }||j	                  d      z  }||   }|j/                  |||      j1                  d	      }|j3                  |j4                        S )
Nr   r   r   r   cpu)binsminmax)r"   r2   r    r!   )r   r#   r   r$   r   r%   r&   argsortr)   r*   r+   r,   r-   typefloatinthistcr'   cumsumint32rS   r   r.   r3   r4   r1   r2   )r   r   r   r   r   r5   r6   r7   r8   r9   r:   r=   perminv_permexpert_ids_gsample_weights_gselected_hidden_states_gr>   r?   r@   rA   histc_inputnum_tokens_per_expertoffsetsrB   rC   out_per_sample_grD   rE   s                                r   grouped_mm_experts_forwardri      sE    !!F  $I##A&J##B'J Z7AA!DKKBPYZbbcefI"**2.N$$R(J +95 ==$D}}T"Hd#L%d+5d; ((NNMDHMMD22<@W[>Bmm,,\:QU
 +1++*>,$$&LDTDTDVK!KK$:J:JPQW[WgWgjkWklll0au{{KG " "24I7bfbtbtK
   -I '="4gTM_M_
 (*:*D*DR*HH &h/N )--j)ZPTTYZT[!!-"5"566r   c                   :     e Zd ZdZeedZdededef fdZ	 xZ
S )ExpertsInterfacez9Interface for registering custom experts implementations.)
batched_mmrI   experts_implementationdefaultr   c                     |t         j                  d       n|dk7  r|| vrt        d| d      t        |   ||      S )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.a
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r   rm   rn   	__class__s      r   get_interfacezExpertsInterface.get_interface4  s`    !)N
 $w.3IQU3U*++wx  w{17;;r   )__name__
__module____qualname____doc__rF   ri   _global_mappingstrr   rx   __classcell__)rw   s   @r   rk   rk   ,  s4    C 10O
<C <( <x < <r   rk   rB   c                 V    |j                  dd      \  }}| j                  |      |z  S )a  
    Default gating mechanism: splits the gate_up_out into gate and up parts,
    applies the activation function to the gate part, and multiplies it with the up part.
    Args:
        gate_up_out (`torch.Tensor`):
            The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
    Returns:
        `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
    r   r   r!   )chunkact_fn)r   rB   gateups       r   _default_apply_gater   F  s1        +HD";;tr!!r   )r   r+   experts_classr+   c                    dt         t        j                  j                     dt         t        j                  j                     ffd}|  ||       S |S )aV  Decorator to modify experts class to support different experts implementations.

    Args:
        experts_class (`type[torch.nn.Module]`, *optional*):
            The experts class to modify. If not provided, returns a decorator that can be applied to the class.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the expert weights are stored in transposed format.
        has_bias (`bool`, *optional*, defaults to `False`):
            Whether the expert layers include bias terms.

    Returns:
        `type[torch.nn.Module]`: The modified experts class.
    r   r   c                     | j                   | j                  t              fd       }t              fd       }t        | d      st        | _        || _         || _        | S )Nc                 J     | |g|i | || _         | _        | _        y N)configr+   r   )r   r   argskwargsr+   r   original_inits       r   __init__z=use_experts_implementation.<locals>.wrapper.<locals>.__init__i  s-    $888 DK$DM!.Dr   c                 p    t         j                  | j                  j                        } || g|i |S r   )ALL_EXPERTS_FUNCTIONSrx   r   _experts_implementation)r   r   r   experts_forwardoriginal_forwards       r   forwardz<use_experts_implementation.<locals>.wrapper.<locals>.forwardp  s:    3AA335EO #49$9&99r   r.   )r   r   r   rL   r   r.   )r   r   r   r   r   r+   r   s      @@r   wrapperz+use_experts_implementation.<locals>.wrappere  su    %..(00	}		/ 
	/ 
	 	: 
!	: }m4(;M%!) 'r   )rZ   r   rM   Module)r   r   r+   r   s    `` r   use_experts_implementationr   T  sH    "tEHHOO4 ehhoo9N 0  }%%Nr   )NFr   )NNF)collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr   r   
get_loggerry   rr   Tensorboolr   rM   r   rF   rK   rS   ri   rk   r   r   rZ   r    r   r   <module>r      sU   %   , 3 			H	%Z !%	<<LL ,,
 	
 \\D67
((//67<<67 67 <<	67
 \\67x !%
<<
LL
 ,,

 \\	
H !% $#<<#LL# ,,
# ,,
	#
 # \\#LC7
((//C7<<C7 C7 <<	C7
 \\C7L<' <. )* "5<< "ELL " 37,QVin,(4/,JN,bf,	%((//,r   