
    謜iU                        d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
  ej                  e      ZdZe G d	 d
             Z	 ddej                   dz  dej"                  ez  ez  dej                   dededz  f
dZ	 ddej                   dz  dej"                  ez  ez  dej                   dededz  f
dZddej                   dej.                  dedz  fdZddej                   dej.                  dedz  fdZ	 	 ddej"                  ez  ez  dej.                  dej4                  dededz  dej                   dz  fdZy)a  
IMPORTANT NOTICE: Every class and function in this file is deprecated in favor of using the much more general
`masking_utils.py` primitives. New code should not rely on it, it is only kept for backward compatibility for now,
and will be removed in the future.
    )	dataclass)UnionN   )logging)is_torchdynamo_compiling
is_tracingzThe attention mask API under `transformers.modeling_attn_mask_utils` (`AttentionMaskConverter`) is deprecated and will be removed in Transformers v5.10. Please use the new API in `transformers.masking_utils`.c                   ^   e Zd ZU dZeed<   eed<   ddededz  fdZ	 ddededed	ej                  d
e
ej                  df   dej                  dz  fdZ	 ddej                  ded	ej                  dedz  dej                  f
dZe	 	 ddej                   d	ej                  d
ej                  dededz  f
d       Zeddej                  d	ej                  dedz  fd       Zedej&                  defd       Ze	 	 d dej                  dz  dej                  dededz  dedefd       Zy)!AttentionMaskConvertera9  
    A utility attention mask class that allows one to:
        - Create a causal 4d mask
        - Create a causal 4d mask with slided window
        - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
          key_value_length) that can be multiplied with attention scores

    Examples:

    ```python
    >>> import torch
    >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter

    >>> converter = AttentionMaskConverter(True)
    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
    tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00]]]])
    ```

    Parameters:
        is_causal (`bool`):
            Whether the attention mask should be a uni-directional (causal) or bi-directional mask.

        sliding_window (`int`, *optional*):
            Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer.
    	is_causalsliding_windowNc                     t         j                  t        t               || _        || _        | j
                  )| j
                  dk  rt        d| j
                   d      y y )Nr   zaMake sure that when passing `sliding_window` that its value is a strictly positive integer, not ``)loggerwarning_onceDEPRECATION_MESSAGEFutureWarningr   r   
ValueError)selfr   r   s      a/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/modeling_attn_mask_utils.py__init__zAttentionMaskConverter.__init__I   sy    /?",*t/B/Ba/Gstx  uH  uH  tI  IJ  K  0H*    
batch_sizequery_lengthkey_value_lengthdtypedevicestrreturnc                     | j                   st        d| j                   d      ||f}||z
  }d}|d   dkD  s| j                   | j	                  ||||| j                        }|S )z
        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
        bias to upper right hand triangular matrix (causal mask).
        z"Please use `to_causal_4d` only if z has `is_causal` set to True.Nr   r   past_key_values_lengthr   )r   r   	__class__r   _make_causal_mask)	r   r   r   r   r   r   input_shaper"   causal_4d_masks	            r   to_causal_4dz#AttentionMaskConverter.to_causal_4dT   s     ~~A$..AQQnopp "<0!1L!@ r?Q$"5"5"A!33'=#22 4 N r   attention_mask_2dc                    |j                   d   |f}d}|d   dkD  s| j                  I| j                  r=|t        d      ||z
  }| j	                  |||j
                  || j                        }n| j                  t        d      | j                  |||d         j                  |j
                        }|=|j                  |j                         t        j                  |      j                        }|}	|	S )	a  
        Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
        causal, a causal mask will be added.
        r   Nr    r   zpThis attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask.r!   z?Sliding window is currently only implemented for causal masking)tgt_len)shaper   r   r   r$   r   NotImplementedError_expand_masktomasked_fillbooltorchfinfomin)
r   r(   r   r   r   r%   r&   r"   expanded_attn_maskexpanded_4d_masks
             r   to_4dzAttentionMaskConverter.to_4du   s)    )..q1<@ Oa4#6#6#B'  G  &6%D"!33(//'=#22 4 N   ,%&ghh "../@%Q\]_Q`.add$$
 %!/!;!;<N<S<S<UW\WbWbchWiWmWm!n .r   input_ids_shaper"   c                 Z   t         j                  t        t               | \  }}t	        j
                  ||ft	        j                  |      j                  |      }t	        j                  |j                  d      |      }|j                  ||dz   j                  |j                  d      d      k  d       |j                  |      }|dkD  r0t	        j                  t	        j                  ||||      |gd      }|||z
  dz
  }	t	        j                  t	        j                   |t        j"                        |		      }
t%               r|j'                         }|j                  |
t	        j                  |      j                         |ddddddf   j)                  |d|||z         S )
zJ
        Make causal mask used for bi-directional self-attention.
        )r   r    r   r   r   r   )dimNr   )diagonal)r   r   r   r   r1   fullr2   r3   arangesizemasked_fill_viewr.   catzerostril	ones_liker0   r   cloneexpand)r7   r   r   r"   r   bszr*   mask	mask_condr<   context_masks              r   r$   z(AttentionMaskConverter._make_causal_mask   s\    	/?&Wzz7G,ekk%.@.D.DVTLL2v>	)y1}&:&:499R=!&LLaPwwu~!A%99ekk'3IQV_efhlmsuvD %->BH ::eood%**&MX`aL ()zz|lEKK,>,B,BCD$1$%,,S!WgH^>^__r   rI   r*   c                    t         j                  t        t               | j	                         \  }}||n|}| ddddddf   j                  |d||      j                  |      }t        j                  d|      |z
  }|j                  |j                  t        j                        t        j                  |      j                        S )zg
        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
        Nr         ?r;   )r   r   r   r   r?   rG   r.   r1   tensorr/   r0   r2   r3   )rI   r   r*   rH   src_lenexpanded_maskinverted_masks          r   r-   z#AttentionMaskConverter._expand_mask   s    
 	/?yy{W$0'gQdA-.55c1gwORRSXYS6F(()9)9%**)Eu{{SXGYG]G]^^r   rP   	min_dtypec                     t         j                  t        t               | j                  t
        j                  k(  rt        d      | j                  t        j                  | |k(  dd             S )a  
        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        Details: https://github.com/pytorch/pytorch/issues/110213

        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
        `attention_mask` is [bsz, src_seq_len].

        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.

        For example, if `expanded_mask` is (e.g. here left-padding case)
        ```
        [[[[0, 0, 0],
           [0, 0, 0],
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[0, 0, 0],
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        then the modified `expanded_mask` will be
        ```
        [[[[1, 1, 1],   <-- modified
           [1, 1, 1],   <-- modified
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[1, 1, 1],   <-- modified
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        z\AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor.r    T)r:   keepdim)
r   r   r   r   r   r1   r0   r   mulall)rP   rR   s     r   _unmask_unattendedz)AttentionMaskConverter._unmask_unattended   sc    R 	/? %**,n    %))MY,FBX\"]!]^^r   attention_maskinputs_embedsis_trainingc                 d   t         j                  t        t               |j                  d   |j                  d   }}||z   }t        |      }d}	| |s|s|dk(  s||k(  r	|||k  rd}	|	S |||k  r?t        | j                        dk(  ry|s$t        j                  | dk(        r|dk(  s||k(  rd}	|	S )a9  
        Detects whether the optional user-specified attention_mask & the automatically created causal mask can be
        ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument.

        In case no token is masked in the `attention_mask` argument, if `query_length == 1` or
        `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks,
        allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
        passed).
        r   r   FT   )	r   r   r   r   r+   r   lenr1   rV   )
rX   rY   r"   r   rZ   _r   r   is_tracing_ignore_causal_masks
             r   _ignore_causal_mask_sdpaz/AttentionMaskConverter._ignore_causal_mask_sdpa  s    " 	/?'--a0-2E2Ea2H<'*@@ /"! K!Q&*:l*J#+/?./P%)" "! #'7.'H>''(A- UYY~/B%C1$(8L(H)-& "!r   N)cpur   N)NF)__name__
__module____qualname____doc__r0   __annotations__intr   r1   r   r   r   Tensorr'   r6   staticmethodSizer$   r-   FloatTensorfloatrW   ra    r   r   r
   r
   &   s   < O	$ 	d
 	" .3  	
 {{ ellE)* 
	L (,-  <<-  -  {{	- 
 *-  
- ^ 
 '(%)!`!`{{!` !` !$	!`
 d
!` !`F _5<< _ _cDj _ _ 0_((0_0_ 0_d 
 &*!8"t+8"||8" !$8" d
	8"
 8" 
8" 8"r   r
   rX   r%   rY   r"   r   c                    t        d|      }|d   |z   }| <t        | j                        dk(  r$|j                  | |d   ||j                        } | S | t        | j                        dk(  r|d   d|d   |f}t        | j                        |k7  r%t        d	t        | j                         d
| d      d| z
  }|j                  |j                  t        j                        t        j                  |j                        j                        } | S |j                  |d   |d   ||j                  |j                        } | S )a  
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        attention_mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`torch.Tensor`):
            The embedded inputs as a torch Tensor.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    Tr   r   r       )r   r   r\   r   r   z#Incorrect 4D attention_mask shape: z; expected: .rM   r9   )r
   r]   r+   r6   r   tupler   r/   r.   r1   r0   r2   r3   r'   r   )	rX   r%   rY   r"   r   attn_mask_converterr   expected_shaperQ   s	            r   !_prepare_4d_causal_attention_maskrx   G  sn   . 14P^_"2)?? !c.*>*>&?1&D,22KO>NVcViVi 3 
( # 
	#N,@,@(AQ(F%a.![^=MN%%&.85eN<P<P6Q5RR^_m^nnop 
  .0M*66  ,ekk-:M:M.N.R.RN 	 -99NKO-=]EXEXanauau : 
 r   c                    t        d|      }|d   |z   }t        |      }t         j                  | |||      }|rd}	|	S | 2|j                  |d   |d   ||j                  |j
                        }	|	S | j                         dk(  r| }	n"|j                  | |d   |j                  |	      }	|sV|	j
                  j                  d
v r>t         j                  |	t        j                  |j                        j                        }	|	S )a  
    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.

    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and
    `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks,
    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed).
    Trr   r    )rX   rY   r"   r   Nr   r9   r\   )r   r   )cudaxpu)rR   )r
   r   ra   r'   r   r   r:   r6   typerW   r1   r2   r3   )
rX   r%   rY   r"   r   rv   r   r_   r`   r5   s
             r   *_prepare_4d_causal_attention_mask_for_sdpar}   |  sB    14P^_"2)??
 ]+K/HH%#5%	 I  0 / 
	.;;NKO-=]EXEXanauau < 
, % 1$-288B#))!1	  9   /66;;N5HH EKK8K8K,L,P,P  I   r   rI   r   r*   c                 2    t         j                  | ||      S )  
    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        tgt_len (`int`):
            The target length or query length the created mask shall have.
    rI   r   r*   )r
   r-   r   s      r   _prepare_4d_attention_maskr     s     "..Dw.WWr   c                     t         j                  t        t               | j                  \  }}||n|}t        |       st        j                  | dk(        ryt        j                  | ||      S )r   Nr   r   )
r   r   r   r   r+   r   r1   rV   r
   r-   )rI   r   r*   r^   r   s        r   #_prepare_4d_attention_mask_for_sdpar     sf     +];**A ,g2BG d		$!) 4%22ESZ2[[r   r   r   c                 h    t        d|      }|| d   z   }|j                  | d   | d   |||      }|S )a/  
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`

    Args:
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        device (`int`):
            The torch device the created mask shall have.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    Trr   r    r   r9   )r
   r'   )r%   r   r   r"   r   rv   r   rX   s           r    _create_4d_causal_attention_maskr     sR    ( 14P^_-B?(55AB)9v 6 N r   rb   rd   )rh   dataclassesr   typingr   r1   utilsr   utils.import_utilsr   r   
get_loggerre   r   r   r
   rk   rm   ru   listrj   rx   r}   r   r   r   r   r   rp   r   r   <module>r      s   "    D 
		H	%w  ]" ]" ]"J	 "&1LL4'1e#d*1 <<1  	1
 $J1t "&7LL4'7e#d*7 <<7  	7
 $J7tXU\\ X%++ XPSVZPZ X \ell \5;; \Y\_cYc \: #$!%e#d*;; LL  	
 $J \\Dr   