
    謜i"                       U d dl Z d dlmZ d dlZd dlmc mZ ddlm	Z	 ddl
mZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZmZmZ  e       rd d
lmZ d dlmZmZ nej:                  Z edd      Z edd      Z e       Z erd dl!m"Z"  ejF                  e$      Z%dedefdZ&dedefdZ'de(de(de(de(de)f
dZ*de(de(de(de(de)f
dZ+de(defdZ,de(dej:                  defdZ-de(defd Z.de(defd!Z/de(defd"Z0de(dej:                  defd#Z1d$ej:                  defd%Z2d&ej:                  defd'Z3d(ed)e(d*e(defd+Z4d,ej:                  dz  d-e(d*e(dej:                  dz  fd.Z5d$ej:                  dz  d/e(d-e(d0e(dz  de)f
d1Z6	 dld$ej:                  dz  d/e(d-e(d*e(d0e(dz  de)fd2Z7d$ej:                  dz  d-e(d0e(dz  de)fd3Z8	 dld$ej:                  dz  d-e(d0e(dz  de)fd4Z9d(edefd5Z:d6ej:                  d7ej:                  d8ej:                  d9ej:                  fd:Z;d e*dddd;dd;fd<e(d=ej:                  d-e(d*e(d(ed,ej:                  dz  d>e(dz  d?e)d@e)dAe)dBe)dej:                  dz  fdCZ<d e*dejz                  d;d;fd<e(d=ej:                  d-e(d*e(d(ed,ej:                  dz  dDej|                  d@e)dBe)dej:                  fdEZ?d e*dfd<e(d=ej:                  d-e(d*e(d(ed,ej:                  dz  fdFZ@d e*dfd<e(d=ej:                  d-e(d*e(d(ed,ej:                  dz  defdGZA G dH dIe      ZB eB       ZCeBeDdJ<   dKej:                  dej:                  dz  fdLZE edMdNdOP      dQedOej:                  d,ej:                  ez  dz  d=ej:                  dRe	dz  dKej:                  dz  dSe(dz  deFe)ej:                  ez  dz  e(e(f   fdT       ZG edMdNdOP      	 	 	 dmdQedOej:                  d,ej:                  dz  d=ej:                  dRe	dz  dKej:                  dz  dUedz  dVedz  dej:                  ez  dz  fdW       ZH edMdNdOP      	 	 	 dmdQedOej:                  d,ej:                  dz  dXej:                  dz  dUedz  dVedz  dej:                  ez  dz  fdY       ZI edMdNdOP      	 	 	 dmdQedOej:                  d,ej:                  dz  d=ej:                  dRe	dz  dKej:                  dz  dUedz  dVedz  dej:                  ez  dz  fdZ       ZJ edMdNdOP      	 	 dndQedOej:                  d,ej:                  dz  dUedz  dVedz  dej:                  ez  dz  fd[       ZK edMdNdOP      	 	 	 dmdQedOej:                  d,ej:                  dz  d=ej:                  dRe	dz  dKej:                  dz  dUedz  dVedz  dej:                  ez  dz  fd\       ZLeHeJeLd]ZM edMdNdOP      	 	 	 dmdQedOej:                  d,ej:                  dz  d=ej:                  dRe	dz  dKej:                  dz  dUedz  dVedz  fd^       ZNd_ZOd`ZPdaZQdbZRdcZSddZTdeZUdfZVdg ZWeP eR eQ ZXeO eR eQ ZYdodhej:                  deZfdiZ[ G dj dkej:                        Z\y)p    N)Callable   )Cache)PreTrainedConfig)is_torch_xpu_availablelogging)deprecate_kwarg)GeneralInterfaceis_flash_attention_requested)is_torch_flex_attn_availableis_torch_greater_or_equal
is_tracing)_DEFAULT_SPARSE_BLOCK_SIZE)	BlockMaskcreate_block_maskz2.5T)
accept_devz2.6)TransformGetItemToIndexmask_functionsreturnc                  R     t        d  D              st        d         fd}|S )zKReturns a mask function that is the intersection of provided mask functionsc              3   2   K   | ]  }t        |        y wNcallable.0args     V/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/masking_utils.py	<genexpr>zand_masks.<locals>.<genexpr>0        7x}7   .All inputs should be callable mask_functions: c                     |j                  dt        j                        }D ])  }| || |||      j                  |j                        z  }+ |S N )dtype)new_onestorchbooltodevice	batch_idxhead_idxq_idxkv_idxresultmaskr   s         r   and_maskzand_masks.<locals>.and_mask3   sS    %**5" 	YDd9hvFII&--XXF	Y    allRuntimeError)r   r3   s   ` r   	and_masksr8   .   s1    777KNK[\]] Or4   c                  R     t        d  D              st        d         fd}|S )zDReturns a mask function that is the union of provided mask functionsc              3   2   K   | ]  }t        |        y wr   r   r   s     r   r   zor_masks.<locals>.<genexpr>>   r    r!   r"   c                     |j                  dt        j                        }D ])  }| || |||      j                  |j                        z  }+ |S r$   )	new_zerosr(   r)   r*   r+   r,   s         r   or_maskzor_masks.<locals>.or_maskA   sS    5::6" 	YDd9hvFII&--XXF	Yr4   r5   )r   r=   s   ` r   or_masksr>   <   s1    777KNK[\]] Nr4   r-   r.   r/   r0   c                     ||k  S )z:
    This creates a basic lower-diagonal causal mask.
    r%   r-   r.   r/   r0   s       r   causal_mask_functionrA   J   s     U?r4   c                     |dk\  S )z
    This creates a full bidirectional mask.

    NOTE: It is important to keep an index-based version for non-vmap expansion.
    r   r%   r@   s       r   bidirectional_mask_functionrC   Q   s     A:r4   sliding_windowc           
      P     dt         dt         dt         dt         dt        f
 fd}|S )z
    This is an overlay depicting a sliding window pattern. Add it on top of a causal mask for a proper sliding
    window mask.
    r-   r.   r/   r0   r   c                     ||z
  kD  S r   r%   r-   r.   r/   r0   rD   s       r   
inner_maskz*sliding_window_overlay.<locals>.inner_mask`   s    ...r4   intr)   rD   rH   s   ` r   sliding_window_overlayrL   Z   s3    /c /S / /c /d / r4   
chunk_sizeleft_paddingc           
      T     dt         dt         dt         dt         dt        f
 fd}|S )z
    This is an overlay depicting a chunked attention pattern. Add it on top of a causal mask for a proper chunked
    attention mask.
    r-   r.   r/   r0   r   c                 2    ||    z
  z  ||    z
  z  k(  S r   r%   )r-   r.   r/   r0   rM   rN   s       r   rH   z#chunked_overlay.<locals>.inner_maskl   s.    i00Z?ELYbLcDchrCrrrr4   rI   )rM   rN   rH   s   `` r   chunked_overlayrQ   f   s9    sc sS s sc sd s r4   c                 4    t        t        |       t              S )zQ
    This return the mask_function function to create a sliding window mask.
    )r8   rL   rA   rD   s    r   #sliding_window_causal_mask_functionrT   r   s     +N;=QRRr4   c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )zN
    This is an overlay depicting a bidirectional sliding window pattern.
    r-   r.   r/   r0   r   c                 &    t        ||z
        k  S )zA token can attend to any other token if their absolute distance is within
        the (inclusive) sliding window size (distance <= sliding_window).)absrG   s       r   rH   z8sliding_window_bidirectional_overlay.<locals>.inner_mask~   s     56>"n44r4   rI   rK   s   ` r   $sliding_window_bidirectional_overlayrX   y   s3    
5c 5S 5 5c 5d 5
 r4   c                 4    t        t        |       t              S )z_
    This return the mask_function function to create a bidirectional sliding window mask.
    )r8   rX   rC   rS   s    r   *sliding_window_bidirectional_mask_functionrZ      s     9.IKfggr4   c                 6    t        t        | |      t              S )zT
    This return the mask_function function to create a chunked attention mask.
    )r8   rQ   rA   )rM   rN   s     r   chunked_causal_mask_functionr\      s     _Z>@TUUr4   padding_maskc           
      P     dt         dt         dt         dt         dt        f
 fd}|S )zT
    This return the mask_function function corresponding to a 2D padding mask.
    r-   r.   r/   r0   r   c                     | |f   S r   r%   )r-   r.   r/   r0   r]   s       r   rH   z)padding_mask_function.<locals>.inner_mask   s     Iv-..r4   rI   )r]   rH   s   ` r   padding_mask_functionr`      s3    
/c /S / /c /d / r4   packed_sequence_maskc           
      P     dt         dt         dt         dt         dt        f
 fd}|S )z\
    This return the mask_function function corresponding to a 2D packed sequence mask.
    r-   r.   r/   r0   r   c                 "    | |f   | |f   k(  S r   r%   )r-   r.   r/   r0   ra   s       r   rH   z1packed_sequence_mask_function.<locals>.inner_mask   s$    #Iu$459MiY_N_9```r4   rI   )ra   rH   s   ` r   packed_sequence_mask_functionrd      s9    
ac aS a ac ad a r4   mask_functionq_offset	kv_offsetc           
      X     dt         dt         dt         dt         dt        f
 fd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    r-   r.   r/   r0   r   c                 &     | ||z   |z         S r   r%   )r-   r.   r/   r0   rg   re   rf   s       r   rH   z0add_offsets_to_mask_function.<locals>.inner_mask   s    Y%(2BFYDVWWr4   rI   )re   rf   rg   rH   s   ``` r   add_offsets_to_mask_functionrj      s9    Xc XS X Xc Xd X r4   attention_mask	kv_lengthc                     | }| F||z   | j                   d   z
  x}dkD  r,t        j                  j                  j	                  | d|f      }|S )zh
    From the 2D attention mask, prepare the correct padding mask to use by potentially padding it.
    r   )shaper(   nn
functionalpad)rk   rl   rg   local_padding_maskpadding_lengths        r   prepare_padding_maskru      s[     (!')3n6J6J26NNNNRSS!&!4!4!8!8!^I\!]r4   query_lengthlocal_attention_sizec                     t        |       ry|||k\  ry| |dk(  xs ||k(  S |dk(  r| j                         S | ddd|f   j                         xr | dd|df   j                          S )ar  
    XPU-specific logic for determining if we can skip causal mask creation.

    For XPU devices, we have special handling:
    - Single query tokens (query_length == 1) use the same logic as CUDA
    - Multi-query tokens can skip if padding_mask is provided and correctly structured
      The mask must have all True values in the query window and all False after
    FNr   )r   r6   any)r]   rv   rl   rw   s       r   _can_skip_causal_mask_xpurz      s     , 'I9M,Mq =I$== q!! =L=()--/\QEU8V8Z8Z8\4\\r4   c                    | A| j                   d   |kD  r/t        j                  || j                        }||z  }| dd|f   } t        rt        | |||      S t        |       s$|dk(  s||k(  r|||k  r| | j                         ryy)a  
    Detects whether the causal mask can be ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument.

    In case no token is masked in the 2D `padding_mask` argument, if `query_length == 1` or
    `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks,
    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
    passed).
    Nrn   r+   r   TF)ro   r(   aranger+   _is_torch_xpu_availablerz   r   r6   )r]   rv   rl   rg   rw   mask_indicess         r   _ignore_causal_mask_sdpar      s     L$6$6r$:Y$F||Il6I6IJ	!#A|O4
 )|YPdee |$Q)|";!)Y9M-M!\%5%5%7r4   c                 P    t        |       ry|||k\  ry| y| j                         S )z
    XPU-specific logic for determining if we can skip bidirectional mask creation.

    For XPU devices, we have special handling:
    - Skip if no padding and no local attention constraint
    FT)r   r6   r]   rl   rw   s      r    _can_skip_bidirectional_mask_xpur     s<     , 'I9M,M r4   c                 t    t         rt        | ||      S t        |       s| | j                         r|||k  ryy)a  
    Detects whether the bidirectional mask can be ignored in case PyTorch's SDPA is used.

    In case no token is masked in the 2D `padding_mask` argument and no local attention constraint applies
    (i.e. `local_attention_size` is None or `kv_length < local_attention_size`), we skip mask creation,
    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
    passed).
    TF)r~   r   r   r6   r   s      r   _ignore_bidirectional_mask_sdpar   0  sH      0iI]^^
 |$!\%5%5%7!)Y9M-Mr4   c                 L    g d}|D ]  }t        j                  | |d      }  | S )aJ  
    Used to vmap our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
    Using vmap here allows us to keep the performance of vectorized ops, while having a single set of primitive
    functions between attention interfaces (i.e. between flex and sdpa/eager, FA2 being a bit different).
    ))NNNr   )NNr   N)Nr   NN)r   NNNr   )in_dimsout_dims)r(   vmap)re   
dimensionsdimss      r   _vmap_expansion_sdpar   O  s3     nJ L

=$KLr4   batch_indiceshead_indices	q_indices
kv_indicesc                 f    | dddddf   } |dddddf   }|dddddf   }|dddddf   }| |||fS )a  
    Used to broadcast our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
    Allows the usage of any index-based mask function without relying on vmap.

    NOTE: This is limited to index based functions only and is not guaranteed to work otherwise.

    Reference:
        - https://github.com/huggingface/optimum-onnx/blob/c123e8f4fab61b54a8e0e31ce74462bcacca576e/optimum/exporters/onnx/model_patcher.py#L362-L365
    Nr%   )r   r   r   r   s       r   _non_vmap_expansion_sdpar   \  s_     "!T4"56Mat 34L$a-.ID$a/0J,	:==r4   F
batch_sizecache_position
local_sizeallow_is_causal_skipallow_is_bidirectional_skipallow_torch_fixuse_vmapc                    |j                   d   }t        |||      }|rt        |||||      ry|rt        |||      ry|t	        |t        |            }t        j                  | |j                        }t        j                  d|j                        }t        j                  ||j                        |z   }|
s& |t        ||||       }|j                  | d||      }n9t        r(t               5   t        |      ||||      }ddd       nt        d      t        s|	r|t        j                   | dd      z  }|S # 1 sw Y   /xY w)	u  
    Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that
    the element should take part in the attention computation, and False that it should not.
    This function can only be used with torch>=2.5, as the context manager is otherwise not available.

    Args:
        batch_size (`int`):
            The batch size of the input sequence.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        kv_length (`int`):
            The size that the key and value states will have during the attention computation.
        kv_offset (`int`, optional):
            An optional offset to indicate at which first position the key and values states will refer to.
        mask_function (`Callable`):
            The mask factory function describing the mask pattern.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
        local_size (`int`, optional):
            The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
            to try to skip mask creation if possible.
        allow_is_causal_skip (`bool`, optional):
            Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
            `torch.sdpa` instead. Default to `True`.
        allow_is_bidirectional_skip (`bool`, optional):
            Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
            i.e. full attention without any padding. Default to `False`.
        allow_torch_fix (`bool`, optional):
            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
            versions. We need an arg to skip it when using eager. By default `True`.
        use_vmap (`bool`, optional):
            Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
            index-based (for the cost of speed performance). By default `False`.


    ## Creating a simple causal mask:

    To create the following causal mask:

        0 ■ ⬚ ⬚ ⬚ ⬚
        1 ■ ■ ⬚ ⬚ ⬚
        2 ■ ■ ■ ⬚ ⬚
        3 ■ ■ ■ ■ ⬚
        4 ■ ■ ■ ■ ■

    You can do

    ```python
    >>> sdpa_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5)
    >>> tensor([[[[ True, False, False, False, False],
                  [ True,  True, False, False, False],
                  [ True,  True,  True, False, False],
                  [ True,  True,  True,  True, False],
                  [ True,  True,  True,  True,  True]]]])
    ```

    ## Creating a sliding window mask:

    To create the following sliding window mask (`sliding_window=3`):

        0 ■ ⬚ ⬚ ⬚ ⬚
        1 ■ ■ ⬚ ⬚ ⬚
        2 ■ ■ ■ ⬚ ⬚
        3 ⬚ ■ ■ ■ ⬚
        4 ⬚ ⬚ ■ ■ ■

    You can do

    ```python
    >>> sdpa_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5, mask_function=sliding_window_causal_mask_function(3))
    >>> tensor([[[[ True, False, False, False, False],
                  [ True,  True, False, False, False],
                  [ True,  True,  True, False, False],
                  [False,  True,  True,  True, False],
                  [False, False,  True,  True,  True]]]])
    ```

    ## Creating a chunked attention mask

    To create the following chunked attention mask (`chunk_size=3`):

        0 ■ ⬚ ⬚ ⬚ ⬚
        1 ■ ■ ⬚ ⬚ ⬚
        2 ■ ■ ■ ⬚ ⬚
        3 ⬚ ⬚ ⬚ ■ ⬚
        4 ⬚ ⬚ ⬚ ■ ■

    You can do

    ```python
    >>> sdpa_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5, mask_function=chunked_causal_mask_function(3, torch.zeros(1, dtype=int)))
    >>> tensor([[[[ True, False, False, False, False],
                [ True,  True, False, False, False],
                [ True,  True,  True, False, False],
                [False, False, False,  True, False],
                [False, False, False,  True,  True]]]])
    ```

    r   Nr|   r   rn   zThe vmap functionality for mask creation is only supported from torch>=2.6. Please update your torch version or use `use_vmap=False` with index-based masks.T)dimkeepdim)ro   ru   r   r   r8   r`   r(   r}   r+   r   expand#_is_torch_greater_or_equal_than_2_6r   r   
ValueError#_is_torch_greater_or_equal_than_2_5r6   )r   r   rl   rg   re   rk   r   r   r   r   r   kwargsq_lengthr]   batch_arangehead_arange	kv_aranges                    r   	sdpa_maskr   o  sl   b ##A&H (	9ML
  8xQZ\egq r"'F|U^`j'k !-1F|1TU<<
>3H3HIL,,q)>)>?K Y~/D/DE	QI &(@{\jlu(vw'..z2xS 
- %& 	w@1-@{\jluvN	w 	w
 _
 	
 /?'%))^OUY*ZZ	w 	ws   4EEr&   c	                 *   |	j                  dd      }
|	j                  dd      }
t        d| |||||d|d|d
|	}|Vt        j                  |      j                  }t        j
                  |t        j                  d|j                  |      |      }|S )	aX  
    Create a 4D float mask of shape `(batch_size, 1, query_length, kv_length)` where a value of 0 indicates that
    the element should take part in the attention computation, and -inf (minimum value for the given `dtype`) that
    it should not.

    Args:
        batch_size (`int`):
            The batch size of the input sequence.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        kv_length (`int`):
            The size that the key and value states will have during the attention computation.
        kv_offset (`int`, optional):
            An optional offset to indicate at which first position the key and values states will refer to.
        mask_function (`Callable`):
            The mask factory function describing the mask pattern.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
        dtype (`torch.dtype`, optional):
            The dtype to use for the mask. By default, `torch.float32`.
        allow_is_bidirectional_skip (`bool`, optional):
            Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
            i.e. full attention without any padding. Default to `False`.
        use_vmap (`bool`, optional):
            Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
            index-based (for the cost of speed performance). By default `False`.
    r   Nr   F)
r   r   rl   rg   re   rk   r   r   r   r   g        r+   r&   r%   )popr   r(   finfominwheretensorr+   )r   r   rl   rg   re   rk   r&   r   r   r   _r2   	min_dtypes                r   
eager_maskr     s    P 	

)40A

$d+A %#%"$? D KK&**	{{4c$++U!SU^_Kr4   c                 F    ||dd| df   }|j                         rd}|S )a"  
    Create the attention mask necessary to use FA2. Since FA2 is un-padded by definition, here we simply return
    `None` if the mask is fully causal, or we return the 2D mask which will then be used to extract the seq_lens.
    We just slice it in case of sliding window.

    Args:
        batch_size (`int`):
            The batch size of the input sequence.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        kv_length (`int`):
            The size that the key and value states will have during the attention computation.
        kv_offset (`int`, optional):
            An optional offset to indicate at which first position the key and values states will refer to.
        mask_function (`Callable`):
            The mask factory function describing the mask pattern.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
    N)r6   )r   r   rl   rg   re   rk   r   s          r   flash_attention_maskr   U  s5    8 !'I:;7 !Nr4   c           	         |j                   d   |d   }}||j                   d   t        z  dz   t        z  }	|	|j                   d   z
  }	t        s3|	dkD  r.t        j                  j
                  j                  |dd|	f      }t        |||      }
t        |t        |
            }t        |||      }t        || d|||j                  t              }|S )a  
    Create a 4D block mask which is a compressed representation of the full 4D block causal mask. BlockMask is essential
    for performant computation of flex attention. See: https://pytorch.org/blog/flexattention/

    Args:
        batch_size (`int`):
            The batch size of the input sequence.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        kv_length (`int`):
            The size that the key and value states will have during the attention computation.
        kv_offset (`int`, optional):
            An optional offset to indicate at which first position the key and values states will refer to.
        mask_function (`Callable`):
            The mask factory function describing the mask pattern.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
    r   Nr   )valuerr   )mask_modBHQ_LENKV_LENr+   _compile)ro   flex_default_block_sizer   r(   rp   rq   rr   ru   r8   r`   rj   r   r+   )r   r   rl   rg   re   rk   r   r   rf   pad_lenr]   
block_masks               r   flex_attention_maskr   |  s    6 (--a0.2ChH ! #((+/FF!KOffN00332w{"XX0044^1STV]R^4_N+NIyQ!-1F|1TU 1)TM #

$$4J r4   c                       e Zd ZeeeeedZy)AttentionMaskInterface)sdpaeagerflash_attention_2flash_attention_3flex_attentionN)__name__
__module____qualname__r   r   r   r   _global_mappingr%   r4   r   r   r     s     11-Or4   r   ALL_MASK_ATTENTION_FUNCTIONSposition_idsc                     | ddddf   dz
  }t        j                  | |d      }|dk7  j                  d      }t        |      s|dddf   dk(  j	                         ry|S )a>  
    Find the indices of the sequence to which each new query token in the sequence belongs when using packed
    tensor format (i.e. several sequences packed in the same batch dimension).

    Args:
        position_ids (`torch.Tensor`)
            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.

    Returns:
        A 2D tensor where each similar integer indicates that the tokens belong to the same sequence. For example, if we
        pack 3 sequences of 2, 3 and 1 tokens respectively along a single batch dim, this will return [[0, 0, 1, 1, 1, 2]].

        If the there is only one sequence in each batch item (and we don't compile), then we return `None` indicating
        no packed sequences. This is the same as [[0, 0, 0, 0, 0, 0]] for the example above.
    Nr   rn   )prependr   r   )r(   diffcumsumr   r6   )r   first_dummy_valueposition_diffra   s       r   find_packed_sequence_indicesr     st    * %QU+a/JJ|5FBOM)Q.66r: *+1Eae1LPQ1Q0V0V0Xr4   input_embedsz5.6.0inputs_embeds)versionnew_nameconfigpast_key_values	layer_idxc                 F   t        |t        j                  t        f      rt	        |j
                        dk(  rd|dddfS | j                  t        j                  vry|:|j                  dk(  r+|j                  |j                  t        j                        }||j                  ||      \  }}n%||j
                  d   d}}n|j
                  d	   d}}d}	|B|@|>|j
                  d   }
|
|j
                  d   k7  r|j                  |
d	      }t        |      }	d
||	||fS )as  
    Perform some common pre-processing of the mask arguments we get from the modeling code. Mostly determine the
    key-value length and offsets, and if we should early exit or not.

    Args:
        config (`PreTrainedConfig`):
            The model config.
        inputs_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
        position_ids (`torch.Tensor`, optional)
            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
        layer_idx (`int`, optional):
            If `past_key_values` is not None, this is the layer index of the cache from which to get the key-value
            length and offset. Indeed, for hybrid caches, different layers may return different lengths.

    Returns:
        early_exit (`bool`):
            Whether we should early exit mask creation, and return the mask as-is.
        attention_mask (`torch.Tensor` or `BlockMask` or `None`):
            The attention mask to either return immediately, or to use in downstream mask creation.
        packed_sequence_mask (`torch.Tensor`, optional):
            In case we detected packed sequence format, this is a tensor where each similar integer indicates that
            the tokens belong to the same sequence.
        kv_length (`int`):
            The size that the key and value states will have during the attention computation.
        kv_offset (`int`):
            An offset to indicate at which first position the key and values states will refer to.
       TN)TNNNN   r   r   r   rn   F)
isinstancer(   Tensorr   lenro   _attn_implementationr   r   ndimr*   r+   r)   get_mask_sizesr   r   )r   r   rk   r   r   r   r   rl   rg   ra   r   s              r   _preprocess_mask_argumentsr     sE   ^ .5<<";<^EYEYAZ^_A_^T455 ""*F*V*VV+ !n&9&9Q&>'**.2G2Guzz*Z ".==niX	9 !#0#6#6q#91yI $2#7#7#;QyI  N$:?V"((+
++A..'..z2>L;LI."6	9LLr4   or_mask_functionand_mask_functionc                    t        | dd      st        | ||||      S t        |d      r*d|j                  v r|j                  j	                  d      }nd}t        | ||||||      \  }	}}
}}|	r|S |j                  d   |j                  }}t        }t        | j                     }d}t        r#t        |dd      xr |j                  d   dk(   }nt        |dd       }|!t        st        d	      t        ||      }d}d}|!t        st        d	      t        ||      }d}d}|
t        |t!        |
            }d} |||||||||| |

      }|S )a.  
    Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values`
    has an hybrid cache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align
    to what is needed in the `modeling_xxx.py` files).

    Args:
        config (`PreTrainedConfig`):
            The model config.
        inputs_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
        position_ids (`torch.Tensor`, optional)
            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
        or_mask_function (`Callable`, optional):
            An optional mask function to combine with the causal mask function (by doing the union of both). This is
            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
        and_mask_function (`Callable`, optional):
            An optional mask function to combine with the causal mask function (by doing the intersection of both). This is
            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
    	is_causalTr   r   
is_slidingFr   is_compileabler   LUsing `or_mask_function` or `and_mask_function` arguments require torch>=2.6)
r   r   rl   rg   re   rk   r   r&   r   r   )getattrcreate_bidirectional_maskhasattrr   indexr   ro   r&   rA   r   r   r~   r   r   r>   r8   rd   )r   r   rk   r   r   r   r   r   r   
early_exitra   rl   rg   r   r&   mask_factory_functionmask_interfacer   r   causal_masks                       r   create_causal_maskr   A  s   P 6;-(-/
 	
 -%?;U;U2U#..44U;		Mg~~P\^gNJJ 4i %++A.0C0CJ01&2M2MNN
 H $+O=Mu$U$vZhZnZnopZquvZvw#*?<Le#TT
 #2kll ()>@P Q$$2kll )*?AR S$ ' )*?A^_sAt u$ !%+%1K r4   encoder_hidden_statesc                    t        j                  |j                  d   |j                  t         j                        }||n|}t        | |||ddd      \  }}}	}
}|r|S |j                  d   |j                  }}t        }t        | j                     }d}d}|!t        st        d      t        ||      }d}d}|!t        st        d      t        ||      }d}d} ||||
|||d||| |      }|S )	a  
    Create a standard bidirectional mask based on the attention implementation used (stored in the config).

    Args:
        config (`PreTrainedConfig`):
            The model config.
        inputs_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is only used to infer metadata
            such as the batch size, query length, dtype, and device.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, kv_length).
            It can also be an already prepared 4D mask of shape (batch_size, 1, query_length, kv_length),
            in which case it is returned as-is.
        encoder_hidden_states (`torch.Tensor`, optional):
            The input embeddings of shape (batch_size, kv_length, hidden_dim). If provided, it is used instead of
            `inputs_embeds` to infer the batch size, kv length and dtype.
        or_mask_function (`Callable`, optional):
            An optional mask function to combine with the base mask function (by doing the union of both). This is
            useful to easily overlay another mask on top, for example for image tokens handling.
        and_mask_function (`Callable`, optional):
            An optional mask function to combine with the base mask function (by doing the intersection of both). This is
            useful to easily overlay another mask on top, for example for image tokens handling.
    r   r   Nr   TFr   )r   r   rl   rg   re   rk   r   r   r&   r   r   )r(   r}   ro   r+   longr   r&   rC   r   r   r   r   r>   r8   )r   r   rk   r   r   r   r   embedsr   r   rl   rg   r   r&   r   r   r   r   s                     r   r   r     s:   D \\-"5"5a"8AUAU]b]g]ghN&;&G"]F:TdA;7J9i QJ71&2M2MNN #' H
 #2kll ()>@P Q&+#$2kll )*?AR S&+# $%+%"$?N r4   c                    t        | dd      st        | ||||      S t        |d      r*d|j                  v r|j                  j	                  d      }nd}t        | ||||||      \  }	}}
}}|	r|S t        | dd      }|t        d      |j                  d   |j                  }}t        |      }t        | j                     }d	}t        |d
d	       }|!t        st        d      t        ||      }d	}d}|!t        st        d      t        ||      }d	}d}|
t        |t        |
            }d	} ||||||||||| |      }|S )a  
    Create a sliding window causal mask based on the attention implementation used (stored in the config). This type
    of attention pattern was mostly democratized by Mistral. If `past_key_values` has an hybrid cache structure, this
    function will return the mask corresponding to one of the "sliding_attention" layers (to align to what is needed in the
    `modeling_xxx.py` files).

    Args:
        config (`PreTrainedConfig`):
            The model config.
        inputs_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
        position_ids (`torch.Tensor`, optional)
            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
        or_mask_function (`Callable`, optional):
            An optional mask function to combine with the sliding causal mask function (by doing the union of both). This is
            useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
        and_mask_function (`Callable`, optional):
            An optional mask function to combine with the sliding causal mask function (by doing the intersection of both). This is
            useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
    r   Tr   r   r   rD   NJCould not find a `sliding_window` argument in the config, or it is not setFr   r   r   r   rl   rg   re   rk   r   r   r&   r   r   )r   (create_bidirectional_sliding_window_maskr   r   r   r   r   ro   r&   rT   r   r   r   r>   r8   rd   )r   r   rk   r   r   r   r   r   r   r   ra   rl   rg   rD   r   r&   r   r   r   r   r   s                        r   !create_sliding_window_causal_maskr     s   R 6;-7-/
 	
 -$/:T:T2T#..44T:		Mg~~P\^gNJJ 4i V%5t<Neff%++A.0C0CJ?O1&2M2MNN
 H  '8H%PP
 #2kll ()>@P Q$$2kll )*?AR S$ ' )*?A^_sAt u$ !%+%1!K r4   c                    t        j                  |j                  d   |j                  t         j                        }t        | |||ddd      \  }}}}}	|r|S t        | dd      }
|
t        d      |j                  d   |j                  }}t        |
      }t        | j                     }d}d}|!t        st        d	      t        ||      }d}d}|!t        st        d	      t        ||      }d}d} |||||	||d||
|| |
      }|S )a  
    Create a standard bidirectional sliding window mask based on the attention implementation used (stored in the config).

    Args:
        config (`PreTrainedConfig`):
            The model config.
        inputs_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is only used to infer metadata
            such as the batch size, query length, dtype, and device.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, kv_length).
            It can also be an already prepared 4D mask of shape (batch_size, 1, query_length, kv_length),
            in which case it is returned as-is.
        or_mask_function (`Callable`, optional):
            An optional mask function to combine with the base mask function (by doing the union of both). This is
            useful to easily overlay another mask on top, for example for image tokens handling.
        and_mask_function (`Callable`, optional):
            An optional mask function to combine with the base mask function (by doing the intersection of both). This is
            useful to easily overlay another mask on top, for example for image tokens handling.
    r   r   Nr   rD   r   FTr   )r   r   rl   rg   re   rk   r   r   r   r&   r   r   )r(   r}   ro   r+   r   r   r   r   r&   rZ   r   r   r   r>   r8   )r   r   rk   r   r   r   r   r   rl   rg   rD   r   r&   r   r   r   r   s                    r   r   r     sU   < \\-"5"5a"8AUAU]b]g]ghN ;U~~tT1;7J9i V%5t<Neff%++A.0C0CJF~V1&2M2MNNH"&#2kll ()>@P Q&+#$2kll )*?AR S&+##%+%"$?!N r4   c                 V   t        |d      r*d|j                  v r|j                  j                  d      }nd}t        | ||||||      \  }	}}
}}|	r|S t	        | dd      }|t        d      t        |       r||z   |kD  rt        d      |j                  d   |j                  }}|9|j                  d	      t        j                  |      k(  j                  d	      }n&t        j                  ||j                  t        
      }t!        ||      }t"        | j$                     }d}t	        |dd       }|!t&        st        d      t)        ||      }d}d}|!t&        st        d      t+        ||      }d}d}|
t+        |t-        |
            }d} ||||||||||| |      }|S )a  
    Create a chunked attention causal mask based on the attention implementation used (stored in the config). This type
    of attention pattern was mostly democratized by Llama4. If `past_key_values` has an hybrid cache structure, this
    function will return the mask corresponding to one of the "chunked_attention" layers (to align to what is needed in the
    `modeling_xxx.py` files).

    Args:
        config (`PreTrainedConfig`):
            The model config.
        inputs_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
        position_ids (`torch.Tensor`, optional)
            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
        or_mask_function (`Callable`, optional):
            An optional mask function to combine with the chunked causal mask function (by doing the union of both). This is
            useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling.
        and_mask_function (`Callable`, optional):
            An optional mask function to combine with the chunked causal mask function (by doing the intersection of both). This is
            useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling.
    r   Tr   attention_chunk_sizeNzQCould not find an `attention_chunk_size` argument in the config, or it is not setzFlash attention cannot handle chunked attention, and the key-value length is larger than the chunk size so the chunked pattern cannot be respected. You should use another `attn_implementation` when instantiating the modelrn   )r   r   Fr   r   r   )r   r   r   r   r   r   r   ro   r&   r   r(   
zeros_likesumzerosr+   rJ   r\   r   r   r   r>   r8   rd   )r   r   rk   r   r   r   r   r   r   r   ra   rl   rg   rM   r   r&   left_padding_tokensr   r   r   r   r   s                         r   create_chunked_causal_maskr    s   P -$/:T:T2T#..44T:		Mg~~P\^gNJJ 4i !7>Jlmm $F+	I0E
0R}
 	

 &++A.0C0CJ !-444<@P@PQ_@``eejlem#kk*^=R=RZ]^8EXY1&2M2MNN
 H  '8H%PP
 #2kll ()>@P Q$$2kll )*?AR S$ ' )*?A^_sAt u$ !%+%1K r4   )full_attentionsliding_attentionchunked_attentionc           	      $   | j                         }	|	|||||||d}
t        |	d      r/i }t        |	j                        D ]  }t	        |   di |
||<    |S t        |	dd      t        di |
S t        |	dd      t        di |
S t        di |
S )a  
    This function mimics how we create the masks in the `modeling_xxx.py` files, and is used in places like `generate`
    in order to easily create the masks in advance, when we compile the forwards with Static caches.

    Args:
        config (`PreTrainedConfig`):
            The model config.
        inputs_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
        position_ids (`torch.Tensor`, optional)
            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
        or_mask_function (`Callable`, optional):
            An optional mask function to combine with the other mask function (by doing the union of both). This is
            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
        and_mask_function (`Callable`, optional):
            An optional mask function to combine with the other mask function (by doing the intersection of both). This is
            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
    )r   r   rk   r   r   r   r   r   layer_typesrD   Nr   r%   )	get_text_configr   setr
  &LAYER_PATTERN_TO_MASK_FUNCTION_MAPPINGr   r   r  r   )r   r   rk   r   r   r   r   r   r   effective_configmask_kwargscausal_maskslayer_patterns                r   create_masks_for_generater  O  s    N --/ #&((*$,.	K / !1!=!=> 	oM*PQ^*_*nbm*nL'	o	!#3T	:	F0?;??	!#94	@	L)8K88,,,r4   z[92mz[93mz[0mu   ■u   ⬚u   ∙u   ⬕u   ⬔c                 >    | dk(  rd}d}d}d}d}nd}d}d}d	}||||fS )
Nmajongu   🀞u   🀙u   🀆u   🀛u   █u   ░u   ▙u   ▜r%   )styleBLACK_SQUAREWHITE_SQUARELOW_TRIANGLEUPPER_TRIANGLEs        r   	get_styler    sF    |^CCr4   original_tensorc           
         t        |      \  }}}}| j                  \  }}|\  }	}
||	k  r||
k  sd|z  |z  }|dkD  r%|
}t        |	t        dt	        |
|z                    }n|	}t        dt	        |	|z              }| j                  d      j                  d      }t        j                  |||f      d   }n| }g }t        |      D ]  }d}t        |      D ]  }|||f   dk(  r||z  }|||f   dk(  r||z  }#|dkD  r8|||dz
  f   dk(  r||z  };|||dz
  f   dk(  r||z  }N||||f   dk(  r|n|z  }`||||f   dk(  r|n|||f   dk(  r|n|||dz   f   dk(  r|n|z  } |j                  |        dj                  |      S )Nr   r   r   )output_size)r   r    
)r  ro   r   maxround	unsqueezeFadaptive_avg_pool2drangeappendjoin)r  	grid_sizer  r  r  r  r  hwmax_hmax_waspect_ratior   r1   irowjs                    r   tensor_to_mask_visualr1    s   ?H?O<L,n  DAqLE5I!e)1uqy!AE3q%(<"=>?AAAuU\123A !**1-77:&&vAq6B4H  F1X q 	Aad|q |#1"|#q5aQh'1,|+1q5)Q.~-vad|q/@|lR!!Q$<1, %  &ad|q0 )4:1a!e84D4I.|C	. 	c36 99Vr4   c                   h    e Zd ZddZd ZddZd Zd Zedde	j                  dedz  d	d fd
       Zy)AttentionMaskNc                 T    || _         t        j                  j                  | |d      S )NF)require_grad)r  r(   r   _make_subclass)clsdatar  s      r   __new__zAttentionMask.__new__  s%    	||**35*IIr4   c                      y r   r%   )selfr8  s     r   __init__zAttentionMask.__init__  s    r4   c           
          | }|j                   ^ }}}g }t        t        j                  |D cg c]  }t	        |       c}       D ]k  \  }	}
|	|k(  r5|j                  d       |j                  d       |j                  d        n.t        ||
   || j                        }|j                  |       m |j                  dt        | j                          d| j                   d       dj                  |      S c c}w )	z2Returns a string representation of the block mask.z...z7To print out more, set AttentionMask.to_string(limit=N)zRYou can also index (AttentionMask[batch, head]) to choose a specific batch or head)r(  r  ztorch.Tensor(shape=z, dtype=)r  )ro   	enumerate	itertoolsproductr%  r&  r1  r  tupler&   r'  )r;  r(  limit
dense_mask
batch_dimsnum_rowsnum_cols	total_visr.  idxr-   	block_viss               r   	to_stringzAttentionMask.to_string  s    
*4*:*:'Xx	'	(9(9j;YE!H;Y(Z[ 	(NCe|  '  !Z[  !uv-j.Cy`d`j`jkIY'	( 	.uTZZ/@.A$**UVWXyy## <Zs   C;c                 "    | j                         S r   rK  r;  s    r   __repr__zAttentionMask.__repr__      ~~r4   c                 "    | j                         S r   rM  rN  s    r   __str__zAttentionMask.__str__  rP  r4   r   r  r   c                 $     | |      }||_         |S r   )r  )r7  r   r  ress       r   from_tensorzAttentionMask.from_tensor  s    &k	
r4   r   )   (   r   )r   r   r   r9  r<  rK  rO  rR  classmethodr(   r   strrU  r%   r4   r   r3  r3    sM    J
$$    cDj O  r4   r3  r   )NNN)NN)rV  r  )]r@  collections.abcr   r(   torch.nn.functionalrp   rq   r#  cache_utilsr   configuration_utilsr   utilsr   r   utils.deprecationr	   utils.genericr
   r   utils.import_utilsr   r   r   !torch.nn.attention.flex_attentionr   r   r   r   r   r   r   r~   ,torch._dynamo._trace_wrapped_higher_order_opr   
get_loggerr   loggerr8   r>   rJ   r)   rA   rC   rL   rQ   rT   rX   rZ   r\   r`   rd   rj   ru   rz   r   r   r   r   r   r   float32r&   r   r   r   r   r   __annotations__r   rB  r   r   r   r   r   r  r  r  GREENYELLOWRESETr  r  GREY_SQUAREr  r  r  YELLOW_SQUAREGREEN_SQUARErZ  r1  r3  r%   r4   r   <module>ro     s    $     1 2 . I c c  !gNN I&?RV&W #&?RV&W #02 &T 
		H	%x H h 8 C 3 s C TX 3 # c SV [_ 	3 	8 		 	5<< 	H 	S S S
 
 
hs hx hVS V VQY V    	 	C 	TW 	\d 		)< 	 	Y\ 	afamamptat 	!],,%!]!] !] *	!]
 
!]R (,),,%)) ) 	)
 *) 
)X,,% * 
	< (,,,% * 
	>
 
X 
><<>/4||>HM>bgbnbn>. 2*.!!%(- ddLLd d 	d
 d LL4'd d
d d "&d d d \\DdV 2*.(-<<LL< < 	<
 < LL4'< ;;< "&< < \\<F 2*.$$LL$ $ 	$
 $ LL4'$V 2*.66LL6 6 	6
 6 LL4'6 6r	- 	 8N7O 4 O u||  t@S  @ ?KXMXM<<XM LL9,t3XM LL	XM
 T\XM ,,%XM TzXM 4	)D0#s:;XM LXMv ?K )-(,)-oo<<o LL4'o LL	o
 T\o ,,%o oo  $o \\I$o Lod ?K
 26(,)-UU<<U LL4'U !<<$.	U
 oU  $U \\I$U LUp ?K )-(,)-pp<<p LL4'p LL	p
 T\p ,,%p op  $p \\I$p Lpf ?K
 )-)-LL<<L LL4'L o	L
  $L \\I$L LL^ ?K )-(,)-ss<<s LL4's LL	s
 T\s ,,%s os  $s \\I$s Lsn ):3* & ?K )-(,)-@-@-<<@- LL4'@- LL	@-
 T\@- ,,%@- o@-  $@- L@-J 		D$ (<.0ug.15<< 1`c 1h&ELL &r4   