
    ib                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$  e jJ                  e&      Z' G d dejP                        Z)	 ddl*m+Z+ e+Z)e'jY                  d        G d dejP                        Z0 G d dejP                        Z1 G d dejP                        Z2 G d de      Z3 G d dejP                        Z4e G d d e             Z5e G d! d"e5             Z6 G d# d$ejP                        Z7 G d% d&ejP                        Z8 G d' d(ejP                        Z9 G d) d*ejP                        Z: G d+ d,ejP                        Z; G d- d.e      Z< ed/0       G d1 d2e5             Z= ed30       G d4 d5e5e             Z>g d6Z?y# e-$ r Y /e.$ r e'j_                  d       Y Gw xY w)7zPix2Struct modeling file    N)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torchdynamo_compilinglogging   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfigc                   &     e Zd Zd fd	Zd Z xZS )Pix2StructLayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zc
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      n/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr    zPix2StructLayerNorm.__init__4   s1     	ll5::k#:; #    c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )N   T)keepdim)tor"   float32powmeanrsqrtr%   r$   dtypefloat16bfloat16)r&   hidden_statesvariances      r*   forwardzPix2StructLayerNorm.forward<   s     !##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r+   )gư>__name__
__module____qualname__r    r:   __classcell__r)   s   @r*   r   r   3   s    $+r+   r   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )Pix2StructVisionEmbeddingsa-  
    Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
    Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
    is represented by a vector of `hidden_size` values.
    configreturnNc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _
        t        j                  |j                        | _        y N)r   r    r   Linearpatch_embed_hidden_sizer'   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr&   rD   r)   s     r*   r    z#Pix2StructVisionEmbeddings.__init__`   s}     "		&*H*H&J\J\ ]LL9K9KL!||FNNF<N<NOzz&"5"56r+   flattened_patchesc                 "   |d d d d df   j                         }|d d d d df   j                         }|d d d d dd f   }| j                  |      }| j                  |      }| j                  |      }||z   |z   }| j	                  |      }|S )Nr   r   r-   )longrJ   rM   rN   rQ   )r&   rS   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingss          r*   r:   z"Pix2StructVisionEmbeddings.forwardi   s     (1a0557'1a0557-aABh7**+<=
**;7--k:  .0>A
\\*-
r+   )
r<   r=   r>   __doc__r   r    r"   Tensorr:   r?   r@   s   @r*   rC   rC   Y   s7    7/ 7D 7 %,, r+   rC   c                   ,     e Zd Z fdZ	 	 	 ddZ xZS )Pix2StructVisionAttentionc                 |   t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        | j                  | j                  z  | _	        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        d| _        y NFbias)r   r    r'   d_kvkey_value_proj_dimnum_attention_headsn_headsattention_dropoutrQ   	inner_dimr   rH   querykeyvalueoutputgradient_checkpointingrR   s     r*   r    z"Pix2StructVisionAttention.__init__~   s    !--"(++11//(?(??YYt//eL
99T--t~~EJYYt//eL
ii0@0@uM&+#r+   c                 (    |j                   dd \  } fd} | j                  |            } | j                  |            } | j                  |            }	t	        j
                  ||j                  dd            }
|t	        j                  d j                  ||f|
j                  |
j                        } j                  r j                  rd|_        |j                         dk(  r*||ddddddf   j                  |j                        z   }nw|||j                  |j                        z   }nVt!               sLt	        j"                  |f|j                  |j                        }||j                  |j                        z   }d|z
  }|j%                  |dk(  t	        j&                  |
j                        j(                        }|
|z  }
t	        j*                  |
t	        j,                  t	        j&                  |
j                        j(                              }
t.        j0                  j3                  |
dt        j4                  	      j7                  |
      }t.        j0                  j9                  | j8                   j                  
      }t	        j
                  ||	      }|j                  dd      j;                         j=                  d j>                        } jA                  |      }|f|fz   }|r||fz   }|S )z&
        Self-attention block
        Nr-   c                     | j                         j                  dj                  j                        j	                  dd      S )
projectionr.   r   r-   )
contiguousviewrf   rd   	transpose)states
batch_sizer&   s    r*   to_projection_shapez>Pix2StructVisionAttention.forward.<locals>.to_projection_shape   s<    $$&++JDLL$JaJabllmnpqrrr+   r   r   devicer5   Tr.   )dimr5   ptraining)!shaperi   rj   rk   r"   matmulrs   zerosrf   rx   r5   rm   r|   requires_gradry   r0   r   r#   masked_fillfinfominmaxtensorr   
functionalsoftmaxr1   type_asrQ   rq   rr   rh   rl   )r&   r8   attention_maskposition_biasoutput_attentions
seq_lengthrv   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsru   s   `              @r*   r:   z!Pix2StructVisionAttention.forward   s    "/!4!4Ra!8
J	s +4::m+DE )-)@A
*4::m+DE lJ,@,@A,FG !KKDLL*j9&--W]WcWcM **t}}.2+!!#q( -q$a?O0P0S0STaThTh0i i+ -0A0A-BVBV0W W-/!&,]5I5IQ^QdQd" !.0A0A-BVBV0W W-M,88!9KU[[Y_YeYeMfMjMjk&&65<<FLL0I0M0M#NO }},,V5==,QYYZ`a }},,\T\\TXTaTa,bll<> "++Aq1<<>CCJPRTXTbTbckk+..M#33/Gr+   )NNFr;   r@   s   @r*   r^   r^   }   s    ,$ Gr+   r^   c                   *     e Zd Zdef fdZd Z xZS )Pix2StructVisionMlprD   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y r`   r   r    r   rH   r'   d_ffwi_0wi_1worO   rP   rQ   r   dense_act_fnactrR   s     r*   r    zPix2StructVisionMlp.__init__       IIf00&++EJ	IIf00&++EJ	))FKK););%Hzz&"5"56&--.r+   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S rG   r   r   r   rQ   
isinstancer   r$   r"   r\   r5   int8r0   r&   r8   hidden_geluhidden_linears       r*   r:   zPix2StructVisionMlp.forward       hhtyy78		-0#m3]3 tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r+   )r<   r=   r>   r   r    r:   r?   r@   s   @r*   r   r      s    /5 /r+   r   c                        e Zd Zdeddf fdZ	 	 d	dej                  dej                  dz  dedeej                  ej                  f   eej                     z  fdZ	 xZ
S )
Pix2StructVisionLayerrD   rE   Nc                 *   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   r(   )r   r    chunk_size_feed_forwardseq_len_dimr^   	attentionr   mlpr   r'   layer_norm_epspre_mlp_layer_normpre_attention_layer_normrR   s     r*   r    zPix2StructVisionLayer.__init__   ss    '-'E'E$26:&v."5f6H6HfNcNc"d(;F<N<NTZTiTi(j%r+   r8   r   r   c                     |}| j                  |      }| j                  |||      }|d   }|dd  }||z   }| j                  |      }| j                  |      |z   }|f|z   }|S )N)r   r   r   r   )r   r   r   r   )	r&   r8   r   r   residualself_attention_outputsattention_outputr   layer_outputs	            r*   r:   zPix2StructVisionLayer.forward   s     ! 55mD!%)/ "0 "

 2!4(, )83 ..}=xx-=/G+r+   NF)r<   r=   r>   r   r    r"   r\   booltupler:   r?   r@   s   @r*   r   r      sz    k/ kD k /3"'	|| t+  	
 
u||U\\)	*U5<<-@	@r+   r   c                        e Zd Zdeddf fdZ	 	 	 	 ddej                  dej                  dz  deded	edee	z  fd
Z
 xZS )Pix2StructVisionEncoderrD   rE   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r   r    rD   r   
ModuleListrangenum_hidden_layersr   layerrm   )r&   rD   _r)   s      r*   r    z Pix2StructVisionEncoder.__init__  sP    ]]5QWQiQiKj#ka$9&$A#kl
&+# $ls   A#r8   r   r   output_hidden_statesreturn_dictc                     |rdnd }|rdnd }t        | j                        D ](  \  }}	|r||fz   } |	|||      }
|
d   }|s ||
d   fz   }* |r||fz   }|st        d |||fD              S t        |||      S )N r   r   c              3   &   K   | ]	  }||  y wrG   r   .0vs     r*   	<genexpr>z2Pix2StructVisionEncoder.forward.<locals>.<genexpr>?  s     mq_`_lms   last_hidden_stater8   
attentions)	enumerater   r   r   )r&   r8   r   r   r   r   all_hidden_statesall_self_attentionsilayer_modulelayer_outputss              r*   r:   zPix2StructVisionEncoder.forward%  s     #7BD$5b4(4 		POA|#$58H$H!(HYZM)!,M &9]1=M<O&O#		P   1]4D Dm]4EGZ$[mmm++*
 	
r+   )NFFT)r<   r=   r>   r   r    r"   r\   r   r   r   r:   r?   r@   s   @r*   r   r     sv    ,5 ,$ , /3"'%* 
||
 t+
  	

 #
 
 
	 
r+   r   c                   d    e Zd ZU eed<   dZdZed        Z e	j                         d        Zd Zy)Pix2StructPreTrainedModelrD   )imagetextFc                 v    t        j                  t              }t        j                  t              }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r"   r   r   r   )r&   r   
input_maskdummy_inputss       r*   r   z&Pix2StructPreTrainedModel.dummy_inputsN  s6    LL.	\\*-
!*"&0

 r+   c                    | j                   j                  }t        |t              r$t	        j
                  |j                  |dz         yt        |t              r8t        | j                   t              r | j                   j                  j                  n| j                   j                  }t        | j                   t              r | j                   j                  j                  n| j                   j                  }t	        j                  |j                  j                  d||dz  z         t        |j                  d      r?|j                  j                  )t	        j                   |j                  j                         t	        j                  |j"                  j                  d||dz  z         t        |j"                  d      r?|j"                  j                  )t	        j                   |j"                  j                         t	        j                  |j$                  j                  d||dz  z         t        |j$                  d      rA|j$                  j                  *t	        j                   |j$                  j                         yyyt        |t&              rt        | j                   t              r | j                   j                  j                  n| j                   j                  }t        | j                   t              r | j                   j                  j(                  n| j                   j                  }t        | j                   t              r | j                   j                  j*                  n| j                   j*                  }t	        j                  |j,                  j                  d|||z  dz  z         t	        j                  |j.                  j                  d||dz  z         t	        j                  |j0                  j                  d||dz  z         t	        j                  |j2                  j                  d|||z  dz  z         |j4                  r3t	        j                  |j6                  j                  d||dz  z         yyt        |t8        j:                        rt        | j                   t              r | j                   j                  j                  n| j                   j                  }t	        j                  |j                  d||dz  z         |j<                  Et?        |j                  dd      s-t	        j                   |j                  |j<                            yyyt        |t@              rt        | j                   t              r | j                   j                  j                  n| j                   j                  }t	        j                  |jB                  j                  d||dz  z         yt        |t8        jD                  t8        jF                  f      rct	        jH                  |j                  d| j                   jJ                         |j                   t	        j                   |j                         yyt        |t              r-|j                   t	        jL                  |j                         yyt        |t8        j:                        rt	        j                  |j                  d| j                   jJ                         |j<                  Et?        |j                  dd      s-t	        j                   |j                  |j<                            yyyy)	zInitialize the weights      ?g        g      )r3   stdrb   N_is_hf_initializedF)'rD   initializer_factorr   r   init	constant_r$    Pix2StructTextDenseGatedActDenser   text_configr'   r   normal_r   hasattrrb   zeros_r   r   Pix2StructTextAttentionrc   	num_headsri   rj   rk   rl   has_relative_attention_biasrelative_attention_biasr   rK   padding_idxgetattrPix2StructTextModellm_headrH   Conv2dtrunc_normal_initializer_rangeones_)r&   modulefactorr'   r   rd   rf   s          r*   _init_weightsz'Pix2StructPreTrainedModel._init_weightsY  sE    //f12NN6==&3,7 @A dkk+;< ''33[[,, 
 4>dkkK[3\4;;**//bfbmbmbrbrDLL++#6kVZEZ;[\v{{F+0@0@0LFKK,,-LL++#6kVZEZ;[\v{{F+0@0@0LFKK,,-LL))DT>9RSvyy&)fiinn.HFIINN+ /I) 78 dkk+;< ''33[[,,  1;4;;HX0Y'',,_c_j_j_v_v 
 dkk+;< ''11[[**  LL,,3F{UgGglpFp<qrLL**&KQUDU:VWLL,,3FkSWFW<XYLL--CVRdHdimGm=no11V;;BBRX]hmq\qRrs 2- dkk+;< ''33[[,,  LLSfQU@U6VW!!-gfmmMach6iFMM&*<*<=> 7j- 34 dkk+;< ''33[[,,  LL..SfY]H]>^_BII 67v}}3DKK<Y<YZ{{&FKK( ' 34}}(

6==) )-LLSdkk6S6ST!!-gfmmMach6iFMM&*<*<=> 7j- .r+   c                 8   | j                   j                  }| j                   j                  }|t        d      |j	                  |j
                        }|dd df   j                         |ddd f<   ||d<   |t        d      |j                  |dk(  |       |S )Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information..r.   r   ).r   z1self.model.config.pad_token_id has to be defined.)rD   decoder_start_token_idpad_token_id
ValueError	new_zerosr}   clonemasked_fill_)r&   r   r   r   shifted_input_idss        r*   _shift_rightz&Pix2StructPreTrainedModel._shift_right  s    !%!C!C{{//!)< 
 &//	@%.sCRCx%8%>%>%@#qr'"$:&!PQQ&&'8D'@,O  r+   N)r<   r=   r>   r   __annotations__input_modalities_can_compile_fullgraphpropertyr   r"   no_gradr   r  r   r+   r*   r   r   G  sJ    ("  U]]_I? I?X!r+   r   c                        e Zd ZU eed<   dZdZdZdgZdef fdZ	d Z
e	 	 	 	 	 ddej                  dz  d	ej                  dz  d
edz  dedz  dedz  deez  fd       Z xZS )Pix2StructVisionModelrD   rS   )r   Tr   c                     t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        | j                          y Nr   )r   r    rD   rC   rX   r   encoderr   r'   r   	layernorm	post_initrR   s     r*   r    zPix2StructVisionModel.__init__  sU     4V<.v6,V-?-?VEZEZ[ 	r+   c                 .    | j                   j                  S rG   )rX   rJ   r&   s    r*   get_input_embeddingsz*Pix2StructVisionModel.get_input_embeddings  s    ///r+   Nr   r   r   r   rE   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |#|j                  d      dk7  j                         }| j                  |      }| j                  |||||      }|d   }	| j                  |	      }	|s|	f}
|
|dd z   S t        |	|j                  |j                        S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://huggingface.co/papers/2210.03347) (figure 5) for more details.

        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
        Nz%You have to specify flattened_patchesr.   ry   r   )r   r   r   r   r   r   )rD   r   r   use_return_dictr   sumfloatrX   r  r  r   r8   r   )r&   rS   r   r   r   r   kwargsembedding_outputencoder_outputssequence_outputhead_outputss              r*   r:   zPix2StructVisionModel.forward  s   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$DEE!/333;q@GGIN??+<=,,)/!5# ' 
 *!,..9+-L/!""555-)77&11
 	
r+   )NNNNN)r<   r=   r>   r   r  main_input_namer  supports_gradient_checkpointing_no_split_modulesr    r  r   r"   r\   r   r   r   r:   r?   r@   s   @r*   r
  r
    s    "")O!&*#01
5 
0  26.2)-,0#'H
 <<$.H
 t+H
  $;	H

 #TkH
 D[H
 
+	+H
 H
r+   r
  c                   *     e Zd Zdef fdZd Z xZS )r   rD   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y r`   r   rR   s     r*   r    z)Pix2StructTextDenseGatedActDense.__init__!  r   r+   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S rG   r   r   s       r*   r:   z(Pix2StructTextDenseGatedActDense.forward)  r   r+   r<   r=   r>   r   r    r:   r?   r@   s   @r*   r   r      s    /3 /r+   r   c                   *     e Zd Zdef fdZd Z xZS )Pix2StructTextLayerFFrD   c                     t         |           t        |      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y r  )r   r    r   DenseReluDenser   r'   layer_norm_epsilon
layer_normr   rO   rP   rQ   rR   s     r*   r    zPix2StructTextLayerFF.__init__>  sK    >vF-f.@.@fF_F_`zz&"5"56r+   c                 r    | j                  |      }| j                  |      }|| j                  |      z   }|S rG   )r)  r'  rQ   )r&   r8   forwarded_statess      r*   r:   zPix2StructTextLayerFF.forwardF  s=    ??=9../?@%5E(FFr+   r#  r@   s   @r*   r%  r%  =  s    73 7r+   r%  c                   b     e Zd Zddededz  f fdZed	d       Zd
dZ	 	 	 	 	 	 	 	 ddZ	 xZ
S )r   NrD   	layer_idxc                    t         |           || _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _	        |j                  | _        | j                  | j                  z  | _        || _        |-t        j                  d| j                   j"                   d       t%        j&                  | j
                  | j
                  d      | _        t%        j&                  | j
                  | j
                  d      | _        t%        j&                  | j
                  | j
                  d      | _        t%        j&                  | j
                  | j
                  d      | _        | j                  r/t%        j0                  | j                  | j                        | _        d| _        y )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fra   )r   r    r   relative_attention_num_bucketsrelative_attention_max_distancer'   rc   rd   r   rf   rP   rQ   rh   r-  loggerwarning_oncer)   r<   r   rH   ri   rj   rk   rl   rK   r   rm   r&   rD   r   r-  r)   s       r*   r    z Pix2StructTextAttention.__init__N  sg   +F(.4.S.S+/5/U/U,!--"(++''**(?(??"*4>>+B+B*C D, , YYt//1A1AN
99T--t/?/?eLYYt//1A1AN
ii 0 0$2B2BO+++-<<8[8[]a]i]i+jD(&+#r+   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r-   r   )r0   r"   rU   absr   
zeros_likelogr  math	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r*   _relative_position_bucketz1Pix2StructTextAttention._relative_position_bucketj  s(   . AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r+   c                    | | j                   j                  j                  }|.t        j                  |t        j
                  |      dddf   }n|dddf   j                  |      }t        j                  |t        j
                  |      dddf   }||z
  }| j                  |d| j                  | j                        }| j                  |      }	|	j                  g d      j                  d      }	|	S )z%Compute binned relative position biasN)r5   rx   F)r<  r=  r>  )r-   r   r   r   )r   r$   rx   r"   arangerU   r0   rC  r/  r0  permute	unsqueeze)
r&   query_length
key_lengthrx   cache_positioncontext_positionmemory_positionr;  relative_position_bucketvaluess
             r*   compute_biasz$Pix2StructTextAttention.compute_bias  s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+.>>#'#A#A;;==	 $B $
  --.FG	*44Q7r+   c
                    |j                   dd \  }
}|du}| j                  |      }|j                  |
d| j                  | j                        j                  dd      }|Qt        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }| j#                  |      }|j                  |
d| j                  | j                        j                  dd      }|j                  |
d| j                  | j                        j                  dd      }|D|s|	nd}	|j%                  ||| j                  d|	i      \  }}|rd|j                  | j                  <   t'        j(                  ||j                  dd            }||j                   d   }||n|	d   dz   }| j*                  sZt'        j,                  d| j                  ||f|j.                  |j0                  	      }| j2                  rE| j4                  r9d|_        n1| j9                  |||j.                  |	
      }|dddd| dddf   }|#|ddddddd|j                   d   f   }||z   }|}||z  }t:        j<                  j?                  |jA                         d      jC                  |      }t:        j<                  jE                  || jD                  | j4                        }t'        j(                  ||      }|j                  dd      jG                         }|j                  |
d| jH                        }| jK                  |      }||f}|r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr-   r.   r   rJ  Tr   rw   )rx   rJ  r  rz   )&r}   ri   rr   rf   rd   rs   r   r	   
is_updatedgetr-  cross_attention_cacheself_attention_cachelayerskeysrN  rj   rk   updater"   r~   r   r   rx   r5   rm   r|   r   rO  r   r   r   r  r   rQ   rq   rh   rl   )r&   r8   maskkey_value_statesr   past_key_valuesrH  	use_cacher   rJ  ru   r   is_cross_attentionr   rR  curr_past_key_valuescurrent_statesr   r   r   rI  real_seq_lengthcausal_maskr   r   r   r   s                              r*   r:   zPix2StructTextAttention.forward  s   " "/!4!4Ra!8
J .T9zz-0#((RtG^G^_iijkmno &:oGZ+[(3377GJ!'6'L'L$'6'K'K$#2 -?)]/j-44T^^DIIJ/66t~~FMML.1J::n5L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL*7It+?+F+Fdnn?OQ_>`,(
L &AEO..t~~> lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;,&& }},,V\\^,DLLVT}},,\T\\TXTaTa,bll<>!++Aq1<<>!&&z2t~~Fkk+../Gr+   FN)T       )NN)NNNNNFFN)r<   r=   r>   r   intr    staticmethodrC  rO  r:   r?   r@   s   @r*   r   r   M  sX    ,3 ,cfimcm ,8 -  - `0 ar+   r   c                   @     e Zd Zddedz  f fdZ	 	 	 	 	 	 ddZ xZS ) Pix2StructTextLayerSelfAttentionNr-  c                     t         |           t        |||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )Nr   r-  r   r   r    r   r   r   r'   r(  r)  r   rO   rP   rQ   r3  s       r*   r    z)Pix2StructTextLayerSelfAttention.__init__  sU    00KW`
 .f.@.@fF_F_`zz&"5"56r+   c           	          | j                  |      }| j                  |||||||      }	|| j                  |	d         z   }|f|	dd  z   }
|
S )N)rY  r   r[  r\  r   rJ  r   r   r)  r   rQ   )r&   r8   r   r   r[  r\  r   rJ  normed_hidden_statesr   r   s              r*   r:   z(Pix2StructTextLayerSelfAttention.forward  sq      $}=>> '+/) * 
 &5Ea5H(II "%5ab%99r+   rb  )NNNFFNr<   r=   r>   re  r    r:   r?   r@   s   @r*   rh  rh    s-    7SSWZ 7 r+   rh  c                   B     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 ddZ xZS )!Pix2StructTextLayerCrossAttentionNr-  c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NFrj  r   rk  )r&   rD   r-  r)   s      r*   r    z*Pix2StructTextLayerCrossAttention.__init__:  sP    0UZfop-f.@.@fF_F_`zz&"5"56r+   c
                     | j                  |      }
| j                  |
||||||||		      }|| j                  |d         z   }|f|dd  z   }|S )N)rY  rZ  r   r[  r\  rH  r   rJ  r   r   rm  )r&   r8   rZ  r   r   r[  r\  rH  r   rJ  rn  r   r   r   s                 r*   r:   z)Pix2StructTextLayerCrossAttention.forward@  sv      $}=>> -'+%/) * 

 %t||4DQ4G'HH/$4QR$88r+   rG   )NNNFNFNro  r@   s   @r*   rq  rq  9  s/    7#* 7 r+   rq  c                   H     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 	 	 	 ddZ xZS )Pix2StructTextBlockNr-  c                     t         |           t        |||      | _        t	        ||      | _        t        |      | _        y )Nrj  )r-  )r   r    rh  self_attentionrq  encoder_decoder_attentionr%  r   r3  s       r*   r    zPix2StructTextBlock.__init__^  sH    >(C
 *K*
&
 )0r+   c           
         | j                  ||||||	|      }|d   }|dd  }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }|d u}|r| j                  ||||||d   dz   ||	      }|d   }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }||dd  z   }| j                  |      }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }|f}||z   S )N)r   r   r[  r\  r   rJ  r   r   i  )r   r   r.   )rZ  r   r   r[  rH  r\  r   )rw  r5   r"   r6   isinfanyr   r   clamprx  r   )r&   r8   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr[  r\  r   r   rJ  r   attention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                     r*   r:   zPix2StructTextBlock.forwardn  s    "&!4!4)'+/) "5 "
 /q12126 %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM2$>&*&D&D!65; /+B/!3#"3 'E 	'# 4A6M ""emm3M8R8V8V8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O / %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM "***r+   rb  )
NNNNNNFFTNro  r@   s   @r*   ru  ru  ]  s9    1SSWZ 1& "#&*?+r+   ru  z3
    The standalone text decoder of Pix2Struct
    )custom_introc                       e Zd ZU eed<   dZdgZddiZdZ fdZ	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  ded	z  ded	z  ded	z  ded	z  dej                  d	z  ded	z  dej                  d	z  deej                  df   ez  fd       Z xZS )r   rD   )r   ru  zlm_head.weightzembed_tokens.weightTc                 V   t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        |t        |dk(        |       c}      | _        t        |j
                  |j                        | _        t        j                   |j"                        | _        t        j&                  |j
                  |j                  d      | _        | j+                          d| _        y c c}w )Nr   rj  r   Fra   )r   r    r   rK   
vocab_sizer'   embed_tokensr   r   
num_layersru  r   r   r   r(  final_layer_normrO   rP   rQ   rH   r   r  rm   )r&   rD   r   r)   s      r*   r    zPix2StructTextModel.__init__  s     LL):):F<N<NO]] v001 $FQRSV`ab

 !4F4F4FFLeLe fzz&"5"56yy!3!3V5F5FUS 	&+#s   &!D&c                     || _         y rG   )r  r&   new_embeddingss     r*   set_input_embeddingsz(Pix2StructTextModel.set_input_embeddings  s
    *r+   Nr   r   r}  r~  inputs_embedsr[  r\  r   r   labelsr   rJ  rE   .c                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }| j
                  r%| j                  r|rt        j                  d       d}||t        d      |&|j                         }|j                  d|d         }n!||j                         dd }nt        d      |$| j                  J d       | j                  |      }|\  }}|rc|a| j                   j                  r5t        t        | j                         t        | j                               }nt        | j                         }d	}||d	   }n||j!                         }|%t#        j$                  |||z   |j&                  
      }|9||j!                         |z   n|}t#        j(                  |||j&                  
      }| j                   j*                  rt-        | j                   ||||      }nX|ddddddf   }|j/                  |j0                        }d|z
  t#        j2                  |j0                        j4                  z  }|M|j                         \  }}}||f}|!t#        j(                  ||j&                  
      }| j7                  |      }nd}|	rdnd}|rdnd}|rdnd}d}d}| j9                  |      }t;        | j<                        D ]L  \  }} |	r||fz   } | ||||||||||
      }!|!d	   }|!d   }|	|!|rdnd   }|s8||!d   fz   }|D||!d   fz   }N | j?                  |      }| j9                  |      }| jA                  |      }"|	r||fz   }d}#|
|
j/                  |"j&                        }
tC        jD                  dd      }$ |$|"jG                         j                  d|"j                  d            |
jG                         j                  d            }#|stI        d |#|"||||fD              S tK        |#|"||||      S )aU  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
            Training](./t5#training).

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        ```
        NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer.   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddings)rD   r   )rx   )rD   r  r   rJ  r[  )r5   r   r   )r[  r\  r   rJ  r   r   r-      r   r3   )ignore_index	reductionc              3   $   K   | ]  }|| 
 y wrG   r   r   s     r*   r   z.Pix2StructTextModel.forward.<locals>.<genexpr>  s       = s   )losslogitsr[  r8   r   cross_attentions)&rD   r\  r   r   r  rm   r|   r1  warningr   sizerr   r  is_encoder_decoderr	   r   get_seq_lengthr"   rE  rx   r#   
is_decoderr   r0   r5   r   r   invert_attention_maskrQ   r   r   r  r   r   CrossEntropyLossrq   r   r   )%r&   r   r   r}  r~  r  r[  r\  r   r   r  r   rJ  r  input_shaperu   r   past_key_values_lengthmask_seq_lengthra  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr   all_attentionsall_cross_attentionsr   r  r8   r   r   r   r  r  loss_fcts%                                        r*   r:   zPix2StructTextModel.forward  s   V "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]&&4==YNNl I ]%>stt"#..*K!r;r?;I&',,.s3Kdee $$0p2pp0 --i8M!,
J0{{--"5 4l$++6V# #/dkk"B!"%%3A%6"(%4%C%C%E"!"\\&(>(KTaThThN ! BQA\..0:=bl  #ZZ
OML`L`aN;;!!,{{+-- /K )D$)9:K%..}/B/B.CK,M<O<O0P0T0TTK !,=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+"6BD0d&7rd(,%]3(4 	VOA|#$58H$H!(%/- /#"3-M *!,M
 *!,M$00=CTaZ[0\- !/=3C2E!E(4+?=QRCSBU+U(;	V> --m<]3m,   1]4D DYYv}}-F**OHF--/44RRI6K\K\K^KcKcdfKghD  #%"(   1++%1
 	
r+   )NNNNNNNNNNNN)r<   r=   r>   r   r  r  r  _tied_weights_keysr  r    r  r   r"   
LongTensorFloatTensorr   r   r   r   r:   r?   r@   s   @r*   r   r     sv    !  ./*,AB&*#,&+  .237:>;?15(,!%)-,0*.#'26E
##d*E
 ))D0E
  %0047	E

 !& 1 1D 8E
 ''$.E
 E
 $;E
  $;E
 #TkE
   4'E
 D[E
 ((4/E
 
u  #%	&)J	JE
 E
r+   r   zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c                       e Zd ZU eed<   dZdef fdZd Zd Zde	j                  fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d	z  d
ej                  d	z  dej                   d	z  dej"                  d	z  deeej                        d	z  ded	z  dej                   d	z  dej(                  d	z  ded	z  ded	z  ded	z  ded	z  dej                   d	z  deej                     ez  fd       Z xZS )"Pix2StructForConditionalGenerationrD   rS   c                     t         |   |       t        |j                        | _        t        |j                        | _        |j                  | _        | j                          y rG   )
r   r    r
  vision_configr  r   r   decoderis_vqar  rR   s     r*   r    z+Pix2StructForConditionalGeneration.__init__  sK     ,V-A-AB*6+=+=>mm 	r+   c                 6    | j                   j                         S rG   )r  r  r  s    r*   r  z7Pix2StructForConditionalGeneration.get_input_embeddings  s    ||0022r+   c                 :    | j                   j                  |       y rG   )r  r  r  s     r*   r  z7Pix2StructForConditionalGeneration.set_input_embeddings  s    )).9r+   rE   c                 6    | j                   j                         S rG   )r  get_output_embeddingsr  s    r*   r  z8Pix2StructForConditionalGeneration.get_output_embeddings  s    ||1133r+   c                 :    | j                   j                  |       y rG   )r  set_output_embeddingsr  s     r*   r  z8Pix2StructForConditionalGeneration.set_output_embeddings  s    **>:r+   Nr   r   r   r  r[  r  decoder_inputs_embedsr\  r   r   r   rJ  c                    |	|	n| j                   j                  j                  }	||n| j                   j                  }|| j	                  |||
||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }|U|S|Q| j                  |      }||n2|j                  | j                   j                        j                         }d|dddf<   | j                  |||||||	|
||||      }|s||z   S t        |j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  |j"                  |j$                  	      S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`

            The process of flattening the pixel patches is done by `Pix2StructProcessor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder.

        Example:

        Inference:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> # autoregressive generation
        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A stop sign is on a street corner.

        >>> # conditional generation
        >>> text = "A picture of"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A picture of a stop sign with a red stop sign
        ```

        Training:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "A stop sign is on the street corner."

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> labels = processor(text=text, return_tensors="pt").input_ids

        >>> # forward pass
        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> print(f"{loss.item():.5f}")
        5.94282
        ```N)rS   r   r   r   r   r   r   r-   r   )r   r   r  r[  r}  r~  r\  r   r   r  r   rJ  )	r  r  r[  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater}  encoder_attentions)rD   r   r\  r  r  r   r   lenr  ner   r  r  r   r  r  r[  r8   r   r  r   )r&   rS   r   r   r   r  r[  r  r  r\  r   r   r   rJ  r  r8   decoder_outputss                    r*   r:   z*Pix2StructForConditionalGeneration.forward  s   P "+!6IDKK<S<S<]<]	%0%<k$++B]B] ""ll"3-"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 *5 '&))$++*B*BCIIK # ,-"1a4( ,,'1/+"/#1/!5#) ' 
 "_44 %%"))+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r+   )NNNNNNNNNNNNN)r<   r=   r>   r   r  r  r    r  r  r   Moduler  r  r   r"   r  r  
BoolTensorr   r   r\   r   r   r:   r?   r@   s   @r*   r  r    s    )O	/ 	3:4ryy 4;  7;3759:>BF(,*.59!%)-,0#'26d
 ,,t3d
 ))D0d
 !++d2	d

 !& 0 04 7d
 uU%6%6784?d
 d
   4'd
  %||d2d
 $;d
  $;d
 #Tkd
 D[d
 ((4/d
  
u  	!$6	6!d
 d
r+   r  )r   r  r
  r   )@r[   r8  r"   r    r   r   activationsr   cache_utilsr   r   r	   
generationr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_pix2structr   r   r   
get_loggerr<   r1  r  r   apex.normalizationrA   infoImportError	Exceptionr  rC   r^   r   r   r   r   r
  r   r%  r   rh  rq  ru  r   r  __all__r   r+   r*   <module>r     s
       & ! C C ) / 9  .  e d 
		H	%+")) +2
a/&
KKij! !HW		 Wv")) :&6 &R&
bii &
R q! q! q!h _
5 _
 _
Fryy :BII  Ebii ERryy F!		 !HP+4 P+f 
c
3 c

c
L 
@
)BO @

@
Fc(  	 a
NN_`as   G G.G.-G.