
    -i5                        d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlmZ  ej4                  e      Zee G d de                    Z G d dej<                        Z G d dej<                        Z  G d dej<                        Z! G d dej<                        Z" G d dej<                        Z# G d dej<                        Z$ G d dej<                        Z% G d d e      Z& G d! d"ej<                        Z' G d# d$ej<                        Z(e G d% d&e             Z) G d' d(ej<                        Z* G d) d*ej<                        Z+e*e+d+Z, ed,-       G d. d/e)             Z- G d0 d1ej<                        Z. ed2-       G d3 d4e)             Z/g d5Z0y)6zPyTorch TVP Model    N)	dataclass)nn   )initialization)ACT2FN)load_backbone)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)PreTrainedModel)auto_docstringlogging   )	TvpConfigc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)TvpVideoGroundingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Temporal-Distance IoU loss for video grounding.
    logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
        input texts.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    Nlosslogits.hidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tupler        `/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/tvp/modeling_tvp.pyr   r   #   sq    	 &*D%

d
")'+FE$+:>M5**C/047>7;Je'',-4;r!   r   c                   :     e Zd ZdZ fdZd Zd Zd Zd Z xZ	S )TvpLossa~  
    This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
    ground-truth / prediction (supervise class and box).

    Args:
        losses (`list[str]`):
            List of all the losses to be applied.
    c                     t         |           | j                  | j                  | j                  d| _        |D ]  }|| j
                  vst        d| d       || _        y )NioudistancedurationzLoss z not supported)super__init__loss_iouloss_distanceloss_durationloss_map
ValueErrorlosses)selfr1   r   	__class__s      r"   r+   zTvpLoss.__init__B   sj    ==****

  	?D4==( 5n!=>>	? r!   c                     t        j                  ||      t        j                  ||      z
  }t        j                  ||      t        j                  ||      z
  }d|j                  d      |z  z
  }|S )z6
        Measure the intersection over union.
        r   r   min)r   r6   maxclamp)	r2   
start_timeend_timecandidates_start_timecandidates_end_timer)   interunionr'   s	            r"   r,   zTvpLoss.loss_iouO   si     		-x8599EZ\f;gg		-x8599EZ\f;gg%++!+$u,,
r!   c                 P   t        j                  t        j                  ||      d      }t        j                  t        j                  ||      d      }t        j                  t        j                  ||      t        j                  ||      z
  |      j                  d      }|S )z5
        Measure the distance of mid points.
        g       @g?r5   )r   divaddr7   r6   r8   )	r2   r9   r:   r;   r<   r)   mid_candidatesmid_groundtruthdistance_diffs	            r"   r-   zTvpLoss.loss_distanceY   s     599-BDW#XZ]^))EIIj($CSI		IIno6>Sb9ccem

%C%. 	 r!   c                     t        j                  ||      }t        j                  ||      }t        j                  t        j                  t        j                  ||      |            }|j	                  d      }|S )z5
        Measure the difference of duration.
        g?r5   )r   subsquarer@   r8   )	r2   r9   r:   r;   r<   r)   duration_candidatesduration_groundtruthduration_diffs	            r"   r.   zTvpLoss.loss_duratione   sh     $ii(;=RS$yy:>UYYuyy9LNb/cem%no%+++4r!   c                    |\  }}}t        j                  ||      }|dddf   j                         |dddf   j                         }}i }	| j                  D ],  }
|	j	                  |
 | j
                  |
   |||||      i       . |	S )am  
        This performs the loss computation.

        Args:
            logits (`torch.FloatTensor`):
                The output logits of head module.
            labels (`list[torch.FloatTensor]`):
                List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
        Nr   r   )r   mulfloatr1   updater/   )r2   r   labelsr)   r9   r:   
candidatesr;   r<   losses_dictr   s              r"   forwardzTvpLoss.forwardp   s     *0&*hYYvx0
5?15E5K5K5MzZ[]^Z^O_OeOeOg2KK 	D*t}}T*:xAVXkmuvw	
 r!   )
r   r   r   r   r+   r,   r-   r.   rR   __classcell__r3   s   @r"   r$   r$   7   s!    
	r!   r$   c                   $     e Zd Z fdZd Z xZS )TvpVisionModelc           	      \   t         |           t        |      | _        |j                  |j                  j
                  d   }nt        | j                  d      rDt        | j                  j                  d      r$| j                  j                  j
                  d   }nbt        | j                  d      rAt        | j                  j                  d      r!| j                  j                  j                  }nt        d      t        j                  ||j                  ddddd	      | _        y )
Nconfighidden_sizeshidden_sizezBackbone config not foundr   r   F)kernel_sizestridepaddinggroupsbias)r*   r+   r   backbonebackbone_configrZ   hasattrrY   r[   r0   r   Conv2dgrid_encoder_conv)r2   rY   in_channelsr3   s      r"   r+   zTvpVisionModel.__init__   s    %f-!!- 00==bAKT]]H-'$--:N:NP^2_--..;;B?KT]]H-'$--:N:NP]2^--..::K899!#"
r!   c                    |j                   \  }}}}}|j                  ||z  |||      }| j                  |      d   d   }| j                  |      }t        j
                  j                  |dd      }t        j
                  j                  |d      }|j                   dd  \  }	}
}|j                  |||	|
|      }|j                  ddd	d
d      }|S )Nfeature_mapsr      )r\   r]   T)inplacer   r      )	shapeviewra   re   r   
functional
max_pool2drelupermute)r2   pixel_values
batch_size
num_framesnum_channelsheightwidthgrid_feat_outputsgridnew_channel
new_height	new_widths               r"   rR   zTvpVisionModel.forward   s    >J>P>P;
Jfe#((j)@,PVX]^ MM,7GJ%%&78}}''!A'F}}!!$!5-1ZZ_*ZyyZj)T||Aq!Q*r!   r   r   r   r+   rR   rS   rT   s   @r"   rV   rV      s    
.r!   rV   c                   ~     e Zd ZdZ fdZdej                  dededej                  fdZdde	fd	Z
dde	fd
Z xZS )TvpVisualInputEmbeddingz;
    Takes input of both image and video (multi-frame)
    c                 r   t         |           t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _
        t        j                  d|j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                         | _        |j                  | _        |j                  | _	        y )Nr   eps)r*   r+   r   	Embeddingmax_position_embeddingsr[   position_embeddings max_grid_row_position_embeddingsrow_position_embeddings max_grid_col_position_embeddingscol_position_embeddingstoken_type_embeddings	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutr2   rY   r3   s     r"   r+   z TvpVisualInputEmbedding.__init__   s    #%<<0N0NPVPbPb#c ')||F4[4[]c]o]o'p$')||F4[4[]c]o]o'p$%'\\!V5G5G%H",,v'9'9v?T?TUzz&"<"<=060W0W-060W0W-r!   	embeddingrw   rx   returnc                     dx}}|| j                   kD  r|| j                   z  }|| j                  kD  r|| j                  z  }|j                  dddd      }t        j                  j                  |||fdd      }|j                  dddd      }|S )z
        This method allows to interpolate the pre-trained pad weights , to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   r   r   ri   bicubicFscale_factormodealign_corners)r   r   rr   r   ro   interpolate)r2   r   rw   rx   h0w0s         r"   interpolate_pos_encodingz0TvpVisualInputEmbedding.interpolate_pos_encoding   s     RD999$???B4888>>>B%%aAq1	MM--b	 . 
	 %%aAq1	r!   r   c                    |j                   \  }}}}t        | j                  |      }t        j                  |t        j
                  |j                        }| j                  |      }	dt        |j                         dz
  z  |d|fz   }
 |	j                  |
 }	t        | j                  |      }t        j                  |t        j
                  |j                        }| j                  |      }|d||f} |j                  | }|	|z   }|r6|| j                  kD  s|| j                  kD  r|| j                  |||      z   }|S ||z   }|S )af  
        Args:
            grid: (batch_size, height, width, hidden_dim)
            interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.
        Returns:
            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
        dtypedevice)r   r   r   )rm   r6   r   r   arangelongr   r   lenrn   r   r   r   )r2   rz   r   rt   rw   rx   
hidden_dim
row_heightrow_position_idsr   	row_shape	row_widthcol_position_idsr   	col_shapepositional_embeddingss                   r"   add_2d_positional_embeddingsz4TvpVisualInputEmbedding.add_2d_positional_embeddings   sQ    15

-
FE: >>G
 <<
%**T[[Y"&">">?O"PC

Oa/0J:3NN	">"9">">	"J ==uE	 <<	DKKX"&">">?O"PIz:	">"9">">	"J 7:Q Q $T:::edFkFk>k$778MvW\]]D  //Dr!   c                    |j                   \  }}}}}|j                  d      }| j                  ||      }|j                  |d|      }|j                   dd }	|j                  }
t        j                  |	t
        j                  |
      }| j                  |      }||z   }| j                  |      }| j                  |      }|S )a  
        Args:
            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
                num_frames can be 1
            interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.

        Returns:
            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

        r   r   rX   Nr   )rm   meanr   rn   r   r   zerosr   r   r   r   )r2   rz   r   rt   ru   rw   rx   rv   visual_tokensvisual_tokens_shaper   token_type_idsr   
embeddingss                 r"   rR   zTvpVisualInputEmbedding.forward   s     ?Cjj;
J|yy|00Ph0i		*b,?+11#26%% %8

SYZ $ : :> J"%::
__Z0
\\*-
r!   F)r   r   r   r   r+   r   Tensorintr   boolr   rR   rS   rT   s   @r"   r   r      sT    
X%,,  TW \a\h\h .'4 'Rd r!   r   c                   *     e Zd ZdZ fdZddZ xZS )TvpTextInputEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                         | _        y )N)padding_idxr   )r*   r+   r   r   
vocab_sizer[   pad_token_idword_embeddingsr   r   type_vocab_sizer   r   r   r   r   r   r   r   s     r"   r+   zTvpTextInputEmbeddings.__init__#  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]",,v'9'9v?T?TUzz&"<"<=r!   c                 .   ||j                         }n|j                         d d }|d   }||j                  n|j                  }|Ft        j                  |t        j                  |      }|j                  d      j                  |      }|&t        j                  |t        j                  |      }|| j                  |      }| j                  |      }| j                  |      }	||z   |	z   }
| j                  |
      }
| j                  |
      }
|
S )NrX   r   r   r   )sizer   r   r   r   	unsqueezeexpandr   r   r   r   r   r   )r2   	input_idsr   position_idsinputs_embedsinput_shape
seq_lengthr   r   r   r   s              r"   rR   zTvpTextInputEmbeddings.forward+  s    #..*K',,.s3K ^
%.%:!!@T@T <<
%**VTL'11!4;;KHL!"[[EJJvVN  00;M"66|D $ : :> J"%88;PP
__Z0
\\*-
r!   NNNNr   r   r   r   r+   rR   rS   rT   s   @r"   r   r      s    Q>r!   r   c                   ^     e Zd Z fdZdej
                  dedefdZ	 	 d	dedz  fdZ	 xZ
S )
TvpAttentionc                    t         |           |j                  |j                  z  dk7  r1t	        |d      s%t        d|j                   d|j                         |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t        j$                  |j                  |j&                        | _        t        j                  |j*                        | _        y )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r   )r*   r+   r[   num_attention_headsrc   r0   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattn_dropoutdenser   r   r   r   r   r   s     r"   r+   zTvpAttention.__init__E  sx    : ::a?PVXhHi"6#5#5"66jkq  lF  lF  kG  H  $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
JJv'J'JKYYv1163E3EF
,,v'9'9v?T?TUzz&"<"<=r!   tensorsequence_lengthrt   c                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   ri   )rn   r   r   	transpose
contiguous)r2   r   r   rt   s       r"   _reshapezTvpAttention._reshapeY  s7    KK
OT5M5MtOgOghYq!_Z\	
r!   Noutput_attentionsc                 ,   |j                   d d \  }}| j                  |      }| j                  |      }| j                  |      }| j	                  |||      }	| j	                  |||      }
| j	                  |||      }t        j                  |	|
j                  dd            }|t        j                  | j                        z  }|||z   }t        j                  j                  |d      }| j                  |      }t        j                  ||      }|j                  dd      j                         }|j!                  ||| j"                        }| j%                  |      }| j'                  |      }| j)                  ||z         }|r||f}|S |f}|S )Nri   rX   dimr   )rm   r   r   r   r   r   matmulr   mathsqrtr   r   ro   softmaxr   r   reshaper   r   r   r   )r2   r   attention_maskr   rt   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probsattn_outputoutputss                   r"   rR   zTvpAttention.forward`  s    '4&9&9"1&=#
O JJ}5((=1 JJ}5mm$5
SMM/?JO	mm$5
S !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ++O<ll?K@!++Aq1<<>!))*otGYGYZjj-ll;/ookM&AB4E;0 MX>r!   NN)r   r   r   r+   r   r   r   r   r   rR   rS   rT   s   @r"   r   r   D  sA    >(
u|| 
c 
s 
 )-	&  $;	&r!   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )TvpIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y N)r*   r+   r   r   r[   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r"   r+   zTvpIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r!   r   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )r2   r   s     r"   rR   zTvpIntermediate.forward  s&    

=100?r!   r   r   r   r+   r   r   rR   rS   rT   s   @r"   r   r     s#    9U\\ ell r!   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )TvpOutputLayerc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _	        t        j                  |j                        | _        y )Nr   )r*   r+   r   r   r   r[   r   r   r   r   r   r   r   r   s     r"   r+   zTvpOutputLayer.__init__  s`    YYv779K9KL
,,v'9'9v?T?TUzz&"<"<=r!   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   )r   r   r   )r2   r   r  s      r"   rR   zTvpOutputLayer.forward  s7    

=1]3(DEr!   r  rT   s   @r"   r  r    s1    >U\\  RWR^R^ r!   r  c                   6     e Zd Z fdZ	 	 ddedz  fdZ xZS )TvpEncodeLayerc                     t         |           t        |      | _        t	        |      | _        t        |      | _        y r   )r*   r+   r   	attentionr   intermediater  outputr   s     r"   r+   zTvpEncodeLayer.__init__  s3    %f-+F3$V,r!   Nr   c                     | j                  |||      }|d   }|dd  }| j                  |      }| j                  ||      }|f|z   }|S )N)r   r   r   )r  r  r  )	r2   r   r   r   self_attention_outputsattention_outputr   intermediate_outputlayer_outputs	            r"   rR   zTvpEncodeLayer.forward  so     "&/ "0 "

 2!4(,"//0@A{{#68HI/G+r!   r   )r   r   r   r+   r   rR   rS   rT   s   @r"   r	  r	    s#    - )-	  $;	r!   r	  c            
       X     e Zd Z fdZ	 	 	 	 ddedz  dedz  dedz  deez  fdZ xZS )	
TvpEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r*   r+   rY   r   
ModuleListrangenum_hidden_layersr	  layergradient_checkpointing)r2   rY   _r3   s      r"   r+   zTvpEncoder.__init__  sN    ]]E&JbJbDc#dqN6$:#de
&+# $es   A#Nr   output_hidden_statesreturn_dictr   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }d}d}t	        | j
                        D ](  \  }}	|r||fz   } |	|||      }
|
d   }|s ||
d   fz   }* |r||fz   }|s|f}|r||fz   }|r||fz   }|S t        ||r|nd |r|      S d       S )Nr    r   r   )last_hidden_stater   r   )rY   r  r   r  	enumerater  r
   )r2   r   r   r   r  r  all_hidden_statesall_attentionsilayer_modulelayer_outputsr   s               r"   rR   zTvpEncoder.forward  s+    &1%<k$++BYBY1B1N-TXT_T_TqTq$8$D $++JjJj 	 (4 	FOA|#$58H$H!(HYZM)!,M !/=3C2E!E	F   1]4D D$&G#!%6$88 !^$55N+/C+):~
 	
 AE
 	
r!   r   )	r   r   r   r+   r   r   r
   rR   rS   rT   s   @r"   r  r    sT    , )-,0#'*
  $;	*

 #Tk*
 D[*
 
	 *
r!   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )	TvpPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r*   r+   r   r   r[   r   Tanh
activationr   s     r"   r+   zTvpPooler.__init__  s9    YYv1163E3EF
'')r!   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r*  )r2   r   first_token_tensorpooled_outputs       r"   rR   zTvpPooler.forward  s6     +1a40

#566r!   r  rT   s   @r"   r'  r'    s#    $
U\\ ell r!   r'  c                   l    e Zd ZU eed<   dZdZdZ ej                         de
j                  fd       Zy)TvpPreTrainedModelrY   model)videotextTmodulec                    t        |t        j                  t        j                  f      r7t	        j
                  |j                  d| j                  j                         nt        |t        j                        r?t	        j                  |j                         t	        j                  |j                         nt        |t        j                        rOt	        j                  |j                  dd       |j                  Pt	        j                  |j                  d       n/t        |t               rt	        j
                  |j"                         t        |t        j                        r+|j                  t	        j                  |j                         t%        |d      rt	        j
                  |j&                         t%        |d	      rt	        j
                  |j(                         t%        |d
      rt	        j
                  |j*                         t%        |d      r t	        j
                  |j,                         yy)zInitialize the weights        )r   stdfan_outrq   )r   nonlinearityNr   pad_uppad_downpad_left	pad_right)r   r   r   r   initnormal_weightrY   initializer_ranger   zeros_r`   ones_rd   kaiming_normal_	constant_TvpModeltext_promptrc   r9  r:  r;  r<  )r2   r3  s     r"   _init_weightsz TvpPreTrainedModel._init_weights  sY    fryy",,78LLSdkk6S6ST-KK$JJv}}%		*  YVT{{&v{{A.)LL++,fbii(V[[-DKK$68$LL'6:&LL)6:&LL)6;'LL))* (r!   N)r   r   r   r   r   base_model_prefixinput_modalitiessupports_gradient_checkpointingr   no_gradr   ModulerG  r    r!   r"   r/  r/    s?    (&*#U]]_+BII + +r!   r/  c                   (     e Zd ZdZ fdZd Z xZS )TvpFrameDownPadPrompterz>
    Pad frames extracted from videos only at the bottom.
    c           	      |   |j                   dvrt        d      t        |           |j                  | _        |j
                  | _        |j                  | _        |j                   | _         t        j                  t        j                  d|j
                  d|j                  |j                  g            | _        y )NrA   replaceremove9`visual_prompter_apply` must be in (add, replace, remove)r   r   )visual_prompter_applyr0   r*   r+   visual_prompt_size	frame_nummax_img_sizer   	Parameterr   randnr:  r   s     r"   r+   z TvpFrameDownPadPrompter.__init__,  s    ''/KKXYY"(";";))"//%+%A%A"KKF,,a1J1JFL_L_`a
r!   c                    | j                   dk7  rst        j                  | j                  | j                  g|j                  |j
                        }d|| j                  | j                  z
  | j                  d d f<   ||z  }| j                   dk7  rt        j                  |j                  d   |j                  d   d| j                  | j                  g|j
                        }| j                  | j                  z
  }| j                  |d d d d d d || j                  d d f<   ||j                  |j                        z  }|S )	NrA   r   r5  rR  r   r   r   r   )rT  r   onesrW  r   r   rU  r   rm   r:  to)r2   rs   visual_prompt_maskpromptstart_points        r"   rR   zTvpFrameDownPadPrompter.forward:  s1   %%.!&""D$5$56l>P>PYeYlYl" fit0043J3JJTM^M^^`aab..L%%1[[##A&(:(:1(=q$BSBSUYUfUfg#**F ++d.E.EEKBF--F1aK$*;*;;Q>?FIIl&8&899Lr!   r   rT   s   @r"   rN  rN  '  s    
r!   rN  c                   p     e Zd ZdZ fdZdej                  dededej                  fdZd
de	fd	Z
 xZS )TvpFramePadPrompterz?
    Pad frames extracted from videos in the surroundings.
    c           
         |j                   dvrt        d      t        |           |j                  | _        |j
                  | _        |j                   | _         |j
                  |j                  dz  z
  | _        t        j                  t        j                  d|j                  d|j                  |j
                  g            | _        t        j                  t        j                  d|j                  d|j                  |j
                  g            | _        t        j                  t        j                  d|j                  d|j
                  |j                  dz  z
  |j                  g            | _        t        j                  t        j                  d|j                  d|j
                  |j                  dz  z
  |j                  g            | _        y )NrP  rS  ri   r   r   )rT  r0   r*   r+   ru   rW  rU  	base_sizer   rX  r   rY  r9  r:  r;  r<  r   s     r"   r+   zTvpFramePadPrompter.__init__Q  s   ''/KKXYY ++"//%+%A%A",,v/H/H1/LLllKKF--q&2K2KVM`M`ab
 KKF--q&2K2KVM`M`ab
 KK%%''&*C*Ca*GG--

 KK%%''&*C*Ca*GG--

r!   r_  rw   rx   r   c                    || j                   z  || j                   z  }}|j                  \  }}}}	}
|j                  ||z  ||	|
      }t        j                  j                  |||fdd      }|j                  |||||      }|S )z
        This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   Fr   )rW  rm   r   r   ro   r   )r2   r_  rw   rx   r   r   batchru   channelsprompt_heightprompt_widths              r"   interpolate_pad_encodingz,TvpFramePadPrompter.interpolate_pad_encodingw  s     $+++UT5F5F-FBCI<<@z8]L 
 2Hm\Z**b	 + 
 z8VUKr!   rj  c                 Z   |r|j                   d   |j                   d   fn| j                  | j                  f\  }}| j                  dvrt        d| j                         | j                  dv r3t	        j
                  ||g|j                  |j                        }||z  }| j                  dv rt	        j                  d| j                  d	| j                  | j                  |j                  
      }t	        j                  | j                  || j                  gd      }t	        j                  | j                  || j                  gd	      }t	        j                  |j!                  d      |gz        }|r| j#                  |||      }||j%                  |j                        z   }|S )Nr   rX   )rA   rR  rQ  z$Invalid visual_prompter_apply value )rQ  rR  r   )rQ  rA   r   r   r[  rl   r   r   )rm   rW  rT  r0   r   r\  r   r   r   ru   rd  catr;  r<  r9  r:  r   rj  r]  )r2   rs   rj  rw   rx   r^  baser_  s           r"   rR   zTvpFramePadPrompter.forward  s{    ( #\%7%7%;<##T%6%67 	
 %%-IICDD^D^C_`aa%%)>>!&VUO<CUCU^j^q^q!r..L%%);;;;q$//1dnndnn]i]p]pqDYYtT^^D!LFYYVT]]CKFYY|003vh>?F'66vvuM'&))L4F4F*GGLr!   r   )r   r   r   r   r+   r   r   r   rj  r   rR   rS   rT   s   @r"   rb  rb  L  sG    $
Lu|| S QT Y^YeYe 0d r!   rb  )framedownpadframepadzw
    The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.
    )custom_introc                        e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	e
dz  d
e
dz  de
deez  fd       Z xZS )rE  c                 "   t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        t        |      | _
        t        |      | _        t        j                  t        j                   dd|j"                  g            | _        t        j&                  |j(                        | _        |j,                  t.        vrt1        d      t/        |j,                     |      | _        | j5                          y )Nr   
   z:`visual_prompter_type` must be in (framedownpad, framepad))r*   r+   rY   rV   vision_modelr   r   r   visual_embeddingsr  encoderr'  poolerr   rX  r   rY  r[   rF  r   r   r   visual_prompter_typeTVP_PROMPTER_CLASSES_MAPPINGr0   visual_prompter	post_initr   s     r"   r+   zTvpModel.__init__  s     *6208!8!@!&)'<<QF<N<N4O(PQzz&"<"<=&&.JJYZZ;F<W<WXY_`r!   c                 .    | j                   j                  S r   r   r   )r2   s    r"   get_input_embeddingszTvpModel.get_input_embeddings  s    ...r!   c                 &    || j                   _        y r   r}  )r2   r   s     r"   set_input_embeddingszTvpModel.set_input_embeddings  s    */'r!   Nr   rs   r   r   r  r  r   r   c                    ||n| j                   j                  }| j                  | j                  ||            }| j	                  |      }	| j                  ||      }
||j                  |
j                  dd       }t        j                  |j                  d   d      j                  |j                  |j                        }t        j                  |||gd	
      }| j                  ||j                               j                  |j                        }| j                   j#                  |	j                  d   d	d	      }t        j                  ||	|
gd
      }| j%                  |||||      }|r|j&                  n|d   }| j)                  |      }| j+                  |      }| j+                  |      }|s
||f|dd z   S t-        |||j.                  |j0                        S )a  
        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpModel

        >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)rj  )r   r   ri   r   rs  )r   r   rX   r   r   )r   r   r  r  )r  pooler_outputr   r   )rY   r  rt  rz  r   ru  new_onesrm   r   r\  r]  r   r   rl  get_extended_attention_maskr   rF  r   rv  r  rw  r   r   r   r   )r2   r   rs   r   r   r  r  r   kwargstext_embedding_outputvisual_embedding_outputvisual_attention_maskpt_maskrF  embedding_outputencoder_outputsr  r-  s                     r"   rR   zTvpModel.forward  s   4 &1%<k$++BYBY((  H` a
 !%) D"&"8"83K #9 #
 %$2$;$;<S<Y<YZ\[\<]$^!jj!5!5a!8"=@@%,,N4H4H A G #YYAV'W]_`N "==ninnN^_bbclcscstN&&--.C.I.I!.LbRTU 99k3HJa%bhij,,)/!5# ' 
 BMO==RabcRd$56 LL):;]3%}58KKK)/')77&11	
 	
r!   )NNNNNNF)r   r   r   r+   r~  r  r   r   
LongTensorr   r   r   r   rR   rS   rT   s   @r"   rE  rE    s     /0  .21526)-,0#').E
##d*E
 ''$.E
 ((4/	E

  $;E
 #TkE
 D[E
 #'E
 
+	+E
 E
r!   rE  c                   $     e Zd Z fdZd Z xZS )TvpVideoGroundingHeadc                 :   t         |           t        j                  |j                  |j                  dz        | _        t        j                  |j                  dz  d      | _        t        j                         | _        t        j                         | _
        y )Nri   )r*   r+   r   r   r[   layer_0layer_1ReLUactivation_0Sigmoidactivation_1r   s     r"   r+   zTvpVideoGroundingHead.__init__  sj    yy!3!3V5G5G!5KLyy!3!3a!7;GGIJJLr!   c                     | j                  | j                  |            }| j                  | j                  |            }|S r   )r  r  r  r  )r2   r  r   s      r"   rR   zTvpVideoGroundingHead.forward  s9    ""4<<#>?""4<<#78r!   r~   rT   s   @r"   r  r    s    )r!   r  zb
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    c                        e Zd Z fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  deej                     dz  de
dz  de
dz  d	e
dz  d
e
deez  fd       Z xZS )TvpForVideoGroundingc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r   )r*   r+   rY   rE  r0  r  video_grounding_headr{  r   s     r"   r+   zTvpForVideoGrounding.__init__%  s:     f%
$9&$A!r!   Nr   rs   r   rO   r   r  r  r   r   c	           	         ||n| j                   j                  }| j                  |||||||      }
|
d   }| j                  |      }d}|pt	        g d      }|j                  | j                          |||      }|d   | j                   j                  |d   z  z   | j                   j                  |d   z  z   }|s|f|
dd z   }
||f|
z   }
|
S t        |||
j                  |
j                  	      S )
a  
        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
            The labels contains duration, start time, and end time of the video corresponding to the text.

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)r   r  r  r   r   r&   r'   r(   r)   ri   )r   r   r   r   )rY   r  r0  r  r$   r]  r   distance_loss_weightduration_loss_weightr   r   r   )r2   r   rs   r   rO   r   r  r  r   r  r   r  r   r   	criterion	loss_dicts                   r"   rR   zTvpForVideoGrounding.forward-  s)   < &1%<k$++BYBY**/!5#%=  
  
**=9 ?@ILL%!&&1I% ++22Yz5JJK++22Yz5JJK 
 i'!"+-G'G+N&!//))	
 	
r!   )NNNNNNNF)r   r   r   r+   r   r   r  r   r   r   r   r   rR   rS   rT   s   @r"   r  r    s      .21526-1)-,0#').?
##d*?
 ''$.?
 ((4/	?

 ell#d*?
  $;?
 #Tk?
 D[?
 #'?
 
(	(?
 ?
r!   r  )rE  r/  r  )1r   r   dataclassesr   r   r    r   r=  activationsr   backbone_utilsr   modeling_layersr	   modeling_outputsr
   r   r   modeling_utilsr   utilsr   r   configuration_tvpr   
get_loggerr   loggerr   rL  r$   rV   r   r   r   r   r  r	  r  r'  r/  rN  rb  ry  rE  r  r  __all__r    r!   r"   <module>r     s     !   & ! + 9 X X - , ( 
		H	% <k <  <$Mbii M`%RYY %Pnbii nb!RYY !HB299 BLbii RYY / 41
 1
j		  + + +B"bii "JW")) Wv ,#   
]
! ]

]
@BII  
I
- I

I
X Er!   