
    ik                    .   d Z ddlmZ ddlmZ ddlZddlmZmZ ddlm	Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ ddl m!Z!m"Z"m#Z#  e       rddl$m%Z%  ejL                  e'      Z(dej                  dej                  fdZ)dej                  dej                  fdZ*ee G d de                    Z+dedefdZ,dedefdZ-d Z.d Z/e ed        G d! d"e                    Z0e ed#        G d$ d%e                    Z1 G d& d'ejd                        Z3 G d( d)ejd                        Z4 G d* d+ejd                        Z5 G d, d-ejd                        Z6 G d. d/e      Z7e G d0 d1e             Z8 G d2 d3ejd                        Z9 G d4 d5e8      Z: G d6 d7e8      Z; G d8 d9e8      Z< G d: d;e8      Z=e G d< d=e8             Z> G d> d?ejd                        Z? G d@ dAejd                        Z@ G dB dCe8      ZAg dDZBy)EzPyTorch OWL-ViT model.    )	dataclass)AnyN)Tensornn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleis_vision_availablelogging	torch_int   )OwlViTConfigOwlViTTextConfigOwlViTVisionConfig)center_to_corners_formatlogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr    )r   s    f/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/owlvit/modeling_owlvit.pycontrastive_lossr'   4   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r'   t)r)   caption_loss
image_losss      r&   owlvit_lossr.   9   s,    #J/L!*,,.1J:%,,r(   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)OwlViTOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`OwlViTVisionModel`].
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`OwlViTTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`OwlViTVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw)r6   r7   Ngetattrto_tuple.0kselfs     r&   	<genexpr>z(OwlViTOutput.to_tuple.<locals>.<genexpr>_   =      
  LLDGRYZ^`aRbRkRkRmm
   -0tuplekeysrA   s   `r&   r=   zOwlViTOutput.to_tuple^   #     
YY[
 
 	
r(   )__name__
__module____qualname____doc__r1   r#   FloatTensor__annotations__r2   r3   r4   r5   r6   r   r7   rF   r   r=    r(   r&   r0   r0   ?   s    ( &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:
%* 
r(   r0   r+   c                    | j                         r>| j                  t        j                  t        j                  fv r| S | j                         S | j                  t        j                  t        j                  fv r| S | j                         S N)	is_floating_pointdtyper#   float32float64floatint32int64int)r+   s    r&   _upcastr[   f   s`    GGu}}==qL1779LGGU[[99qFquuwFr(   boxesc                 f    t        |       } | dddf   | dddf   z
  | dddf   | dddf   z
  z  S )a  
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    N   r   r   r   )r[   )r\   s    r&   box_arear_   o   sB     ENE!Q$K%1+%%1+ad*CDDr(   c                 ^   t        |       }t        |      }t        j                  | d d d d df   |d d d df         }t        j                  | d d d dd f   |d d dd f         }||z
  j	                  d      }|d d d d df   |d d d d df   z  }|d d d f   |z   |z
  }||z  }	|	|fS )Nr^   r   minr   )r_   r#   maxrb   clamp)
boxes1boxes2area1area2left_topright_bottomwidth_heightinterunionious
             r&   box_iouro      s    VEVEyy4!,fQUm<H99VAtQRK0&AB-@L 8+22q29LAq!LAq$99E!T'NU"U*E
%-C:r(   c                    | ddddf   | ddddf   k\  j                         st        d|        |ddddf   |ddddf   k\  j                         st        d|       t        | |      \  }}t        j                  | dddddf   |ddddf         }t        j
                  | dddddf   |ddddf         }||z
  j                  d      }|dddddf   |dddddf   z  }|||z
  |z  z
  S )z
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    Nr^   z<boxes1 must be in [x0, y0, x1, y1] (corner) format, but got z<boxes2 must be in [x0, y0, x1, y1] (corner) format, but got r   ra   r   )all
ValueErrorro   r#   rb   rc   rd   )re   rf   rn   rm   top_leftbottom_rightrk   areas           r&   generalized_box_iourv      s*    1ab5MVArrE]*//1WX^W_`aa1ab5MVArrE]*//1WX^W_`aa(JCyy4!,fQUm<H99VAtQRK0&AB-@L 8+22q29L1a <1a#88D$,$&&&r(   z6
    Output type of [`OwlViTForObjectDetection`].
    )custom_introc                   D   e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed	<   dZeed
<   dZeed<   dee   fdZy)OwlViTObjectDetectionOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
        Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
        image embeddings for each patch.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`OwlViTTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`OwlViTVisionModel`].
    Nr1   	loss_dictr   
pred_boxesr4   r5   class_embedsr6   r7   r   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywr:   r;   r>   s     r&   rB   z7OwlViTObjectDetectionOutput.to_tuple.<locals>.<genexpr>   rC   rD   rE   rH   s   `r&   r=   z$OwlViTObjectDetectionOutput.to_tuple   rI   r(   )rJ   rK   rL   rM   r1   r#   rN   rO   rz   dictr   r{   r4   r5   r|   r6   r   r7   rF   r   r=   rP   r(   r&   ry   ry      s    8 &*D%

d
")!Itd{!'+FE$++/J!!D(/,0K""T)0-1L%##d*1-1L%##d*148186:3:
%* 
r(   ry   zM
    Output type of [`OwlViTForObjectDetection.image_guided_detection`].
    c                   0   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZeed	<   dZeed
<   dee   fdZy)&OwlViTImageGuidedObjectDetectionOutputa  
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
        Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
        image embeddings for each patch.
    query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
        Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
        image embeddings for each patch.
    target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual target image in the batch
        (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual query image in the batch
        (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`OwlViTTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`OwlViTVisionModel`].
    Nr   r5   query_image_embedstarget_pred_boxesquery_pred_boxesr|   r6   r7   r   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywr:   r;   r>   s     r&   rB   zBOwlViTImageGuidedObjectDetectionOutput.to_tuple.<locals>.<genexpr>
  rC   rD   rE   rH   s   `r&   r=   z/OwlViTImageGuidedObjectDetectionOutput.to_tuple	  rI   r(   )rJ   rK   rL   rM   r   r#   rN   rO   r5   r   r   r   r|   r6   r   r7   rF   r   r=   rP   r(   r&   r   r      s    8 (,FE$+-1L%##d*137))D0726u((4/615e''$.5-1L%##d*148186:3:
%* 
r(   r   c                        e Zd Zdef fdZdej                  dededej                  fdZddej                  d	e
dej                  fd
Z xZS )OwlViTVisionEmbeddingsconfigc                    t         |           |j                  | _        || _        |j                  | _        t        j                  t        j                  |j                              | _
        t        j                  |j                  | j
                  |j                  |j                  d      | _        |j                  |j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j
                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasr^   r   position_idsr   
persistent)super__init__
patch_sizer   hidden_size	embed_dimr   	Parameterr#   randnclass_embeddingConv2dnum_channelspatch_embedding
image_sizenum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr$   expandrA   r   	__class__s     r&   r   zOwlViTVisionEmbeddings.__init__  s    ++++!||EKK8J8J,KL!yy++))$$ 
 #--1B1BBqH!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr(   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr   g      ?r   r^   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer#   jit
is_tracingr   r   r   reshapepermuter   r!   interpolateviewcat)rA   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r&   interpolate_pos_encodingz/OwlViTVisionEmbeddings.interpolate_pos_encoding&  sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr(   pixel_valuesr   c                 h   |j                   \  }}}}| j                  |      }|j                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }	|r|	| j                  |	||      z   }	|	S |	| j                  | j                        z   }	|	S )Nr^   r   r   r   )r   r   flatten	transposer   r   r#   r   r   r   r   )
rA   r   r   
batch_size_r   r   patch_embedsr|   r   s
             r&   forwardzOwlViTVisionEmbeddings.forwardL  s    '3'9'9$
Avu++L9#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr(   F)rJ   rK   rL   r   r   r#   r   rZ   r   rN   boolr   __classcell__r   s   @r&   r   r     sm    q1 q*$D5<< $D $DUX $D]b]i]i $DL
E$5$5 
QU 
bgbnbn 
r(   r   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
OwlViTTextEmbeddingsr   c                 ^   t         |           t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _        | j                  dt        j                  |j                        j                  d      d       y )Nr   r   Fr   )r   r   r   r   
vocab_sizer   token_embeddingmax_position_embeddingsr   r   r#   r$   r   r   s     r&   r   zOwlViTTextEmbeddings.__init__Z  s    !||F,=,=v?Q?QR"$,,v/M/MvOaOa"b 	ELL)G)GHOOPWXej 	 	
r(   N	input_idsr   inputs_embedsr   c                     ||j                   d   n|j                   d   }|| j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nr   )r   r   r   r   )rA   r   r   r   
seq_lengthposition_embeddingsr   s          r&   r   zOwlViTTextEmbeddings.forwardd  s{     -6,AY__R(}GZGZ[]G^
,,Q^<L  00;M"55lC"%88
r(   )NNN)rJ   rK   rL   r   r   r#   
LongTensorrN   r   r   r   r   s   @r&   r   r   Y  sk    
/ 
 .20426	##d* &&- ((4/	
 
r(   r   c                        e Zd ZdZ fdZdej                  dedefdZ	 	 ddej                  d	ej                  dz  d
e	dz  de
ej                  ej                  dz  e
ej                     dz  f   fdZ xZS )OwlViTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                 
   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      )r   r   r   r   r   num_attention_heads	num_headshead_dimrr   scaleattention_dropoutdropoutr   Lineark_projv_projq_projout_projr   s     r&   r   zOwlViTAttention.__init__{  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar(   tensorseq_lenbszc                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   r^   )r   r   r   r   
contiguous)rA   r   r   r   s       r&   _shapezOwlViTAttention._shape  s7    {{3GQQRSUVWbbddr(   Nhidden_statesattention_maskoutput_attentionsr   c                    |j                         \  }}}| j                  |      | j                  z  }| j                  | j	                  |      d|      }	| j                  | j                  |      d|      }
|| j                  z  d| j                  f} | j                  |||      j                  | } |	j                  | }	 |
j                  | }
|	j                  d      }t        j                  ||	j                  dd            }|j                         || j                  z  ||fk7  r/t        d|| j                  z  ||f d|j                                |{|j                         |d||fk7  r#t        d|d||f d|j                                |j                  || j                  ||      |z   }|j                  || j                  z  ||      }t        j                  j                  |d      }|r?|j                  || j                  ||      }|j                  || j                  z  ||      }nd}t        j                  j!                  || j                   | j"                  	      }|j%                  |
j&                        }t        j                  ||
      }|j                         || j                  z  || j                  fk7  r7t        d
|| j                  || j                  f d|j                                |j                  || j                  || j                        }|j                  dd      }|j)                  |||      }| j+                  |      }||fS )z#Input shape: Batch x Time x Channelr   r   r^   z$Attention weights should be of size z	, but is Nz!Attention mask should be of size r   )ptrainingz `attn_output` should be of size )r   r   r   r   r   r   r   r   r   r#   bmmr   rr   r   r!   softmaxr   r   torT   r   r   )rA   r   r   r   kwargsr   tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                    r&   r   zOwlViTAttention.forward  sT    #0"4"4"6Wi {{=1DJJ>[[]!;RE
{{4;;}#=r3GDNN*B>
Ct{{<#>CCZP$Z__j1
(|((*5//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
  ]]<#5#56
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1!))#w	BmmK0111r(   NF)rJ   rK   rL   rM   r   r#   r   rZ   r   r   rF   r   r   r   s   @r&   r   r   x  s    GB&eU\\ eC ec e /3).	E2||E2 t+E2  $;	E2 
u||U\\D0%2E2LL	ME2r(   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )	OwlViTMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y rR   )r   r   r   r	   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   s     r&   r   zOwlViTMLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJr(   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rR   )r  r
  r  )rA   r   s     r&   r   zOwlViTMLP.forward  s4    /**=9/r(   )rJ   rK   rL   r   r#   r   r   r   r   s   @r&   r  r    s$    KU\\ ell r(   r  c                        e Zd Zdef fdZ	 d
dej                  dej                  dedz  dee	   de
ej                     f
d	Z xZS )OwlViTEncoderLayerr   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y Neps)r   r   r   r   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r  mlplayer_norm2r   s     r&   r   zOwlViTEncoderLayer.__init__  sm    ++(0<<F<Q<QRV$<<F<Q<QRr(   r   r   r   Nr   r   c                     |}| j                  |      } | j                  d|||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   rP   )r  r  r  r  )rA   r   r   r   r   residualr  outputss           r&   r   zOwlViTEncoderLayer.forward  s    " !((7&4dnn '
')/'
 	'
#| !=0 ((7/ =0 "&Gr(   r   )rJ   rK   rL   r   r   r#   r   r   r   r   rF   rN   r   r   r   s   @r&   r  r    sh    S| S */	&||& &  $;	&
 +,& 
u  	!&r(   r  c                   r    e Zd ZU eed<   dZdZdZdgZ e	j                         dej                  fd       Zy)	OwlViTPreTrainedModelr   owlvit)imagetextTr  modulec                 $   | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t              rt	        j
                  |j                   d|j"                  dz  |z         t	        j
                  |j$                  j                  |j                   j&                  |z         t	        j
                  |j                  j                  |j                   j&                  |z         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t(              r|j"                  dz  d|j                   j*                  z  dz  z  |z  }|j"                  dz  |z  }t	        j
                  |j,                  j                  |       t	        j
                  |j.                  j                  |       t	        j
                  |j0                  j                  |       t	        j
                  |j2                  j                  |       nt        |t4              r|j                   j6                  dz  d|j                   j*                  z  dz  z  |z  }d|j                   j6                  z  dz  |z  }t	        j
                  |j8                  j                  |       t	        j
                  |j:                  j                  |       nt        |t<              rt	        j
                  |j>                  j                  |j@                  dz  |z         t	        j
                  |jB                  j                  |jD                  dz  |z         t	        jF                  |jH                  | j                   jJ                         nTt        |tL              rDt	        j                  |jN                  |jQ                  |jR                  |jT                               t        |tV        jX                        r>t	        jZ                  |j\                         t	        j^                  |j                         t        |tV        j`                        rOt	        j
                  |j                  d|       |j\                   t	        jZ                  |j\                         y	y	y	)
zInitialize the weights        g{Gz?)meanstdr   r   r   )r'  r^   N)1r   initializer_factor
isinstancer   initnormal_r   r   r   copy_r   r#   r$   r   r   r   r   r   r   initializer_ranger   num_hidden_layersr   r   r   r   r  r   r  r  OwlViTModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dim	constant_logit_scalelogit_scale_init_valueOwlViTForObjectDetectionbox_biascompute_box_biasnum_patches_heightnum_patches_widthr   r  zeros_r   ones_r   )rA   r#  factorin_proj_stdout_proj_stdfc_stds         r&   _init_weightsz#OwlViTPreTrainedModel._init_weights$  s    //f23LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 67LL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh0!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B	*!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<,LL&&--))4/&8 LL((//++T1F: NN6--t{{/Q/QR 89JJv(?(?@Y@Y[a[s[s(tufbll+KK$JJv}}%fbii(LLSf={{&FKK( ' )r(   N)rJ   rK   rL   r   rO   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modulesr#   no_gradr   ModulerB  rP   r(   r&   r  r    sH     (&*#-.U]]_*)BII *) *)r(   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dz  dedz  dedz  dedz  d	e	e
z  f
d
Z xZS )OwlViTEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`OwlViTEncoderLayer`].

    Args:
        config: OwlViTConfig
    r   c                     t         |           t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        d| _        y c c}w r  )	r   r   r   
ModuleListranger.  r  layersgradient_checkpointing)rA   r   r   r   s      r&   r   zOwlViTEncoder.__init__[  sH    mmvOgOgIh$iA%7%?$ij&+# %js   ANr   r   output_hidden_statesreturn_dictr   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	| j                  D ]'  }
|r||	fz   } |
|	|fd|i|}|d   }	|s||d   fz   }) |r||	fz   }|st        d |	||fD              S t        |	||      S )a0  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`).
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrP   r   r   r   c              3   &   K   | ]	  }||  y wrR   rP   )r?   vs     r&   rB   z(OwlViTEncoder.forward.<locals>.<genexpr>  s     eqWXWde   )last_hidden_stater   
attentions)r   r   rP  use_return_dictrN  rF   r   )rA   r   r   r   rP  rQ  r   encoder_statesall_attentionsr   encoder_layerlayer_outputss               r&   r   zOwlViTEncoder.forward`  s   4 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%![[ 	FM#!/=2B!B) #4 	M *!,M !/=3C2E!E	F  +}.>>Ne]NN$Seee+>Vd
 	
r(   NNNN)rJ   rK   rL   rM   r   r   r#   r   r   rF   r   r   r   r   s   @r&   rJ  rJ  R  ss    ,| , /3)-,0#':
 t+:
  $;	:

 #Tk:
 D[:
 
	 :
r(   rJ  c                        e Zd Zdef fdZe	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dedz  dedz  d	edz  d
e	e
z  fd       Z xZS )OwlViTTextTransformerr   c                     t         |   |       |j                  }t        |      | _        t        |      | _        t        j                  ||j                        | _
        | j                          y r  )r   r   r   r   r   rJ  encoderr   r  r  final_layer_norm	post_init)rA   r   r   r   s      r&   r   zOwlViTTextTransformer.__init__  sX     &&	.v6$V, "YF<Q<Q R 	r(   Nr   r   r   r   rP  rQ  r   c           
      l   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j	                         }|j                  d|d         }| j                  ||      }	t        | j                   |	|t        j                  |	j                  d   |	j                        d      }|j                  dd        | j                  d|	||||dd	|}
|
d
   }| j                  |      }|t        j                  |j                  d
   |j                        |j                  t        j                         j#                  d      j                  |j                        f   }|s
||f|
dd z   S t%        |||
j&                  |
j(                        S )a|  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        Nr   )r   r   r   r   )r   r   r   cache_positionpast_key_values	is_causalT)r   r   r   rP  rQ  rg  r   r   rV  pooler_outputr   rW  rP   )r   r   rP  rX  r   r   r   r
   r#   r$   r   r    popra  rb  r   rZ   argmaxr   r   rW  )rA   r   r   r   r   rP  rQ  r   input_shaper   encoder_outputsrV  pooled_outputs                r&   r   zOwlViTTextTransformer.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]nn&NN2{27	),W+;;') <<(;(;A(>}G[G[\ 
 	

;%&$,, 
')/!5#
 
 ,A. 112CD *LL*003<M<T<TULL#**r*2556G6N6NOQ

 %}58KKK)/')77&11	
 	
r(   )NNNNN)rJ   rK   rL   r   r   r   r#   r   r   rF   r   r   r   r   s   @r&   r_  r_    s    	/ 	  /3,0)-,0#'?
<<?
 t+?
 llT)	?

  $;?
 #Tk?
 D[?
 
+	+?
 ?
r(   r_  c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
e	 	 	 	 ddej                  d	ej                  dz  d
edz  dedz  dedz  deez  fd       Z xZS )OwlViTTextModelr   )r"  c                 d    t         |   |       t        |      | _        | j	                          y rR   )r   r   r_  
text_modelrc  r   s     r&   r   zOwlViTTextModel.__init__  s&     /7r(   r   c                 B    | j                   j                  j                  S rR   rr  r   r   rH   s    r&   get_input_embeddingsz$OwlViTTextModel.get_input_embeddings  s    ))999r(   c                 :    || j                   j                  _        y rR   rt  )rA   values     r&   set_input_embeddingsz$OwlViTTextModel.set_input_embeddings  s    5:""2r(   Nr   r   r   rP  rQ  c                 .    | j                  |||||      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Examples:
        ```python
        >>> from transformers import AutoProcessor, OwlViTTextModel

        >>> model = OwlViTTextModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r   rP  rQ  )rr  )rA   r   r   r   rP  rQ  r   s          r&   r   zOwlViTTextModel.forward  s)    > )/!5#  
 	
r(   r]  )rJ   rK   rL   r   rO   rD  r   r   rH  ru  rx  r   r#   r   r   rF   r   r   r   r   s   @r&   rp  rp    s     / :bii :;  /3)-,0#'$
<<$
 t+$
  $;	$

 #Tk$
 D[$
 
+	+$
 $
r(   rp  c                        e Zd Zdef fdZe	 	 	 	 ddej                  dedz  dedz  dedz  dedz  d	e	e
z  fd
       Z xZS )OwlViTVisionTransformerr   c                 D   t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t        |      | _
        t	        j
                  |j                  |j                        | _        | j                          y r  )r   r   r   r   r   r  r   r  pre_layernormrJ  ra  post_layernormrc  r   s     r&   r   z OwlViTVisionTransformer.__init__%  sr     08\\&*<*<&BWBWX$V, ll6+=+=6CXCXY 	r(   Nr   r   rP  r   rQ  r   c                 ,   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                  j
                  j                  j                  }|j                  |      }| j	                  ||      }| j                  |      } | j                  d||||d|}	|	d   }
|
d d dd d f   }| j                  |      }|s
|
|f|	dd  z   S t        |
||	j                  |	j                        S )N)r   )r   r   rP  rQ  r   r   rh  rP   )r   r   rP  rX  r   r   r   rT   r   r~  ra  r  r   r   rW  )rA   r   r   rP  r   rQ  r   expected_input_dtyper   rm  rV  rn  s               r&   r   zOwlViTVisionTransformer.forward0  s9    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  $>>EEKK#';<Ogh**=9&$,, 
'/!5#	

 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r(   )NNFN)rJ   rK   rL   r   r   r   r#   rN   r   rF   r   r   r   r   s   @r&   r|  r|  $  s    	1 	  *.,005#'+
''+
  $;+
 #Tk	+

 #'++
 D[+
 
+	++
 +
r(   r|  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 	 	 	 ddej                  dz  dedz  d	edz  d
ededz  deez  fd       Z xZS )OwlViTVisionModelr   r   )r!  c                 d    t         |   |       t        |      | _        | j	                          y rR   )r   r   r|  vision_modelrc  r   s     r&   r   zOwlViTVisionModel.__init__d  s'     3F;r(   r   c                 B    | j                   j                  j                  S rR   )r  r   r   rH   s    r&   ru  z&OwlViTVisionModel.get_input_embeddingsj  s      ++;;;r(   Nr   rP  r   rQ  c                 .    | j                  |||||      S )a'  
        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, OwlViTVisionModel

        >>> model = OwlViTVisionModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   rP  r   rQ  )r  )rA   r   r   rP  r   rQ  r   s          r&   r   zOwlViTVisionModel.forwardm  s+    <   %/!5%=# ! 
 	
r(   NNNFN)rJ   rK   rL   r   rO   main_input_namerD  r   r   rH  ru  r   r#   rN   r   rF   r   r   r   r   s   @r&   r  r  _  s    $O!1 <bii <  26)-,0).#'#
''$.#
  $;#
 #Tk	#

 #'#
 D[#
 
+	+#
 #
r(   r  c                       e Zd ZU eed<   def fdZee	 ddej                  dej                  dz  de
e   deez  fd              Zee	 dd	ej                  d
ede
e   deez  fd              Ze	 	 	 	 	 	 	 	 	 ddej"                  dz  d	ej$                  dz  dej                  dz  dedz  dedz  dedz  d
ededz  dedz  deez  fd       Z xZS )r/  r   c                 <   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        |j                  | _        t        |      | _        t        |      | _        t#        j$                  | j                  | j                  d      | _        t#        j$                  | j                  | j                  d      | _        t#        j*                  t-        j.                  |j0                              | _        | j5                          y )NzMconfig.text_config is expected to be of type OwlViTTextConfig but is of type .zQconfig.vision_config is expected to be of type OwlViTVisionConfig but is of type F)r   )r   r   r)  text_configr   	TypeErrortypevision_configr   projection_dimr   r1  r3  r_  rr  r|  r  r   r   r2  r0  r   r#   r   r6  r5  rc  )rA   r   r  r  r   s       r&   r   zOwlViTModel.__init__  sS    &,,.>?++,-Q0 
 &..0BC--./q2 
 ((,,$33)55 - 9 9/<3MB!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<V5R5R(ST 	r(   Nr   r   r   r   c                 v     | j                   d||dd|}|j                  }| j                  |      |_        |S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   rQ  rP   )rr  ri  r0  )rA   r   r   r   text_outputsrn  s         r&   get_text_featureszOwlViTModel.get_text_features  sS    8 4C4?? 4
)4
 	4
 %22%)%9%9-%H"r(   r   r   c                 r     | j                   d||dd|}| j                  |j                        |_        |S )a  
        Examples:
        ```python
        >>> import torch
        >>> from transformers.image_utils import load_image
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```Tr   r   rQ  rP   )r  r2  ri  )rA   r   r   r   vision_outputss        r&   get_image_featureszOwlViTModel.get_image_features  sP    2 6GT5F5F 6
%%=6
 	6
 (,'='=n>Z>Z'[$r(   return_lossr   rP  return_base_image_embedsrQ  c
           	      (   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  |||||	      }| j                  |||||	      }|d   }| j                  |      }|d   }| j                  |      }|t        j                  j                  |ddd      z  }|t        j                  j                  |ddd      z  }| j                  j                         j                  |j                        }t        j                  ||j!                               |z  }|j!                         }d}|rt#        |      }|}|	s||||||f}||f|z   S |S t%        |||||||	      S )
aw  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        return_base_image_embeds (`bool`, *optional*):
            Whether or not to return the base image embeddings.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr  rz  r   r^   r   T)ordr   keepdim)r1   r2   r3   r4   r5   r6   r7   )r   r   rP  rX  r  rr  r0  r2  r#   linalgnormr5  expr   r    matmulr+   r.   r0   )rA   r   r   r   r  r   rP  r   r  rQ  r   r  r  r4   r5   text_embeds_normr5  r3   r2   r1   outputs                        r&   r   zOwlViTModel.forward  s   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 )/!5# ' 
 #1o**;7%a(--l; $ell&7&7!QS]a&7&bb&):):;ASU_c):)dd &&**,//0C0CD,,'79IJ[X*,,./D&&lT`bpqF)-)9TGf$EvE-+#%* .
 	
r(   rR   r   )	NNNNNNFNN)rJ   rK   rL   r   rO   r   r   r   r#   r   r   r   rF   r   r  r   r  r   rN   r0   r   r   r   s   @r&   r/  r/    s   | @  /3#<<# t+# +,	#
 
+	+#  #J  */ll #' +,	
 
+	+  B  .215.2#')-,0).04#']
##d*]
 ''$.]
 t+	]

 D[]
  $;]
 #Tk]
 #']
 #'+]
 D[]
 
	]
 ]
r(   r/  c                   b     e Zd Zddedef fdZdej                  dej                  fdZ	 xZ
S )OwlViTBoxPredictionHeadr   out_dimc                 "   t         |           |j                  j                  }t	        j
                  ||      | _        t	        j
                  ||      | _        t	        j                         | _	        t	        j
                  ||      | _
        y rR   )r   r   r  r   r   r   dense0dense1GELUgeludense2)rA   r   r  r   r   s       r&   r   z OwlViTBoxPredictionHead.__init__d  sb    $$00iiu-iiu-GGI	iiw/r(   image_featuresr   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rR   )r  r  r  r  )rA   r  r  s      r&   r   zOwlViTBoxPredictionHead.forwardm  sM    ^,6"V$6"V$r(   )   )rJ   rK   rL   r   rZ   r   r#   r   rN   r   r   r   s   @r&   r  r  c  s3    0| 0c 0ell u7H7H r(   r  c            	            e Zd Zdef fdZdej                  dej                  dz  dej                  dz  deej                     fdZ	 xZ
S )	OwlViTClassPredictionHeadr   c                    t         |           |j                  j                  }|j                  j                  | _        t        j                  | j
                  |      | _        t        j                  | j
                  d      | _	        t        j                  | j
                  d      | _
        t        j                         | _        y )Nr   )r   r   r  r   r  	query_dimr   r   r  logit_shiftr5  ELUelu)rA   r   r  r   s      r&   r   z"OwlViTClassPredictionHead.__init__w  s    $$00--99ii899T^^Q799T^^Q7668r(   r5   query_embedsN
query_maskr   c                 0   | j                  |      }|S|j                  }|j                  d d \  }}t        j                  ||| j
                  f      j                  |      }||fS |t        j                  j                  |dd      dz   z  }|t        j                  j                  |dd      dz   z  }t        j                  d||      }| j                  |      }	| j                  |      }
| j                  |
      dz   }
||	z   |
z  }||j                  dkD  rt        j                  |d	      }t        j                  |d
k(  t        j                   |j"                        j$                  |      }|j                  t        j&                        }||fS )Nr^   r   T)r   r  gư>z...pd,...qd->...pqr   r   r   r   )r  r    r   r#   zerosr  r   r  r  einsumr  r5  r  ndimr   wherefinforT   rb   rU   )rA   r5   r  r  image_class_embedsr    r   r   pred_logitsr  r5  s              r&   r   z!OwlViTClassPredictionHead.forward  s    "[[6'..F&8&>&>r&B#J++z;&OPSSTZ[K!344 05<<3D3DEW]_im3D3nqu3uv#u||'8'82W['8'\_c'cd ll#79K\Z &&|4&&|4hh{+a/"[0K?!""__ZR@
++jAou{{;CTCT7U7Y7Y[fgK%..7K/00r(   )rJ   rK   rL   r   r   r#   rN   r   rF   r   r   r   s   @r&   r  r  v  s_    	| 	!1''!1 ''$.!1 LL4'	!1
 
u  	!!1r(   r  c                       e Zd ZU eed<   def fdZedededej                  fd       Z
dededej                  fdZ	 ddej                  d	ej                  d
edej                  fdZ	 	 ddej                  dej                  dz  dej                  dz  deej                     fdZ	 	 	 d dej                  dej                  dej                  dedz  dedz  d
edeej                     fdZ	 	 	 d dej                  dedz  dedz  d
edeej                     f
dZ	 ddej                  dej                  d
edej                  fdZe	 	 	 	 	 d!dej                  dej                  dz  dedz  dedz  d
ededz  defd       Ze	 	 	 	 	 d!dej                  dej                  dej                  dz  dedz  dedz  d
ededz  defd       Z xZS )"r7  r   c                    t         |   |       t        |      | _        t	        |      | _        t        |      | _        t        j                  |j                  j                  |j                  j                        | _        t        j                         | _        || _        | j                   j                  j"                  | j                   j                  j$                  z  | _        | j                   j                  j"                  | j                   j                  j$                  z  | _        | j+                  d| j-                  | j&                  | j(                        d       | j/                          y )Nr  r8  Fr   )r   r   r/  r   r  
class_headr  box_headr   r  r  r   r  
layer_normSigmoidsigmoidr   r   r   r:  r;  r   r9  rc  r   s     r&   r   z!OwlViTForObjectDetection.__init__  s    !&)3F;/7,,v';';'G'GVMaMaMpMpqzz|"&++";";"F"F$++JcJcJnJn"n!%!:!:!E!EIbIbImIm!m--d.E.EtG]G]^kp 	 	
 	r(   r:  r;  r   c                 j   t        j                  d|dz   t         j                        }t        j                  d| dz   t         j                        }t        j                  ||d      \  }}t        j                  ||fd      }|dxx   |z  cc<   |dxx   | z  cc<   |j                  dd	      }|S )
Nr   )rT   xy)indexingr   r   .r   .r   r^   )r#   r$   rU   meshgridstackr   )r:  r;  x_coordinatesy_coordinatesxxyybox_coordinatess          r&   !normalize_grid_corner_coordinatesz:OwlViTForObjectDetection.normalize_grid_corner_coordinates  s     Q(9A(=U]]SQ(:Q(>emmT}tLB  ++r2hB7#44#55 *..r15r(   c                    | j                  ||      }t        j                  |dd      }t        j                  |dz         t        j                  | dz         z
  }t        j
                  |d      }|dxx   |z  cc<   |dxx   |z  cc<   t        j                  |dz         t        j                  | dz         z
  }t        j                  ||gd      }|S )Nr%  g      ?g-C6?r  r  r   r   )r  r#   cliploglog1p	full_liker   )rA   r:  r;  r  box_coord_biasbox_sizebox_size_biasr8  s           r&   r9  z)OwlViTForObjectDetection.compute_box_bias  s    @@ASUfg**_c3? ?T#9:U[[/IY\`I`=aa ??>37--..		(T/2U[[(TAQ5RR 99nm<"Er(   image_featsfeature_mapr   c                     | j                  |      }|r$|j                  \  }}}}| j                  ||      }n| j                  }|j	                  |j
                        }||z  }| j                  |      }|S )a  
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
            interpolate_pos_encoding:
                Whether to interpolate the pre-trained position encodings.
        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
        )r  r   r9  r8  r   r    r  )	rA   r  r  r   r{   r   r:  r;  r8  s	            r&   box_predictorz&OwlViTForObjectDetection.box_predictor  s|    & ]];/
 $:E:K:K7A!#4a,,-?ARSH}}H;;{112h
\\*-
r(   Nr  r  c                 6    | j                  |||      \  }}||fS )a8  
        Args:
            image_feats:
                Features extracted from the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
        )r  )rA   r  r  r  r  r  s         r&   class_predictorz(OwlViTForObjectDetection.class_predictor  s)     -1OOKWa,b)(/00r(   r   r   r   r   rP  c           	         | j                  ||||||d      }|rX|j                  \  }}}	}
|	| j                  j                  j                  z  }|
| j                  j                  j                  z  }n| j
                  }| j                  }|j                  d   }| j                   j                  j                  |      }t        j                  |d d d dd d f   |d d d df   j                        }|d d dd d d f   |z  }| j                  |      }|j                  d   |||j                  d   f}|j                  |      }|d   }|||fS )NT)r   r   r   r   rP  r   rQ  r   r   r   )r   r   r   r  r   r:  r;  r7   r  r  r#   broadcast_tor  r   )rA   r   r   r   r   rP  r   r  r   r   r   r:  r;  rV  r5   class_token_outnew_sizer4   s                     r&   image_text_embedderz,OwlViTForObjectDetection.image_text_embedder  sv    ++%)/!5%=  
 $"."4"4Aq&%!'4;;+D+D+O+O!O %)B)B)M)M M!%!8!8 $ 6 6 $77:{{//>>?PQ  ,,\!RaR(-C\RSUXVXUXRXEYE_E_` $Aqr1H-?|4 q!r"	
 $++H5bk\733r(   c                    | j                   j                  ||d      }|rX|j                  \  }}}}|| j                  j                  j
                  z  }	|| j                  j                  j
                  z  }
n| j                  }	| j                  }
|d   }| j                   j                  j                  |      }t        j                  |d d d dd d f   |d d d df   j                        }|d d dd d d f   |z  }| j                  |      }|j                  d   |	|
|j                  d   f}|j                  |      }||fS )NTr  r   r   r   )r   r  r   r   r  r   r:  r;  r  r#   r  r  r   )rA   r   r   rP  r   r  r   r   r   r:  r;  rV  r5   r  r  s                  r&   image_embedderz'OwlViTForObjectDetection.image_embedderE  s_    11%@Xfj 2 
 $"."4"4Aq&%!'4;;+D+D+O+O!O %)B)B)M)M M!%!8!8 $ 6 6 +1-{{//>>?PQ  ,,\!RaR(-C\RSUXVXUXRXEYE_E_` $Aqr1H-?|4 q!r"	
 $++H5n--r(   query_image_featuresquery_feature_mapc                 j   | j                  |      \  }}| j                  |||      }t        |      }g }g }	|j                  }
t	        |j
                  d         D ]  }t        j                  g dg|
      }||   }t        ||      \  }}t        j                  |d   dk(        rt        ||      }t        j                  |      dz  }|d   |k\  j                         }|j                         s||   |j                  d         }t        j                  ||   d      }t        j                   d||      }|t        j"                  |         }|j%                  ||   |          |	j%                  |       " |r+t        j&                  |      }t        j&                  |	      }nd	\  }}|||fS )
Nr   )r   r   r   r   r   r%  g?r   )axiszd,id->iNN)r  r  r   r    rM  r   r#   r   ro   rq   rv   rc   nonzeronumelsqueezer&  r  argminappendr  )rA   r  r  r   r   r|   r{   pred_boxes_as_cornersbest_class_embedsbest_box_indicespred_boxes_deviceieach_query_boxeach_query_pred_boxesiousiou_thresholdselected_indsselected_embeddingsmean_embedsmean_simbest_box_indr  box_indicess                          r&   embed_image_queryz*OwlViTForObjectDetection.embed_image_queryo  s    ../CD<''(<>OQij
 8 D 188+11!45 	6A"\\<.ARSN$9!$<!n.CDGD! yyaC(*>;PQ "IIdOc1M!!W5>>@M""$&21om6K6KA6N&O##jjaqA <<	;@ST,U\\(-CD!((a)FG ''5'	6*  ;;'89L++&67K(2%L+[*44r(   query_pixel_valuesrQ  c           
         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      d   }| j	                  ||||      \  }}	|j
                  \  }
}}}t        j                  ||
||z  |f      }|j
                  \  }
}}}t        j                  ||
||z  |f      }| j                  |||      \  }}}| j                  ||      \  }}| j                  |||      }|s+|||||||	j                         f}t        d |D              }|S t        ||||||d|	      S )a  
        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values of query image(s) to be detected. Pass in one query image per target image.

        Examples:
        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, OwlViTForObjectDetection

        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch16")
        >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch16")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
        >>> with httpx.stream("GET", query_url) as response:
        ...     query_image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model.image_guided_detection(**inputs)
        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        >>> target_sizes = torch.Tensor([image.size[::-1]])
        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> results = processor.post_process_image_guided_detection(
        ...     outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes
        ... )
        >>> i = 0  # Retrieve predictions for the first image
        >>> boxes, scores = results[i]["boxes"], results[i]["scores"]
        >>> for box, score in zip(boxes, scores):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
        Detected similar object with confidence 0.856 at location [10.94, 50.4, 315.8, 471.39]
        Detected similar object with confidence 1.0 at location [334.84, 25.33, 636.16, 374.71]
        ```N)r   r   r   )r   r   rP  r   )r  r  c              3   &   K   | ]	  }||  y wrR   rP   r?   xs     r&   rB   zBOwlViTForObjectDetection.image_guided_detection.<locals>.<genexpr>       >1>rU  )r5   r   r   r   r   r|   r6   r7   )r   r   rP  rQ  r  r   r#   r   r  r  r  r=   rF   r   )rA   r   r  r   rP  r   rQ  r  r  r  r   r:  r;  
hidden_dimr  query_image_featsr  r  r   r  r|   r   r  s                          r&   image_guided_detectionz/OwlViTForObjectDetection.image_guided_detection  s   ^ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY !//+F^ 0 

 '+&9&9%/!5%=	 ': '
#^ ITHYHYE
&(9:mmK*>PSd>dfp1qrHYH_H_E
&(9:!MM
,>AR,RT^_
 <@;Q;Q02J<
8&(8
 '+&:&:{am&:&n#l !..{KIab!! '')F >f>>FM5$0/-%" .	
 		
r(   c           	         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||||      \  }	}
}|j
                  }|j                  }|
j                  \  }}}}t        j                  |
|||z  |f      }|j                  d   |z  }|	j                  |||	j                  d         }	|j                  |||j                  d         }|d   dkD  }| j                  ||	|      \  }}| j                  ||
|      }|s9|||	|
||j                         |j                         f}t        d |D              }|S t        |
|	|||||      S )a	  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids).
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
            `vision_model_last_hidden_state` under returned tensors for more detail.

        Examples:
        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import torch

        >>> from transformers import OwlViTProcessor, OwlViTForObjectDetection

        >>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
        >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        >>> target_sizes = torch.tensor([(image.height, image.width)])
        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> results = processor.post_process_grounded_object_detection(
        ...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
        ... )
        >>> # Retrieve predictions for the first image for the corresponding text queries
        >>> result = results[0]
        >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
        >>> for box, score, text_label in zip(boxes, scores, text_labels):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
        Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
        Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
        ```)r   r   r   r   rP  r   r   r   r  c              3   &   K   | ]	  }||  y wrR   rP   r	  s     r&   rB   z3OwlViTForObjectDetection.forward.<locals>.<genexpr>k  r  rU  )r5   r4   r{   r   r|   r6   r7   )r   r   rP  rQ  r  r6   r7   r   r#   r   r  r  r=   rF   ry   )rA   r   r   r   r   rP  r   rQ  r   r  r  r  r  r  r   r:  r;  r  r  max_text_queriesr  r  r|   r{   r  s                            r&   r   z OwlViTForObjectDetection.forward  s   n 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY .2-E-E%)/!5%= .F .
*k7 00 44HSHYHYE
&(9:mmK*>PSd>dfp1qr %??1-;#++J8H,J\J\]_J`a %%j2BIOOTVDWX	v&*
 '+&:&:;V`&a#l ''[BZ[
%%''')F >f>>FM*$$!%* .
 	
r(   r   r  )NNFr  )rJ   rK   rL   r   rO   r   staticmethodrZ   r#   r   r  r9  rN   r   r  rF   r  r  r  r  r   r   r  ry   r   r   r   s   @r&   r7  r7    sA   | $ c VY ^c^j^j   3 3 SXS_S_ , */	&& && #'	
 
		H 26*.	1&&1 ''$.1 LL4'	1
 
u  	!10 *.,0).14<<14 ''14 	14
  $;14 #Tk14 #'14 
u  	!14l *.,0).(.''(.  $;(. #Tk	(.
 #'(. 
u  	!(.\ */	*5#//*5 !,,*5 #'	*5
 
		*5X  8<)-,0).#'g
''g
 "--4g
  $;	g

 #Tkg
 #'g
 D[g
 
0g
 g
R 
 /3)-,0).#'p
<<p
 ''p
 t+	p

  $;p
 #Tkp
 #'p
 D[p
 
%p
 p
r(   r7  )r/  r  rp  r  r7  )CrM   dataclassesr   typingr   r#   r   r    r   r*  activationsr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   r   configuration_owlvitr   r   r   transformers.image_transformsr   
get_loggerrJ   loggerr'   r.   r0   r[   r_   ro   rv   ry   r   rH  r   r   r   r  r  r  rJ  r_  rp  r|  r  r/  r  r  r7  __all__rP   r(   r&   <module>r"     sg    !    & ! / 9 K - &   U T F 
		H	%`U\\ `ell `
-ELL -U\\ - !
; !
  !
JGv G& GEF Ev E""'0 
+
+ +
 +
\ 
*
[ *
 *
ZFRYY FR299 >^2bii ^2D		  /3 /d 2)O 2) 2)jH
BII H
VL
1 L
^5
+ 5
p8
3 8
v2
- 2
j K
' K
 K
\bii &-1		 -1`P
4 P
f wr(   