
    ꬜i                        d Z ddlZddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&  e"jN                  e(      Z)e e d       G d de                    Z*e e d       G d de                    Z+ee  G d de                    Z,dejZ                  dejZ                  fdZ.dejZ                  dejZ                  fdZ/d e&d!e0fd"Z1d[d#e0e2z  d$e3fd%Z4 G d& d'e	jj                        Z6 G d( d)e	jn                        Z8 G d* d+e	jj                        Z9 G d, d-e	jj                        Z: G d. d/e	jj                        Z; G d0 d1e	jj                        Z< G d2 d3e	jj                        Z= G d4 d5e	jj                        Z> G d6 d7e	jj                        Z?	 d\d8e	jj                  d9ejZ                  d:ejZ                  d;ejZ                  d<ejZ                  dz  d=e@d>e@fd?ZA G d@ dAe	jj                        ZB G dB dCe	jj                        ZC G dD dEe	jj                        ZD G dF dGe	jj                        ZE G dH dIe	jj                        ZF G dJ dKe      ZG G dL dMe	jj                        ZH G dN dOe	jj                        ZIe  G dP dQe             ZJ e dR       G dS dTeJ             ZK e dU       G dV dWeJ             ZLe  G dX dYeJ             ZMg dZZNy)]zPyTorch ALIGN model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   y)AlignVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_statehidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r    r!   tuple     d/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/align/modeling_align.pyr   r   +   sN    
 .2L%##d*126u((4/659M5**+d29r+   r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   dZe
ej                     dz  ed<   y)AlignTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr    r!   
attentions)r"   r#   r$   r%   r/   r&   r'   r(   r    r!   r)   r0   r*   r+   r,   r.   r.   <   sh    
 -1K""T)026u((4/659M5**+d2926Je''(4/6r+   r.   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)AlignOutputar  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The output of [`AlignVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AlignTextModel`].
    vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
        The output of the [`AlignVisionModel`].
    Nlosslogits_per_imagelogits_per_textr/   r   text_model_outputvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r6   r7   N)getattrto_tuple).0kselfs     r,   	<genexpr>z'AlignOutput.to_tuple.<locals>.<genexpr>m   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)r)   keysr?   s   `r,   r<   zAlignOutput.to_tuplel   s#     
YY[
 
 	
r+   )r"   r#   r$   r%   r3   r&   r'   r(   r4   r5   r/   r   r6   r   r7   r   r)   r   r<   r*   r+   r,   r2   r2   N   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*14818DHAH
%* 
r+   r2   logitsr8   c                     t         j                  j                  | t        j                  t        |       | j                        d      S )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr&   arangelenrF   )rC   s    r,   contrastive_lossrL   u   s5    ==&&vu||CKPVP]P]/^ps&ttr+   
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)rL   t)rM   caption_loss
image_losss      r,   
align_lossrR   y   s,    #J/L!*,,.1J:%,,r+   confignum_channelsc                     | j                   }|| j                  z  }t        |t        ||dz  z         |z  |z        }|d|z  k  r||z  }t        |      S )z<
    Round number of filters based on depth multiplier.
       g?)depth_divisorwidth_coefficientmaxint)rS   rT   divisornew_dims       r,   round_filtersr]      sf     ""GF,,,L'3|gk9:gEOPG |##7w<r+   kernel_sizeadjustc                     t        | t              r| | f} | d   dz  | d   dz  f}|r|d   dz
  |d   |d   dz
  |d   fS |d   |d   |d   |d   fS )aJ  
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    r   rV   r   )
isinstancerZ   )r^   r_   corrects      r,   correct_padrc      s}     +s#"K01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r+   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )AlignVisionEmbeddingszL
    A module that corresponds to the stem module of the original work.
    rS   c                    t         |           t        |d      | _        t	        j
                  d      | _        t	        j                  |j                  | j                  dddd      | _	        t	        j                  | j                  |j                  |j                  	      | _        t        |j                     | _        y )
N    )r   r   r   r   paddingr   rV   validFr^   strideri   bias)epsmomentum)super__init__r]   out_dimr   	ZeroPad2dri   Conv2drT   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr	   
hidden_act
activationr?   rS   	__class__s     r,   rq   zAlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r+   pixel_valuesr8   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)ri   ru   ry   r{   )r?   r~   featuress      r,   forwardzAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r+   )
r"   r#   r$   r%   r   rq   r&   Tensorr   __classcell__r}   s   @r,   re   re      s0    	40 	4ELL U\\ r+   re   c                   .     e Zd Z	 	 	 	 	 	 	 d fd	Z xZS )AlignVisionDepthwiseConv2dc	                 @    ||z  }	t         
|   ||	|||||||	       y )N)	in_channelsout_channelsr^   rl   ri   dilationgroupsrm   padding_mode)rp   rq   )r?   r   depth_multiplierr^   rl   ri   r   rm   r   r   r}   s             r,   rq   z#AlignVisionDepthwiseConv2d.__init__   s=     #%55#%#% 	 
	
r+   )r   r   r   r   r   Tzeros)r"   r#   r$   rq   r   r   s   @r,   r   r      s$     
 
r+   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z
 xZS )
AlignVisionExpansionLayerz_
    This corresponds to the expansion phase of each block in the original implementation.
    rS   in_dimrr   rl   c                     t         |           t        j                  ||ddd      | _        t        j
                  ||j                        | _        t        |j                     | _
        y )Nr   sameFr   r   r^   ri   rm   )num_featuresrn   )rp   rq   r   rt   expand_convrv   rw   	expand_bnr	   rz   
expand_act)r?   rS   r   rr   rl   r}   s        r,   rq   z"AlignVisionExpansionLayer.__init__   sZ    99 
 W&BWBWX !2!23r+   r!   r8   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r?   r!   s     r,   r   z!AlignVisionExpansionLayer.forward   s4    ((7}56r+   )r"   r#   r$   r%   r   rZ   rq   r&   r'   r   r   r   r   s   @r,   r   r      sH    
40 
4# 
4 
4UX 
4U%6%6 5<< r+   r   c            
       p     e Zd ZdZdededededef
 fdZdej                  d	ej                  fd
Z xZS )AlignVisionDepthwiseLayerzk
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    rS   r   rl   r^   adjust_paddingc                 b   t         |           || _        | j                  dk(  rdnd}t        ||      }t	        j
                  |      | _        t        ||||d      | _        t	        j                  ||j                  |j                        | _        t        |j                     | _        y )	NrV   rj   r   )r_   rh   Frk   r   rn   ro   )rp   rq   rl   rc   r   rs   depthwise_conv_padr   depthwise_convrv   rw   rx   depthwise_normr	   rz   depthwise_act)	r?   rS   r   rl   r^   r   conv_padri   r}   s	           r,   rq   z"AlignVisionDepthwiseLayer.__init__   s     	"kkQ.7Fk.A"$,,w"?8FHSX
 !nnV%:%:VE_E_
 $F$5$56r+   r!   r8   c                     | j                   dk(  r| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S )NrV   )rl   r   r   r   r   r   s     r,   r   z!AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r+   r"   r#   r$   r%   r   rZ   boolrq   r&   r'   r   r   r   r   s   @r,   r   r      sZ    7!7 7 	7
 7 7,	U%6%6 	5<< 	r+   r   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z xZS )AlignVisionSqueezeExciteLayerzl
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    rS   r   
expand_dimexpandc                    t         |           |r|n|| _        t        dt	        ||j
                  z              | _        t        j                  d      | _	        t        j                  | j                  | j                  dd      | _        t        j                  | j                  | j                  dd      | _        t        |j                     | _        t        j                          | _        y )Nr   )output_sizer   )r   r   r^   ri   )rp   rq   dimrY   rZ   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezert   reducer   r	   rz   
act_reduceSigmoid
act_expand)r?   rS   r   r   r   r}   s        r,   rq   z&AlignVisionSqueezeExciteLayer.__init__"  s    !':V!S&*H*H!HIJ++:ii	
 ii	
 !!2!23**,r+   r!   r8   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }t        j                  ||      }|S r   )r   r   r   r   r   r&   mul)r?   r!   inputss      r,   r   z%AlignVisionSqueezeExciteLayer.forward7  sc    ]3M26M26		&-8r+   )Fr   r   s   @r,   r   r     sH    '0 '# '3 'X\ '*
U%6%6 
5<< 
r+   r   c                        e Zd ZdZdedededededef fdZd	e	j                  d
e	j                  de	j                  fdZ xZS )AlignVisionFinalBlockLayerz[
    This corresponds to the final phase of each block in the original implementation.
    rS   r   rr   rl   	drop_rateid_skipc                    t         |           |dk(  xr | | _        t        j                  ||ddd      | _        t        j                  ||j                  |j                        | _	        t        j                  |      | _        y )Nr   r   Fr   r   )p)rp   rq   apply_dropoutr   rt   project_convrv   rw   rx   
project_bnDropoutdropout)r?   rS   r   rr   rl   r   r   r}   s          r,   rq   z#AlignVisionFinalBlockLayer.__init__I  sz     	#q[8[II 
 .. f&;&;fF`F`
 zzI.r+   
embeddingsr!   r8   c                     | j                  |      }| j                  |      }| j                  r| j                  |      }||z   }|S r   )r   r   r   r   )r?   r   r!   s      r,   r   z"AlignVisionFinalBlockLayer.forwardZ  sG    ))-86 LL7M)J6Mr+   r"   r#   r$   r%   r   rZ   floatr   rq   r&   r'   r   r   r   r   s   @r,   r   r   D  sj    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf r+   r   c                        e Zd ZdZdededededededed	ed
ef fdZde	j                  de	j                  fdZ xZS )AlignVisionBlocka  
    This corresponds to the block module of original the EfficientNet vision encoder implementation.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    rS   r   rr   rl   expand_ratior^   r   r   r   c
                 l   t         |           || _        | j                  dk7  | _        ||z  }
| j                  rt	        |||
|      | _        t        || j                  r|
n||||	      | _        t        |||
| j                        | _	        t        || j                  r|
n|||||      | _        y )Nr   )rS   r   rr   rl   )rS   r   rl   r^   r   )rS   r   r   r   )rS   r   rr   rl   r   r   )rp   rq   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)r?   rS   r   rr   rl   r   r^   r   r   r   expand_in_dimr}   s              r,   rq   zAlignVisionBlock.__init__  s     	(''1,-;;6fmFDN 8$(KK=V#)
 <&]4;;
 5$(KK=V
r+   r!   r8   c                     |}| j                   dk7  r| j                  |      }| j                  |      }| j                  |      }| j	                  ||      }|S Nr   )r   r   r   r   r   )r?   r!   r   s      r,   r   zAlignVisionBlock.forward  s[    "
! NN=9M++M: ++M:
MBr+   r   r   s   @r,   r   r   e  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
r+   r   c            	       f     e Zd ZdZdef fdZ	 	 d
dej                  dedz  dedz  de	fd	Z
 xZS )AlignVisionEncoderz
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    rS   c                     t                    |j                   _         fdt        |j                        }t        fd|j                  D              }d}g }t        |      D ]  }t        ||j                  |         }t        ||j                  |         }|j                  |   }	|j                  |   }
|j                  |   }t         |j                  |               D ]c  }|dk(  }|dkD  rdn|	}	|dkD  r|n|}||j                  v}|j                  |z  |z  }t        ||||	|
||||	      }|j!                  |       |dz  }e  t#        j$                  |       _        y )Nc                 Z    t        t        j                  j                  | z              S r   )rZ   mathceildepth_coefficient)repeatsr?   s    r,   round_repeatsz2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr+   c              3   .   K   | ]  } |        y wr   r*   )r=   nr   s     r,   r@   z.AlignVisionEncoder.__init__.<locals>.<genexpr>  s     Laq)Ls   r   r   )	rS   r   rr   rl   r^   r   r   r   r   )rp   rq   r   rK   r   sumnum_block_repeatsranger]   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)r?   rS   num_base_blocks
num_blockscurr_block_numr   ir   rr   rl   r^   r   jr   r   r   blockr   r}   s   `                @r,   rq   zAlignVisionEncoder.__init__  s   !'!9!9	D f001L63K3KLL
' 	$A"66+=+=a+@AF#FF,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEF $q&!e$%Ev!/v7O7O!O"44~E
R	(!!#! +!-'##1
 e$!#'$	$8 mmF+r+   r!   output_hidden_statesNreturn_dictr8   c                     |r|fnd }| j                   D ]  } ||      }|s||fz  } |st        d ||fD              S t        ||      S )Nc              3   &   K   | ]	  }||  y wr   r*   )r=   vs     r,   r@   z-AlignVisionEncoder.forward.<locals>.<genexpr>  s     Xq!-Xs   )r    r!   )r   r)   r   )r?   r!   r   r   all_hidden_statesr   s         r,   r   zAlignVisionEncoder.forward  so     1E],$[[ 	6E!-0M#!m%55!	6
 X]4E$FXXX-++
 	
r+   )FT)r"   r#   r$   r%   r   rq   r&   r'   r   r   r   r   r   s   @r,   r   r     sW    ),0 ),\ -2#'	
((
 #Tk
 D[	

 
2
r+   r   c                        e Zd ZdZ fdZ	 	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  f
d	Z	 xZ
S )AlignTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxrn   position_idsr   F)
persistenttoken_type_ids)dtype)rp   rq   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   register_bufferr&   rJ   r   r   r   sizelongr|   s     r,   rq   zAlignTextEmbeddings.__init__  s   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r+   N	input_idsr   r   inputs_embedsr8   c                 6   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr   r   r   r   r   rF   )r  r   hasattrr   r   r&   r   r  rF   r  r	  r  r
  r   )r?   r  r   r   r  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr	  r   r  s               r,   r   zAlignTextEmbeddings.forward  s1     #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D))
^^J/
\\*-
r+   )NNNN)r"   r#   r$   r%   rq   r&   
LongTensorr'   r   r   r   r   s   @r,   r   r      s~    Q
$ .2260426&##d*& ((4/& &&-	&
 ((4/& 
&r+   r   modulequerykeyvalueattention_maskscalingr   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrV   r   r   )r   r   )r   trainingr   )r&   matmul	transposer   rH   softmaxfloat32tor   r   r!  
contiguous)
r  r  r  r  r  r  r   kwargsattn_weightsattn_outputs
             r,   eager_attention_forwardr+  <  s     <<s}}Q':;gEL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r+   c                        e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZ xZS )
AlignTextSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )rp   rq   r  num_attention_headsr  
ValueErrorrS   rZ   attention_head_sizeall_head_sizer   Linearr  r  r  r   attention_probs_dropout_probr   attention_dropoutr  r|   s     r,   rq   zAlignTextSelfAttention.__init__S  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r+   Nr!   r  output_attentionsr(  r8   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                  sdn| j                  | j                  d|\  }} |j                  g |d j!                         }|r||f}|S |f}|S )Nr   r   rV           )r   r  )shaper3  r  viewr#  r  r  r   get_interfacerS   _attn_implementationr+  r!  r7  r  reshaper'  )r?   r!   r  r8  r(  r  hidden_shapequery_states
key_statesvalue_statesattention_interfacer*  r)  outputss                 r,   r   zAlignTextSelfAttention.forwardh  sT    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFH1B;- JUr+   NFr"   r#   r$   rq   r&   r   r'   r   r   r   r)   r   r   r   s   @r,   r-  r-  R  sg    60 48).	|| ))D0  $;	
 +, 
u||	r+   r-  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )rp   rq   r   r5  r  denser
  r  r   r  r   r|   s     r,   rq   zAlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r+   r!   input_tensorr8   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rL  r   r
  r?   r!   rM  s      r,   r   zAlignTextSelfOutput.forward  7    

=1]3}|'CDr+   r"   r#   r$   rq   r&   r   r   r   r   s   @r,   rI  rI    1    >U\\  RWR^R^ r+   rI  c                        e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZ xZS )
AlignTextAttentionc                 b    t         |           t        |      | _        t	        |      | _        y r   )rp   rq   r-  r?   rI  outputr|   s     r,   rq   zAlignTextAttention.__init__  s&    *62	)&1r+   Nr!   r  r8  r(  r8   c                 n     | j                   |f||d|}| j                  |d   |      }|f|dd  z   }|S N)r  r8  r   r   )r?   rW  )r?   r!   r  r8  r(  self_outputsattention_outputrE  s           r,   r   zAlignTextAttention.forward  s\     !tyy
)/
 	
  ;;|AF#%QR(88r+   rF  rG  r   s   @r,   rU  rU    sg    2 48).	|| ))D0  $;	
 +, 
u||	r+   rU  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rp   rq   r   r5  r  intermediate_sizerL  ra   rz   strr	   intermediate_act_fnr|   s     r,   rq   zAlignTextIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r+   r!   r8   c                 J    | j                  |      }| j                  |      }|S r   )rL  ra  r   s     r,   r   zAlignTextIntermediate.forward  s&    

=100?r+   rR  r   s   @r,   r]  r]    s#    9U\\ ell r+   r]  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y rK  )rp   rq   r   r5  r_  r  rL  r
  r  r   r  r   r|   s     r,   rq   zAlignTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r+   r!   rM  r8   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rO  rP  s      r,   r   zAlignTextOutput.forward  rQ  r+   rR  r   s   @r,   rd  rd    rS  r+   rd  c                        e Zd Z fdZ	 	 d
dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZd	 Z xZS )AlignTextLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y r   )
rp   rq   chunk_size_feed_forwardseq_len_dimrU  	attentionr]  intermediaterd  rW  r|   s     r,   rq   zAlignTextLayer.__init__  sI    '-'E'E$+F31&9%f-r+   Nr!   r  r8  r(  r8   c                      | j                   |f||d|}|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S rY  )rl  r   feed_forward_chunkrj  rk  )	r?   r!   r  r8  r(  self_attention_outputsr[  rE  layer_outputs	            r,   r   zAlignTextLayer.forward  s     "0"
)/"
 	"
 2!4(,0##T%A%A4CSCSUe
  /G+r+   c                 L    | j                  |      }| j                  ||      }|S r   )rm  rW  )r?   r[  intermediate_outputrq  s       r,   ro  z!AlignTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr+   rF  )r"   r#   r$   rq   r&   r   r'   r   r   r   r)   r   ro  r   r   s   @r,   rh  rh    sl    . 48).	|| ))D0  $;	
 +, 
u||	.r+   rh  c                        e Zd Z fdZe	 	 	 	 ddej                  dej                  dz  dedz  dedz  dedz  de	e
   d	eej                     ez  fd
       Z xZS )AlignTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w rF  )
rp   rq   rS   r   r   r   num_hidden_layersrh  layergradient_checkpointing)r?   rS   r   r}   s      r,   rq   zAlignTextEncoder.__init__  sN    ]]E&JbJbDc#dqN6$:#de
&+# $es   A#Nr!   r  r8  r   r   r(  r8   c                     |rdnd }|rdnd }t        | j                        D ])  \  }	}
|r||fz   } |
|||fi |}|d   }|s!||d   fz   }+ |r||fz   }t        |||      S )Nr*   r   r   )r    r!   r0   )	enumeraterx  r   )r?   r!   r  r8  r   r   r(  r   all_self_attentionsr   layer_modulelayer_outputss               r,   r   zAlignTextEncoder.forward  s     #7BD$5b4(4 	POA|#$58H$H!(! 	M *!,M &9]1=M<O&O#	P   1]4D D++*
 	
r+   )NFFT)r"   r#   r$   rq   r   r&   r   r'   r   r   r   r)   r   r   r   r   s   @r,   ru  ru    s    ,  48).,1#'"
||"
 ))D0"
  $;	"

 #Tk"
 D["
 +,"
 
u||		."
 "
r+   ru  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rp   rq   r   r5  r  rL  Tanhr{   r|   s     r,   rq   zAlignTextPooler.__init__%  s9    YYv1163E3EF
'')r+   r!   r8   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )rL  r{   )r?   r!   first_token_tensorpooled_outputs       r,   r   zAlignTextPooler.forward*  s6     +1a40

#566r+   rR  r   s   @r,   r  r  $  s#    $
U\\ ell r+   r  c                   l    e Zd ZU eed<   dZdZdZ ej                         de
j                  fd       Zy)AlignPreTrainedModelrS   align)imagetextTr  c                 "   | j                   j                  }t        |t        j                  t        j
                  f      rPt        j                  |j                  d|       |j                  Ct        j                  |j                         n"t        |t              rt        j                  |j                  j                         t        j                  |j                  j                         t        j                  |j                  | j                   j                          nt        |t        j"                        rqt        j                  |j                  d|       |j$                  Ct'        |j                  dd      s,t        j                  |j                  |j$                            t        |t        j(                  t        j*                  f      rt        j                  |j                         t        j,                  |j                         t'        |dd      ^t        j                  |j.                         t        j,                  |j0                         t        j                  |j2                         yyt        |t4              ryt        j6                  |j8                  t;        j<                  |j8                  j>                  d         jA                  d             t        j                  |jB                         yy)	zInitialize the weightsr:  )meanstdN_is_hf_initializedFrunning_meanr   r   )"rS   initializer_rangera   r   r5  rt   initnormal_weightrm   zeros_
AlignModelxavier_uniform_text_projection	constant_temperaturetemperature_init_valuer  r   r;   r
  rv   ones_r  running_varnum_batches_trackedr   copy_r   r&   rJ   r;  r   r   )r?   r  r  s      r,   _init_weightsz"AlignPreTrainedModel._init_weights:  s    kk++fryy"))45LLSc:{{&FKK(
+  !7!7!>!>?KK..334NN6--t{{/Q/QR-LLSc:!!-gfmmMach6iFMM&*<*<=>fr||R^^<=KK$JJv}}%v~t4@F//0

6--.F667 A  34JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 5r+   N)r"   r#   r$   r   r(   base_model_prefixinput_modalitiessupports_gradient_checkpointingr&   no_gradr   Moduler  r*   r+   r,   r  r  3  s?    (&*#U]]_/BII / /r+   r  zJ
    The text model from ALIGN without any head or projection on top.
    c                   B    e Zd ZU eed<   dZdgZddedef fdZd Z	d Z
ee	 	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dee   deez  fd              Z xZS )AlignTextModelrS   )r  r   add_pooling_layerc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rp   rq   rS   r   r   ru  encoderr  pooler	post_init)r?   rS   r  r}   s      r,   rq   zAlignTextModel.__init__a  sM    
 	 -f5'/1Bof- 	r+   c                 .    | j                   j                  S r   r   r  rB   s    r,   get_input_embeddingsz#AlignTextModel.get_input_embeddingsq  s    ...r+   c                 &    || j                   _        y r   r  )r?   r  s     r,   set_input_embeddingsz#AlignTextModel.set_input_embeddingst  s    */'r+   Nr  r  r   r   r  r8  r   r   r(  r8   c	                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         dd }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  ddd|f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  ||
      }| j                  ||||      } | j"                  |f|||d	d
|	}|d   }| j$                  | j%                  |      nd}t'        |||j(                  |j*                        S )a-  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignTextModel

        >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsrE   r   r  )r  r   r   r  T)r  r8  r   r   r   )r    pooler_outputr!   r0   )rS   r8  r   use_return_dictr2  %warn_if_padding_and_no_attention_maskr  rF   r&   onesr  r   r   r   r   r  get_extended_attention_maskr  r  r   r!   r0   )r?   r  r  r   r   r  r8  r   r   r(  r  
batch_sizer  rF   r  r  extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                        r,   r   zAlignTextModel.forwardw  s   : 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m??%)'	 + 
 '$,,
2/!5
 
 *!,8<8OO4UY)-')77&11	
 	
r+   T)NNNNNNNN)r"   r#   r$   r   r(   r  _no_split_modulesr   rq   r  r  r   r   r&   r   r   r   r)   r   r   r   r   s   @r,   r  r  W  s      ./ 4  /0  *..2.2,0-1)-,0#'S
<<$&S
 t+S
 t+	S

 llT)S
 ||d*S
  $;S
 #TkS
 D[S
 +,S
 
+	+S
  S
r+   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                        e Zd ZU eed<   dZdZdZdZdgZ	def fdZ
ee	 	 	 ddej                  dz  d	edz  d
edz  dee   deez  f
d              Z xZS )AlignVisionModelrS   r~   )r  Fru   r   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dk(  r't        j                  |j                  d      | _        nN|j                  dk(  r't        j                  |j                  d      | _        nt        d|j                         | j                          y )Nr  T)	ceil_moderY   z2config.pooling must be one of ['mean', 'max'] got )rp   rq   rS   re   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2dr2  poolingr  r|   s     r,   rq   zAlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r+   Nr   r   r(  r8   c                 f   ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  ||d      }|d   }| j                  |      }|j                  |j                  dd       }t        |||j                        S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AlignVisionModel

        >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```Nz You have to specify pixel_valuesT)r   r   r   rV   )r    r  r!   )rS   r   r  r2  r   r  r  r?  r;  r   r!   )	r?   r~   r   r   r(  r  r  r    r  s	            r,   r   zAlignVisionModel.forward  s    @ %9$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,!5 ' 
 ,A.$56%--m.A.A"1.EF7/')77
 	
r+   )NNN)r"   r#   r$   r   r(   main_input_namer  r  _input_embed_layerr  rq   r   r   r&   r'   r   r   r   r)   r   r   r   r   s   @r,   r  r    s     $O!&+#&+,0 "  26,0#'	5
''$.5
 #Tk5
 D[	5

 +,5
 
9	95
  5
r+   r  c                   l    e Zd ZU eed<   def fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
e   d
eez  fd              Zeedej                  d	e
e   d
eez  fd              Zee	 	 	 	 	 	 	 	 	 	 ddej"                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  d	e
e   d
eez  fd              Z xZS )r  rS   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        t        |      | _        t        |      | _        t!        j"                  | j                  | j                        | _        t!        j&                  t)        j*                  | j,                  j.                              | _        | j3                          y )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )rp   rq   ra   text_configr   	TypeErrortypevision_configr   projection_dimr  text_embed_dimr  
text_modelr  vision_modelr   r5  r  	Parameterr&   tensorrS   r  r  r  )r?   rS   r  r  r}   s       r,   rq   zAlignModel.__init__+  s#    &,,o>++,-Q0 
 &..0AB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r+   Nr  r  r   r   r  r(  r8   c           
           | j                   d|||||dd|}|d   dddddf   }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r  r  r   r   r  r   r   Nr*   )r  r  r  )	r?   r  r  r   r   r  r(  text_outputsr    s	            r,   get_text_featureszAlignModel.get_text_featuresI  sg    2 4C4?? 4
))%'4
 4
 )OAq!G4%)%9%9:K%L"r+   r~   c                 *     | j                   dd|i|S )a}  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```r~   r*   )r  )r?   r~   r(  s      r,   get_image_featureszAlignModel.get_image_featuresp  s    . !t  ElEfEEr+   return_lossr8  r   r   c           
      b   ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
| j	                  ||	d      }| j                  |||||||	d      }|d   }|d   dddddf   }| j                  |      }||j                  ddd	      z  }||j                  ddd	      z  }t        j                  ||j                               | j                  z  }|j                         }d}|rt        |      }t        |||||||
      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NT)r~   r   r   )r  r  r   r   r  r8  r   r   r   r   rV   r   )r   r   keepdim)r3   r4   r5   r/   r   r6   r7   )rS   r8  r   r  r  r  r  normr&   r"  rO   r  rR   r2   )r?   r  r~   r  r   r   r  r  r8  r   r   r(  vision_outputsr  r   r/   r5   r4   r3   s                      r,   r   zAlignModel.forward  so   V 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%!5 + 
 ))%'/!5 ' 	
 &a("1oaAg.**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  ,,{LNN4DEHXHXX*,,.o.D-+#%* .
 	
r+   )NNNNN)
NNNNNNNNNN)r"   r#   r$   r   r(   rq   r   r   r&   r   r   r   r)   r   r  r'   r  r  r   r2   r   r   r   s   @r,   r  r  '  s    { <  *..2.2,0-1#<<$&# t+# t+	#
 llT)# ||d*# +,# 
+	+#  #J F!--F9?@R9SF	+	+F  F.  .215.2.2,0-1#')-,0#'X
##d*X
 ''$.X
 t+	X

 t+X
 llT)X
 ||d*X
 D[X
  $;X
 #TkX
 D[X
 +,X
 
	X
  X
r+   r  )r  r  r  r  r  )r:  )Or%   r   collections.abcr   dataclassesr   typingr   r&   r    r   r  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   configuration_alignr   r   r   
get_loggerr"   loggerr   r.   r2   r   rL   rR   rZ   r]   r)   r   rc   r  re   rt   r   r   r   r   r   r   r   r   r   r+  r-  rI  rU  r]  rd  rh  ru  r  r  r  r  r  __all__r*   r+   r,   <module>r     sW     $ !    & ! 9  G & 6 _ _ P P 
		H	% 
:[ : : 
	7; 	7 	7  
+  
   
JuU\\ uell u-5<< -ELL -+ 3  @S5[ @$ @*BII 4
 
6		 6$		 $P$BII $N BNryy NbG
 G
T9")) 9F %II%<<% 
% <<	%
 LL4'% % %,5RYY 5r"))  2BII  bii #/ #L*
ryy *
\bii   /?  /  /F 
p
) p

p
f 
P
+ P

P
f {
% {
 {
| Wr+   