
    iT^              	          d Z ddlZddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ  ej0                  e      Ze ed       G d de                    Zd:dej8                  dededej8                  fdZ G d dej@                        Z! G d dej@                        Z" G d dej@                        Z# G d dej@                        Z$ G d dej@                        Z% G d  d!ej@                        Z& G d" d#ej@                        Z' G d$ d%ej@                        Z( G d& d'ej@                        Z) G d( d)ej@                        Z* G d* d+ej@                        Z+ G d, d-ej@                        Z, G d. d/ej@                        Z- G d0 d1ej@                        Z.e G d2 d3e             Z/e G d4 d5e/             Z0 ed6       G d7 d8e/             Z1g d9Z2y);zPyTorch CvT model.    N)	dataclass)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)$ImageClassifierOutputWithNoAttentionModelOutput)PreTrainedModel)auto_docstringlogging   )	CvtConfigzV
    Base class for model's outputs, with potential hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   y)BaseModelOutputWithCLSTokenz
    cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
        Classification token at the output of the last layer of the model.
    Nlast_hidden_statecls_token_value.hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tuple     `/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_cvt.pyr   r   !   sS    
 37u((4/604OU&&-4:>M5**C/047>r    r   input	drop_probtrainingreturnc                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

            r   r   )r   )dtypedevice)shapendimr   randr(   r)   floor_div)r"   r#   r$   	keep_probr*   random_tensoroutputs          r!   	drop_pathr2   3   s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr    c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
CvtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr#   r%   c                 0    t         |           || _        y N)super__init__r#   )selfr#   	__class__s     r!   r8   zCvtDropPath.__init__F   s    "r    r   c                 D    t        || j                  | j                        S r6   )r2   r#   r$   )r9   r   s     r!   forwardzCvtDropPath.forwardJ   s    FFr    c                      d| j                    S )Nzp=r#   )r9   s    r!   
extra_reprzCvtDropPath.extra_reprM   s    DNN#$$r    r6   )r   r   r   r   floatr8   r   Tensorr<   strr?   __classcell__r:   s   @r!   r4   r4   C   sG    b#%$, #$ #GU\\ Gell G%C %r    r4   c                   (     e Zd ZdZ fdZd Z xZS )CvtEmbeddingsz'
    Construct the CvT embeddings.
    c                     t         |           t        |||||      | _        t	        j
                  |      | _        y )N)
patch_sizenum_channels	embed_dimstridepadding)r7   r8   CvtConvEmbeddingsconvolution_embeddingsr   Dropoutdropout)r9   rH   rI   rJ   rK   rL   dropout_rater:   s          r!   r8   zCvtEmbeddings.__init__V   s:    &7!	Z`jq'
# zz,/r    c                 J    | j                  |      }| j                  |      }|S r6   )rN   rP   )r9   pixel_valueshidden_states      r!   r<   zCvtEmbeddings.forward]   s&    22<@||L1r    r   r   r   r   r8   r<   rC   rD   s   @r!   rF   rF   Q   s    0r    rF   c                   (     e Zd ZdZ fdZd Z xZS )rM   z"
    Image to Conv Embedding.
    c                     t         |           t        |t        j                  j
                        r|n||f}|| _        t        j                  |||||      | _	        t        j                  |      | _        y )N)kernel_sizerK   rL   )r7   r8   
isinstancecollectionsabcIterablerH   r   Conv2d
projection	LayerNormnormalization)r9   rH   rI   rJ   rK   rL   r:   s         r!   r8   zCvtConvEmbeddings.__init__h   sa    #-j+//:R:R#SZZdfpYq
$))L)\blst\\)4r    c                     | j                  |      }|j                  \  }}}}||z  }|j                  |||      j                  ddd      }| j                  r| j	                  |      }|j                  ddd      j                  ||||      }|S Nr      r   )r^   r*   viewpermuter`   )r9   rS   
batch_sizerI   heightwidthhidden_sizes          r!   r<   zCvtConvEmbeddings.forwardo   s    |42>2D2D/
L&%un#((\;OWWXY[\^_`--l;L#++Aq!499*lTZ\abr    rU   rD   s   @r!   rM   rM   c   s    5
r    rM   c                   $     e Zd Z fdZd Z xZS )CvtSelfAttentionConvProjectionc           	          t         |           t        j                  |||||d|      | _        t        j
                  |      | _        y )NF)rX   rL   rK   biasgroups)r7   r8   r   r]   convolutionBatchNorm2dr`   )r9   rJ   rX   rL   rK   r:   s        r!   r8   z'CvtSelfAttentionConvProjection.__init__}   sG    99#
  ^^I6r    c                 J    | j                  |      }| j                  |      }|S r6   )ro   r`   r9   rT   s     r!   r<   z&CvtSelfAttentionConvProjection.forward   s(    ''5)),7r    r   r   r   r8   r<   rC   rD   s   @r!   rk   rk   |   s    7r    rk   c                       e Zd Zd Zy) CvtSelfAttentionLinearProjectionc                 z    |j                   \  }}}}||z  }|j                  |||      j                  ddd      }|S rb   )r*   rd   re   )r9   rT   rf   rI   rg   rh   ri   s          r!   r<   z(CvtSelfAttentionLinearProjection.forward   sK    2>2D2D/
L&%un#((\;OWWXY[\^_`r    N)r   r   r   r<   r   r    r!   ru   ru      s    r    ru   c                   &     e Zd Zd fd	Zd Z xZS )CvtSelfAttentionProjectionc                 p    t         |           |dk(  rt        ||||      | _        t	               | _        y )Ndw_bn)r7   r8   rk   convolution_projectionru   linear_projection)r9   rJ   rX   rL   rK   projection_methodr:   s         r!   r8   z#CvtSelfAttentionProjection.__init__   s7    '*HT_ahjp*qD'!A!Cr    c                 J    | j                  |      }| j                  |      }|S r6   )r{   r|   rr   s     r!   r<   z"CvtSelfAttentionProjection.forward   s(    22<@--l;r    )rz   rs   rD   s   @r!   rx   rx      s    Dr    rx   c                   .     e Zd Z	 d fd	Zd Zd Z xZS )CvtSelfAttentionc                    t         |           |dz  | _        || _        || _        || _        t        |||||dk(  rdn|      | _        t        |||||      | _        t        |||||      | _	        t        j                  |||	      | _        t        j                  |||	      | _        t        j                  |||	      | _        t        j                  |
      | _        y )Ng      avglinear)r}   )rm   )r7   r8   scalewith_cls_tokenrJ   	num_headsrx   convolution_projection_queryconvolution_projection_keyconvolution_projection_valuer   Linearprojection_queryprojection_keyprojection_valuerO   rP   )r9   r   rJ   rX   	padding_q
padding_kvstride_q	stride_kvqkv_projection_methodqkv_biasattention_drop_rater   kwargsr:   s                r!   r8   zCvtSelfAttention.__init__   s     	_
,"",F*?5*HhNc-
) +E{J	Mb+
' -G{J	Mb-
) !#		)YX N ii	98L "		)YX Nzz"56r    c                     |j                   \  }}}| j                  | j                  z  }|j                  ||| j                  |      j	                  dddd      S )Nr   rc   r   r   )r*   rJ   r   rd   re   )r9   rT   rf   ri   _head_dims         r!   "rearrange_for_multi_head_attentionz3CvtSelfAttention.rearrange_for_multi_head_attention   sV    %1%7%7"
K>>T^^3  [$..(S[[\]_`bcefggr    c                 `   | j                   rt        j                  |d||z  gd      \  }}|j                  \  }}}|j	                  ddd      j                  ||||      }| j                  |      }| j                  |      }	| j                  |      }
| j                   rKt        j                  |	fd      }	t        j                  ||fd      }t        j                  ||
fd      }
| j                  | j                  z  }| j                  | j                  |	            }	| j                  | j                  |            }| j                  | j                  |
            }
t        j                   d|	|g      | j"                  z  }t        j$                  j&                  j)                  |d      }| j+                  |      }t        j                   d||
g      }|j                  \  }}}}|j	                  dddd      j-                         j                  ||| j                  |z        }|S )	Nr   r   rc   dimzbhlk,bhtk->bhltzbhlt,bhtv->bhlvr   )r   r   splitr*   re   rd   r   r   r   catrJ   r   r   r   r   r   einsumr   r   
functionalsoftmaxrP   
contiguous)r9   rT   rg   rh   	cls_tokenrf   ri   rI   keyqueryvaluer   attention_scoreattention_probscontextr   s                   r!   r<   zCvtSelfAttention.forward   s   &+kk,FUN@SUV&W#I|0<0B0B-
K#++Aq!499*lTZ\ab--l;11,?11,?IIy%0a8E))Y,!4CIIy%0a8E>>T^^3778M8Me8TU55d6I6I#6NO778M8Me8TU,,'85#,G$**T((--55o25N,,7,,0?E2JK&}}1k1//!Q1-88:??
KY]YgYgjrYrsr    T)r   r   r   r8   r   r<   rC   rD   s   @r!   r   r      s     '7Rhr    r   c                   (     e Zd ZdZ fdZd Z xZS )CvtSelfOutputz
    The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    c                     t         |           t        j                  ||      | _        t        j
                  |      | _        y r6   )r7   r8   r   r   denserO   rP   )r9   rJ   	drop_rater:   s      r!   r8   zCvtSelfOutput.__init__   s0    YYy)4
zz),r    c                 J    | j                  |      }| j                  |      }|S r6   r   rP   r9   rT   input_tensors      r!   r<   zCvtSelfOutput.forward  s$    zz,/||L1r    rU   rD   s   @r!   r   r      s    
-
r    r   c                   (     e Zd Z	 d fd	Zd Z xZS )CvtAttentionc                 x    t         |           t        |||||||||	|
|      | _        t	        ||      | _        y r6   )r7   r8   r   	attentionr   r1   )r9   r   rJ   rX   r   r   r   r   r   r   r   r   r   r:   s                r!   r8   zCvtAttention.__init__	  sM     	)!
 $Iy9r    c                 P    | j                  |||      }| j                  ||      }|S r6   )r   r1   )r9   rT   rg   rh   self_outputattention_outputs         r!   r<   zCvtAttention.forward(  s+    nn\65A;;{LAr    r   rs   rD   s   @r!   r   r     s     :> r    r   c                   $     e Zd Z fdZd Z xZS )CvtIntermediatec                     t         |           t        j                  |t	        ||z              | _        t        j                         | _        y r6   )r7   r8   r   r   intr   GELU
activation)r9   rJ   	mlp_ratior:   s      r!   r8   zCvtIntermediate.__init__/  s7    YYy#i).C*DE
'')r    c                 J    | j                  |      }| j                  |      }|S r6   )r   r   rr   s     r!   r<   zCvtIntermediate.forward4  s$    zz,/|4r    rs   rD   s   @r!   r   r   .  s    $
r    r   c                   $     e Zd Z fdZd Z xZS )	CvtOutputc                     t         |           t        j                  t	        ||z        |      | _        t        j                  |      | _        y r6   )r7   r8   r   r   r   r   rO   rP   )r9   rJ   r   r   r:   s       r!   r8   zCvtOutput.__init__;  s:    YYs9y#899E
zz),r    c                 T    | j                  |      }| j                  |      }||z   }|S r6   r   r   s      r!   r<   zCvtOutput.forward@  s.    zz,/||L1#l2r    rs   rD   s   @r!   r   r   :  s    -
r    r   c                   ,     e Zd ZdZ	 d fd	Zd Z xZS )CvtLayerzb
    CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
    c                 Z   t         |           t        |||||||||	|
||      | _        t	        ||      | _        t        |||      | _        |dkD  rt        |      nt        j                         | _        t        j                  |      | _        t        j                  |      | _        y )Nr'   r>   )r7   r8   r   r   r   intermediater   r1   r4   r   Identityr2   r_   layernorm_beforelayernorm_after)r9   r   rJ   rX   r   r   r   r   r   r   r   r   r   drop_path_rater   r:   s                  r!   r8   zCvtLayer.__init__L  s    " 	%!
 ,IyA	9i@BPSVBV~>\^\g\g\i "Y 7!||I6r    c                    | j                  | j                  |      ||      }|}| j                  |      }||z   }| j                  |      }| j	                  |      }| j                  ||      }| j                  |      }|S r6   )r   r   r2   r   r   r1   )r9   rT   rg   rh   self_attention_outputr   layer_outputs          r!   r<   zCvtLayer.forwards  s     $!!,/!

 1>>*:; (,6 ++L9((6 {{<>~~l3r    r   rU   rD   s   @r!   r   r   G  s    & %7Nr    r   c                   $     e Zd Z fdZd Z xZS )CvtStagec                 |   t         |           || _        || _        | j                  j                  | j                     rFt        j                  t        j                  dd| j                  j                  d               | _        t        |j                  | j                     |j                  | j                     | j                  dk(  r|j                  n|j                  | j                  dz
     |j                  | j                     |j                  | j                     |j                  | j                           | _        t        j"                  d|j$                  | j                     |j&                  |   d      D cg c]  }|j)                          }}t        j*                  t-        |j&                  | j                           D cg c]T  }t/        |j0                  | j                     |j                  | j                     |j2                  | j                     |j4                  | j                     |j6                  | j                     |j8                  | j                     |j:                  | j                     |j<                  | j                     |j>                  | j                     |j@                  | j                     |j                  | j                     || j                     |jB                  | j                     |j                  | j                           W c} | _"        y c c}w c c}w )Nr   r   r   )rH   rK   rI   rJ   rL   rQ   cpu)r)   )r   rJ   rX   r   r   r   r   r   r   r   r   r   r   r   )#r7   r8   configstager   r   	Parameterr   randnrJ   rF   patch_sizespatch_striderI   patch_paddingr   	embeddinglinspacer   depthitem
Sequentialranger   r   
kernel_qkvr   r   r   r   r   r   r   r   layers)r9   r   r   xdrop_path_ratesr   r:   s         r!   r8   zCvtStage.__init__  s   
;;  ,\\%++aDKK<Q<QRT<U*VWDN&))$**5&&tzz204

a,,VEUEUVZV`V`cdVdEe&&tzz2((4))$**5
 $nnQ0E0Edjj0QSYS_S_`eSfotu
AFFH
 
 mm$ v||DJJ78#" ! $..tzz:$..tzz: & 1 1$** =$..tzz:%00<$..tzz:#__TZZ8*0*F*Ftzz*R#__TZZ8(.(B(B4::(N$..tzz:#24::#>$..tzz:#)#3#3DJJ#?
	

s   L4EL9c                 Z   d }| j                  |      }|j                  \  }}}}|j                  ||||z        j                  ddd      }| j                  j
                  | j                     r6| j
                  j                  |dd      }t        j                  ||fd      }| j                  D ]  } ||||      }|} | j                  j
                  | j                     rt        j                  |d||z  gd      \  }}|j                  ddd      j                  ||||      }||fS )Nr   rc   r   r   r   )r   r*   rd   re   r   r   r   expandr   r   r   r   )	r9   rT   r   rf   rI   rg   rh   layerlayer_outputss	            r!   r<   zCvtStage.forward  s'   	~~l32>2D2D/
L&%#((\6E>RZZ[\^_abc;;  ,--j"bAI 99i%>AFL[[ 	)E!,>M(L	) ;;  ,&+kk,FUN@SUV&W#I|#++Aq!499*lTZ\abY&&r    rs   rD   s   @r!   r   r     s    (
T'r    r   c                   &     e Zd Z fdZddZ xZS )
CvtEncoderc                     t         |           || _        t        j                  g       | _        t        t        |j                              D ]'  }| j
                  j                  t        ||             ) y r6   )r7   r8   r   r   
ModuleListstagesr   lenr   appendr   )r9   r   	stage_idxr:   s      r!   r8   zCvtEncoder.__init__  s[    mmB's6<<01 	<IKKx	:;	<r    c                     |rdnd }|}d }t        | j                        D ]  \  }} ||      \  }}|s||fz   } |st        d |||fD              S t        |||      S )Nr   c              3   &   K   | ]	  }||  y wr6   r   ).0vs     r!   	<genexpr>z%CvtEncoder.forward.<locals>.<genexpr>  s     bqTUTabs   r   r   r   )	enumerater   r   r   )	r9   rS   output_hidden_statesreturn_dictall_hidden_statesrT   r   r   stage_modules	            r!   r<   zCvtEncoder.forward  s    "6BD#	!*4;;!7 	HA&2<&@#L)#$5$G!	H
 b\9>O$Pbbb**%+
 	
r    )FTrs   rD   s   @r!   r   r     s    <
r    r   c                   T    e Zd ZU eed<   dZdZdgZ ej                         d        Z
y)CvtPreTrainedModelr   cvtrS   r   c                    t        |t        j                  t        j                  f      rct	        j
                  |j                  d| j                  j                         |j                   t	        j                  |j                         yyt        |t        j                  t        j                  f      rt	        j                  |j                         t	        j                  |j                         t        |dd      ^t	        j                  |j                         t	        j                  |j                          t	        j                  |j"                         yyt        |t$              r[| j                  j&                  |j(                     r7t	        j
                  |j&                  d| j                  j                         yyy)zInitialize the weightsr'   )meanstdNrunning_mean)rY   r   r   r]   inittrunc_normal_weightr   initializer_rangerm   zeros_r_   rp   ones_getattrr  running_varnum_batches_trackedr   r   r   )r9   modules     r!   _init_weightsz CvtPreTrainedModel._init_weights  s*    fryy"))45v}}3DKK<Y<YZ{{&FKK( 'r~~ >?KK$JJv}}%v~t4@F//0

6--.F667 A ){{$$V\\2""6#3#3#4;;C`C`a 3 *r    N)r   r   r   r   r   base_model_prefixmain_input_name_no_split_modulesr   no_gradr  r   r    r!   r   r     s7    $O#U]]_b br    r   c                   v     e Zd Zd fd	Ze	 	 	 d	dej                  dz  dedz  dedz  dee	z  fd       Z
 xZS )
CvtModelc                 r    t         |   |       || _        t        |      | _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r7   r8   r   r   encoder	post_init)r9   r   add_pooling_layerr:   s      r!   r8   zCvtModel.__init__   s/    
 	 !&)r    NrS   r   r   r%   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |||      }|d   }|s	|f|dd  z   S t        ||j                  |j                        S )Nz You have to specify pixel_valuesr   r   r   r   r   )r   r   use_return_dict
ValueErrorr  r   r   r   )r9   rS   r   r   r   encoder_outputssequence_outputs          r!   r<   zCvtModel.forward
  s     %9$D $++JjJj 	 &1%<k$++B]B]?@@,,!5# ' 

 *!,#%(;;;*-+;;)77
 	
r    r   )NNN)r   r   r   r8   r   r   rA   boolr   r   r<   rC   rD   s   @r!   r  r    sd      -1,0#'	
llT)
 #Tk
 D[	
 
,	,
 
r    r  z
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                        e Zd Z fdZe	 	 	 	 d	dej                  dz  dej                  dz  dedz  dedz  dee	z  f
d       Z
 xZS )
CvtForImageClassificationc                    t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  d         | _        |j                  dkD  r-t        j                  |j                  d   |j                        nt        j                         | _        | j                          y )NF)r  r   r   )r7   r8   
num_labelsr  r  r   r_   rJ   	layernormr   r   
classifierr  )r9   r   r:   s     r!   r8   z"CvtForImageClassification.__init__2  s      ++Fe<f&6&6r&:; CIBSBSVWBWBIIf&&r*F,=,=>]_]h]h]j 	
 	r    NrS   labelsr   r   r%   c                 b   ||n| j                   j                  }| j                  |||      }|d   }|d   }| j                   j                  d   r| j	                  |      }nI|j
                  \  }	}
}}|j                  |	|
||z        j                  ddd      }| j	                  |      }|j                  d      }| j                  |      }d}|| j                   j                  | j                   j                  dk(  rd| j                   _
        nv| j                   j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd	| j                   _
        nd
| j                   _
        | j                   j                  dk(  rSt!               }| j                   j                  dk(  r& ||j#                         |j#                               }n |||      }n| j                   j                  d	k(  rGt%               } ||j                  d| j                   j                        |j                  d            }n,| j                   j                  d
k(  rt'               } |||      }|s|f|dd z   }||f|z   S |S t)        |||j*                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   r   rc   r   
regressionsingle_label_classificationmulti_label_classification)losslogitsr   )r   r  r  r   r&  r*   rd   re   r  r'  problem_typer%  r(   r   longr   r   squeezer   r   r
   r   )r9   rS   r(  r   r   r   outputsr   r   rf   rI   rg   rh   sequence_output_meanr.  r-  loss_fctr1   s                     r!   r<   z!CvtForImageClassification.forward@  s`    &1%<k$++B]B]((!5#  
 "!*AJ	;;  $"nnY7O6E6K6K3Jfe-22:|VV[^\ddefhiklmO"nn_=O.333:!56{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE3f\c\q\qrrr    )NNNN)r   r   r   r8   r   r   rA   r!  r   r
   r<   rC   rD   s   @r!   r#  r#  +  s      -1&*,0#'=sllT)=s t#=s #Tk	=s
 D[=s 
5	5=s =sr    r#  )r#  r  r   )r'   F)3r   collections.abcrZ   dataclassesr   r   r   torch.nnr   r   r    r	   r  modeling_outputsr
   r   modeling_utilsr   utilsr   r   configuration_cvtr   
get_loggerr   loggerr   rA   r@   r!  r2   Moduler4   rF   rM   rk   ru   rx   r   r   r   r   r   r   r   r   r   r  r#  __all__r   r    r!   <module>rA     s     !   A A & Q - , ( 
		H	% 
?+ ? ?U\\ e T V[VbVb  %")) %BII $		 2RYY (ryy 
 
Nryy NbBII "# 299 # L	bii 	
		 
?ryy ?D<'ryy <'~
 
8 b b b2 )
! )
 )
X Ms 2 MsMs` Jr    