
    i              	          d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ  ej0                  e      Zd<dedededz  defdZ ed       ed      fdedededefdZ G d dej>                        Z  G d dej>                        Z! G d dej>                        Z" G d dej>                        Z# G d  d!ej>                        Z$ G d" d#ej>                        Z% G d$ d%ej>                        Z& G d& d'e      Z' G d( d)ej>                        Z(e G d* d+e             Z)e G d, d-e)             Z* ed./       G d0 d1e)             Z+ G d2 d3ej>                        Z, G d4 d5ej>                        Z- G d6 d7ej>                        Z. ed8/       G d9 d:e)             Z/g d;Z0y)=zPyTorch MobileViTV2 model.    N)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)auto_docstringlogging   )MobileViTV2Configvaluedivisor	min_valuereturnc                 |    ||}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }t        |      S )zU
    Ensure that all layers have a channel count that is divisible by `divisor`.
       g?)maxint)r   r   r   	new_values       p/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.pymake_divisibler   (   sS     	Is57Q;#677BWLMI3;W	y>    z-infinfmin_valmax_valc                 .    t        |t        ||             S N)r   minr   r   r    s      r   clipr%   5   s    wGU+,,r   c                        e Zd Z	 	 	 	 	 	 ddedededededededed	ed
eez  ddf fdZdej                  dej                  fdZ
 xZS )MobileViTV2ConvLayerconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 $   t         |           t        |dz
  dz        |z  }||z  dk7  rt        d| d| d      ||z  dk7  rt        d| d| d      t	        j
                  ||||||||d		      | _        |	r t	        j                  |d
ddd      | _        nd | _        |
rdt        |
t              rt        |
   | _        y t        |j                  t              rt        |j                     | _        y |j                  | _        y d | _        y )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r)   r*   r+   r,   paddingr/   r-   r.   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r4   	__class__s               r   r<   zMobileViTV2ConvLayer.__init__;   s*    	{Q!+,x71$/}<STZS[[cdee& A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#."("8F--s3"():):";"("3"3"DOr   featuresc                     | j                  |      }| j                  | j                  |      }| j                  | j                  |      }|S r"   )r?   rA   rD   )rF   rH   s     r   forwardzMobileViTV2ConvLayer.forwardq   sK    ##H-)))(3H??&x0Hr   )r   r   Fr   TT)__name__
__module____qualname__r   r   boolrC   r<   torchTensorrJ   __classcell__rG   s   @r   r'   r'   :   s     "&%)4#!4# 4# 	4#
 4# 4# 4# 4# 4#  4# s
4# 
4#l  r   r'   c                   x     e Zd ZdZ	 ddedededededdf fd	Zd
ej                  dej                  fdZ	 xZ
S )MobileViTV2InvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r(   r)   r*   r,   r/   r   Nc           	      @   t         |           t        t        t	        ||j
                  z              d      }|dvrt        d| d      |dk(  xr ||k(  | _        t        |||d      | _	        t        |||d|||      | _
        t        |||dd	
      | _        y )N   )r   r   zInvalid stride .r   )r)   r*   r+   r   )r)   r*   r+   r,   r-   r/   Fr)   r*   r+   r1   )r;   r<   r   r   roundexpand_ratior=   use_residualr'   
expand_1x1conv_3x3
reduce_1x1)rF   r(   r)   r*   r,   r/   expanded_channelsrG   s          r   r<   z$MobileViTV2InvertedResidual.__init__   s     	*3u[6CVCV5V/W+XZ[\vha899#q[K{l/J.:KYZ
 -)*$
 /)% 
r   rH   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  r||z   S |S r"   )r\   r]   r^   r[   )rF   rH   residuals      r   rJ   z#MobileViTV2InvertedResidual.forward   sI    ??8,==*??8,&*&7&7x("EXEr   )r   rK   rL   rM   __doc__r   r   r<   rO   rP   rJ   rQ   rR   s   @r   rT   rT   {   sc    
 lm
'
69
IL
VY
eh
	
BF F Fr   rT   c                   t     e Zd Z	 ddedededededdf fdZd	ej                  dej                  fd
Z xZ	S )MobileViTV2MobileNetLayerr(   r)   r*   r,   
num_stagesr   Nc                     t         |           t        j                         | _        t        |      D ]5  }t        ||||dk(  r|nd      }| j                  j                  |       |}7 y )Nr   r   )r)   r*   r,   )r;   r<   r   
ModuleListlayerrangerT   append)	rF   r(   r)   r*   r,   rf   iri   rG   s	           r   r<   z"MobileViTV2MobileNetLayer.__init__   sh     	]]_
z" 	'A/')!"avQ	E JJe$&K	'r   rH   c                 8    | j                   D ]
  } ||      } |S r"   ri   )rF   rH   layer_modules      r   rJ   z!MobileViTV2MobileNetLayer.forward   s$     JJ 	.L#H-H	.r   )r   r   
rK   rL   rM   r   r   r<   rO   rP   rJ   rQ   rR   s   @r   re   re      sV    qr'''69'IL'VY'kn'	'   r   re   c                   h     e Zd ZdZdededdf fdZdej                  dej                  fdZ	 xZ
S )	MobileViTV2LinearSelfAttentionay  
    This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
    https://huggingface.co/papers/2206.02680

    Args:
        config (`MobileVitv2Config`):
             Model configuration object
        embed_dim (`int`):
            `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
    r(   	embed_dimr   Nc           	          t         |           t        ||dd|z  z   dddd      | _        t	        j
                  |j                        | _        t        |||dddd      | _        || _        y )Nr   r   TF)r(   r)   r*   r.   r+   r0   r1   p)	r;   r<   r'   qkv_projr   Dropoutattn_dropoutout_projrs   )rF   r(   rs   rG   s      r   r<   z'MobileViTV2LinearSelfAttention.__init__   s{    ,!a)m,# 
 JJ)<)<=,!"# 
 #r   hidden_statesc                    | j                  |      }t        j                  |d| j                  | j                  gd      \  }}}t        j                  j
                  j                  |d      }| j                  |      }||z  }t        j                  |dd      }t        j                  j
                  j                  |      |j                  |      z  }| j                  |      }|S )Nr   )split_size_or_sectionsdimr~   Tr~   keepdim)rw   rO   splitrs   r   
functionalsoftmaxry   sumrelu	expand_asrz   )	rF   r{   qkvquerykeyr   context_scorescontext_vectorouts	            r   rJ   z&MobileViTV2LinearSelfAttention.forward   s    mmM*
 "KKQX\XfXfDgmnosE ,,44U4C**>: ~->r4H hh!!&&u-0H0H0OOmmC 
r   rb   rR   s   @r   rr   rr      s>    	#0 #S #T #2U\\ ell r   rr   c                   p     e Zd Z	 d
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )MobileViTV2FFNr(   rs   ffn_latent_dimffn_dropoutr   Nc           
          t         |           t        |||ddddd      | _        t	        j
                  |      | _        t        |||ddddd      | _        t	        j
                  |      | _        y )Nr   TF)r(   r)   r*   r+   r,   r.   r0   r1   )	r;   r<   r'   conv1r   rx   dropout1conv2dropout2)rF   r(   rs   r   r   rG   s        r   r<   zMobileViTV2FFN.__init__  s|     	)!'#	

 

;/)&"# 	

 

;/r   r{   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r"   )r   r   r   r   )rF   r{   s     r   rJ   zMobileViTV2FFN.forward#  s@    

=1m4

=1m4r           rK   rL   rM   r   r   floatr<   rO   rP   rJ   rQ   rR   s   @r   r   r     sY     !0!0 0 	0
 0 
0@U\\ ell r   r   c                   p     e Zd Z	 d
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )MobileViTV2TransformerLayerr(   rs   r   dropoutr   Nc                 P   t         |           t        j                  d||j                        | _        t        ||      | _        t        j                  |      | _	        t        j                  d||j                        | _
        t        ||||j                        | _        y )Nr   
num_groupsnum_channelsr7   ru   )r;   r<   r   	GroupNormlayer_norm_epslayernorm_beforerr   	attentionrx   r   layernorm_afterr   r   ffn)rF   r(   rs   r   r   rG   s        r   r<   z$MobileViTV2TransformerLayer.__init__,  s~     	 "	W]WlWl m7	J

W-!||qyV\VkVkl!&)^VEWEWXr   r{   c                     | j                  |      }| j                  |      }||z   }| j                  |      }| j                  |      }||z   }|S r"   )r   r   r   r   )rF   r{   layernorm_1_outattention_outputlayer_outputs        r   rJ   z#MobileViTV2TransformerLayer.forward:  sY    //>>>/:(=8++M:xx-#m3r   r   r   rR   s   @r   r   r   +  s^     Y!Y Y 	Y
 Y 
Y	U\\ 	ell 	r   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTV2Transformerr(   n_layersd_modelr   Nc                 <   t         	|           |j                  }||z  g|z  }|D cg c]  }t        |dz  dz         }}t	        j
                         | _        t        |      D ].  }t        ||||         }| j                  j                  |       0 y c c}w )N   )rs   r   )
r;   r<   ffn_multiplierr   r   rh   ri   rj   r   rk   )
rF   r(   r   r   r   ffn_dimsd	block_idxtransformer_layerrG   s
            r   r<   zMobileViTV2Transformer.__init__G  s    .."W,-8 2::ACbB'::]]_
x 	1I ;'(9:M! JJ/0		1 ;s   Br{   c                 8    | j                   D ]
  } ||      } |S r"   rn   )rF   r{   ro   s      r   rJ   zMobileViTV2Transformer.forwardX  s%     JJ 	8L(7M	8r   rp   rR   s   @r   r   r   F  sA    10 1C 1# 1RV 1"U\\ ell r   r   c                       e Zd ZdZ	 	 	 ddededededededed	d
f fdZdej                  d	e	ej                  e	eef   f   fdZ
dej                  de	eef   d	ej                  fdZdej                  d	ej                  fdZ xZS )MobileViTV2LayerzE
    MobileViTV2 layer: https://huggingface.co/papers/2206.02680
    r(   r)   r*   attn_unit_dimn_attn_blocksr/   r,   r   Nc                    t         	|           |j                  | _        |j                  | _        |}|dk(  r)t        ||||dk(  r|nd|dkD  r|dz  nd      | _        |}nd | _        t        ||||j                  |      | _	        t        |||ddd      | _
        t        |||      | _        t        j                  d||j                        | _        t        |||dd	d      | _        y )
Nr   r   )r)   r*   r,   r/   )r)   r*   r+   r-   F)r)   r*   r+   r0   r1   )r   r   r   T)r;   r<   
patch_sizepatch_widthpatch_heightrT   downsampling_layerr'   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projection)
rF   r(   r)   r*   r   r   r/   r,   cnn_out_dimrG   s
            r   r<   zMobileViTV2Layer.__init__c  s    	!,,"--#Q;&A')!)QvA*2Q,QA'D# 'K&*D# -#$//
 -#$# 
 2&-Zgh TZTiTij  4#$"  
r   feature_mapc                 "   |j                   \  }}}}t        j                  j                  || j                  | j
                  f| j                  | j
                  f      }|j                  ||| j                  | j
                  z  d      }|||ffS )N)r+   r,   r   )shaper   r   unfoldr   r   reshape)rF   r   
batch_sizer)   
img_height	img_widthpatchess          r   	unfoldingzMobileViTV2Layer.unfolding  s    9D9J9J6
KY--&&**D,<,<=%%t'7'78 ' 

 //*k4;L;LtO_O_;_acdY///r   r   output_sizec                     |j                   \  }}}}|j                  |||z  |      }t        j                  j	                  ||| j
                  | j                  f| j
                  | j                  f      }|S )N)r   r+   r,   )r   r   r   r   foldr   r   )rF   r   r   r   in_dimr   	n_patchesr   s           r   foldingzMobileViTV2Layer.folding  sz    4;MM1
FJ	//*fz.A9Mmm((#**D,<,<=%%t'7'78	 ) 
 r   rH   c                 6   | j                   r| j                  |      }| j                  |      }| j                  |      }| j                  |      \  }}| j	                  |      }| j                  |      }| j                  ||      }| j                  |      }|S r"   )r   r   r   r   r   r   r   r   )rF   rH   r   r   s       r   rJ   zMobileViTV2Layer.forward  s    ""..x8H ==*==*  $~~h7 ""7+..) <<5''1r   )r   r   r   )rK   rL   rM   rc   r   r   r<   rO   rP   tupler   r   rJ   rQ   rR   s   @r   r   r   ^  s     ;
!;
 ;
 	;

 ;
 ;
 ;
 ;
 
;
z	0U\\ 	0eELL%PSUXPX/<Y6Z 	0u|| %S/ ell   r   r   c            
       `     e Zd Zdeddf fdZ	 	 d	dej                  dededee	z  fdZ
 xZS )
MobileViTV2Encoderr(   r   Nc           	         t         |           || _        t        j                         | _        d| _        dx}}|j                  dk(  rd}d}n|j                  dk(  rd}d}t        t        d|j                  z  dd      dd	      }t        d|j                  z  d
      }t        d|j                  z  d
      }t        d|j                  z  d
      }t        d|j                  z  d
      }	t        d|j                  z  d
      }
t        |||dd      }| j
                  j                  |       t        |||dd      }| j
                  j                  |       t        |||t        |j                  d   |j                  z  d
      |j                  d         }| j
                  j                  |       |r|dz  }t        |||	t        |j                  d   |j                  z  d
      |j                  d   |      }| j
                  j                  |       |r|dz  }t        ||	|
t        |j                  d   |j                  z  d
      |j                  d   |      }| j
                  j                  |       y )NFrV   Tr   r       @   r$   r   r   r         i     )r)   r*   r,   rf   r   r   )r)   r*   r   r   )r)   r*   r   r   r/   )r;   r<   r(   r   rh   ri   gradient_checkpointingoutput_strider   r%   width_multiplierre   rk   r   base_attn_unit_dimsr   )rF   r(   dilate_layer_4dilate_layer_5r/   layer_0_dimlayer_1_dimlayer_2_dimlayer_3_dimlayer_4_dimlayer_5_dimlayer_1layer_2layer_3layer_4layer_5rG   s                   r   r<   zMobileViTV2Encoder.__init__  s|   ]]_
&+# +0/1$!N!N!!R'!N$rF333RLVWce
 %R&*A*A%A2N$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN+#$
 	

'"+#$
 	

'""#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"r   r{   output_hidden_statesreturn_dictc                     |rdnd }t        | j                        D ]  \  }} ||      }|s||fz   } |st        d ||fD              S t        ||      S )N c              3   &   K   | ]	  }||  y wr"   r   ).0vs     r   	<genexpr>z-MobileViTV2Encoder.forward.<locals>.<genexpr>1  s     Xq!-Xs   )last_hidden_stater{   )	enumerateri   r   r	   )rF   r{   r   r   all_hidden_statesrl   ro   s          r   rJ   zMobileViTV2Encoder.forward"  sq     #7BD(4 	IOA|(7M#$58H$H!		I X]4E$FXXX-]noor   )FT)rK   rL   rM   r   r<   rO   rP   rN   r   r	   rJ   rQ   rR   s   @r   r   r     s]    O#0 O#T O#h &+ 	p||p #p 	p
 
/	/pr   r   c                   z    e Zd ZU eed<   dZdZdZdZdgZ	 e
j                         dej                  dd	fd
       Zy	)MobileViTV2PreTrainedModelr(   mobilevitv2pixel_values)imageTr   moduler   Nc                    t        |t        j                  t        j                  t        j                  f      rt        j                  |j                  d| j                  j                         |j                  t        j                  |j                         t        |dd      ^t        j                  |j                         t        j                  |j                         t        j                  |j                          yyt        |t        j"                        r?t        j                  |j                         t        j                  |j                         yy)zInitialize the weightsr   )meanstdNrunning_mean)rB   r   Linearr>   r@   initnormal_weightr(   initializer_ranger.   zeros_getattrr  ones_running_varnum_batches_trackedr   )rF   r  s     r   _init_weightsz(MobileViTV2PreTrainedModel._init_weights?  s     fryy"))R^^DELLSdkk6S6ST{{&FKK(v~t4@F//0

6--.F667 A -KK$JJv}}% .r   )rK   rL   rM   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesrO   no_gradr   Moduler  r   r   r   r  r  6  sT    %$O!&*#+,U]]_&BII &$ & &r   r  c                        e Zd Zd
dedef fdZe	 	 	 ddej                  dz  dedz  dedz  de	e
z  fd	       Z xZS )MobileViTV2Modelr(   expand_outputc           	         t         |   |       || _        || _        t	        t        d|j                  z  dd      dd      }t        ||j                  |ddd	d	
      | _	        t        |      | _        | j                          y)a  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model. If `True`, the model will output pooled features in addition to
            hidden states. If `False`, only the hidden states will be returned.
        r   r   r   r$   rV   r   r   r   Tr)   r*   r+   r,   r0   r1   N)r;   r<   r(   r!  r   r%   r   r'   r   	conv_stemr   encoder	post_init)rF   r(   r!  r   rG   s       r   r<   zMobileViTV2Model.__init__Q  s     	 *$rF333RLVWce
 .++$"
 *&1 	r   Nr  r   r   r   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }| j                  r |d   }t        j                  |ddgd      }n|d   }d }|s|||fn|f}	|	|dd  z   S t        |||j                  	      S )
Nz You have to specify pixel_valuesr   r   r   r   Fr   r   )r   pooler_outputr{   )r(   r   use_return_dictr=   r$  r%  r!  rO   r	  r
   r{   )
rF   r  r   r   kwargsembedding_outputencoder_outputsr   pooled_outputoutputs
             r   rJ   zMobileViTV2Model.forwardm  s     %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  / 2 "JJ'8r2hPUVM / 2 M;H;T'7[lZnFOAB///7/')77
 	
r   )T)NNN)rK   rL   rM   r   rN   r<   r   rO   rP   r   r
   rJ   rQ   rR   s   @r   r   r   O  ss    0  8  -1,0#'	(
llT)(
 #Tk(
 D[	(
 
9	9(
 (
r   r   z
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 d
dej                  dz  dedz  dej                  dz  dedz  de	e
z  f
d	       Z xZS )!MobileViTV2ForImageClassificationr(   r   Nc                 L   t         |   |       |j                  | _        t        |      | _        t        d|j                  z  d      }|j                  dkD  r!t        j                  ||j                        nt        j                         | _
        | j                          y )Nr   rV   r   r   )in_featuresout_features)r;   r<   
num_labelsr   r  r   r   r   r  Identity
classifierr&  )rF   r(   r*   rG   s      r   r<   z*MobileViTV2ForImageClassification.__init__  s      +++F3%cF,C,C&CQO   1$ II,V=N=NO 	 	r   r  r   labelsr   c                 B   ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  |      }d}	|| j                  ||| j                         }	|s|f|dd z   }
|	|	f|
z   S |
S t        |	||j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr(  r   r   )losslogitsr{   )r(   r+  r  r*  r9  loss_functionr   r{   )rF   r  r   r:  r   r,  outputsr/  r=  r<  r0  s              r   rJ   z)MobileViTV2ForImageClassification.forward  s     &1%<k$++B]B]""<FZhs"t1<--'!*/%%ffdkkBDY,F)-)9TGf$EvE3!//
 	
r   NNNN)rK   rL   rM   r   r<   r   rO   rP   rN   r   r   rJ   rQ   rR   s   @r   r3  r3    s    0 T "  -1,0&*#'"
llT)"
 #Tk"
 t#	"

 D["
 
5	5"
 "
r   r3  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTV2ASPPPoolingr(   r)   r*   r   Nc           	          t         |           t        j                  d      | _        t        |||dddd      | _        y )Nr   )r   Tr   r#  )r;   r<   r   AdaptiveAvgPool2dglobal_poolr'   r   )rF   r(   r)   r*   rG   s       r   r<   zMobileViTV2ASPPPooling.__init__  sB    //A>,#%"!
r   rH   c                     |j                   dd  }| j                  |      }| j                  |      }t        j                  j                  ||dd      }|S )Nr)  bilinearFsizemodealign_corners)r   rE  r   r   r   interpolate)rF   rH   spatial_sizes      r   rJ   zMobileViTV2ASPPPooling.forward  sS    ~~bc*##H-==*==,,XLzin,or   rp   rR   s   @r   rB  rB    sB    
0 
s 
RU 
Z^ 
  r   rB  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTV2ASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r(   r   Nc                    t         |           t        d|j                  z  d      }|}|j                  }t        |j                        dk7  rt        d      t        j                         | _
        t        |||dd      }| j                  j                  |       | j                  j                  |j                  D cg c]  }t        |||d|d	       c}       t        |||      }| j                  j                  |       t        |d
|z  |dd      | _        t        j                   |j"                        | _        y c c}w )Nr   rV   r   r   z"Expected 3 values for atrous_ratesr   r   rX   )r)   r*   r+   r/   r1      ru   )r;   r<   r   r   aspp_out_channelslenatrous_ratesr=   r   rh   convsr'   rk   extendrB  projectrx   aspp_dropout_probr   )	rF   r(   encoder_out_channelsr)   r*   in_projectionrate
pool_layerrG   s	           r   r<   zMobileViTV2ASPP.__init__  s6   -cF4K4K.KUVW*//v""#q(ABB]]_
,#%!
 	

-(

 #//
  % +!- !!#)
	
 ,FKN


*%+L 0|YZkq
 zzF$<$<=)
s   ErH   c                     g }| j                   D ]  }|j                   ||              t        j                  |d      }| j	                  |      }| j                  |      }|S )Nr   r   )rU  rk   rO   catrW  r   )rF   rH   pyramidconvpooled_featuress        r   rJ   zMobileViTV2ASPP.forward!  s\    JJ 	+DNN4>*	+))G+,,w/,,7r   
rK   rL   rM   rc   r   r<   rO   rP   rJ   rQ   rR   s   @r   rO  rO    s8    *>0 *>T *>X  r   rO  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTV2DeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r(   r   Nc           	          t         |           t        |      | _        t	        j
                  |j                        | _        t        ||j                  |j                  dddd      | _        y )Nr   FT)r)   r*   r+   r0   r1   r.   )r;   r<   rO  asppr   	Dropout2dclassifier_dropout_probr   r'   rR  r7  r9  rF   r(   rG   s     r   r<   zMobileViTV2DeepLabV3.__init__2  s]    #F+	||F$B$BC.00**# 
r   r{   c                 r    | j                  |d         }| j                  |      }| j                  |      }|S )Nr   )rf  r   r9  )rF   r{   rH   s      r   rJ   zMobileViTV2DeepLabV3.forwardB  s6    99]2./<<)??8,r   rb  rR   s   @r   rd  rd  -  s7    
0 
T 
 U\\ ell r   rd  zZ
    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZe	 	 	 	 d
dej                  dz  dej                  dz  dedz  dedz  de	e
z  f
d	       Z xZS )"MobileViTV2ForSemanticSegmentationr(   r   Nc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y )NF)r!  )r;   r<   r7  r   r  rd  segmentation_headr&  ri  s     r   r<   z+MobileViTV2ForSemanticSegmentation.__init__O  sE      +++F%H!5f!= 	r   r  r:  r   r   c                 h   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |d|      }|r|j                  n|d   }| j                  |      }d}	|Yt        j                  j                  ||j                  dd dd	      }
t        | j                   j                  
      } ||
|      }	|s|r
|f|dd z   }n	|f|dd z   }|	|	f|z   S |S t        |	||r|j                  d      S dd      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
        >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr(  r)  rG  FrH  )ignore_indexr   )r<  r=  r{   
attentions)r(   r   r+  r7  r=   r  r{   rn  r   r   rL  r   r   semantic_loss_ignore_indexr   )rF   r  r:  r   r   r,  r?  encoder_hidden_statesr=  r<  upsampled_logitsloss_fctr0  s                r   rJ   z*MobileViTV2ForSemanticSegmentation.forwardY  ss   N %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO""!%# # 
 :E 5 5'RS*''(=>!}}88V\\"#.Zu  9   (T[[5[5[\H,f5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r   r@  )rK   rL   rM   r   r<   r   rO   rP   rN   r   r   rJ   rQ   rR   s   @r   rl  rl  I  s    0 T   -1&*,0#'L
llT)L
 t#L
 #Tk	L

 D[L
 
(	(L
 L
r   rl  )r3  rl  r   r  )rV   N)1rc   rO   r   torch.nnr    r   r  activationsr   modeling_layersr   modeling_outputsr	   r
   r   r   modeling_utilsr   utilsr   r   configuration_mobilevitv2r   
get_loggerrK   loggerr   r   r   r%   r  r'   rT   re   rr   r   r   r   r   r   r  r   r3  rB  rO  rd  rl  __all__r   r   r   <module>r     s    !   % & ! 9  . , 8 
		H	%
# 
 
C$J 
RU 
 ).fe - - - -Y^ -
=299 =B-F")) -Fb		 .<RYY <~&RYY &R")) 6RYY 0o1 odcp cpL & & &0 F
1 F
 F
R 5
(B 5
5
rRYY 09bii 9z299 8 
X
)C X

X
vr   