
    iz              	       d   d Z ddlZddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ  e       r	ddlmZmZ nd Zd Z ej4                  e      Ze ed       G d de                    Ze ed       G d de                    Ze ed       G d de                    Z G d dej@                        Z! G d dej@                        Z" G d dej@                        Z#dBd ejH                  d!e%d"e&d#ejH                  fd$Z' G d% d&ej@                        Z( G d' d(ej@                        Z) G d) d*ej@                        Z* G d+ d,ej@                        Z+ G d- d.ej@                        Z, G d/ d0ej@                        Z- G d1 d2ej@                        Z. G d3 d4ej@                        Z/ G d5 d6ej@                        Z0e G d7 d8e             Z1e G d9 d:e1             Z2 ed;       G d< d=e1             Z3 ed>       G d? d@e	e1             Z4g dAZ5y)Cz9PyTorch Dilated Neighborhood Attention Transformer model.    N)	dataclass)nn   )ACT2FN)BackboneMixin)BackboneOutput)PreTrainedModel)ModelOutputOptionalDependencyNotAvailableauto_docstringis_natten_availableloggingrequires_backends   )DinatConfig)
natten2davnatten2dqkrpbc                      t               Nr   argskwargss     d/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/dinat/modeling_dinat.pyr   r   )       ,..    c                      t               r   r   r   s     r   r   r   ,   r   r   zO
    Dinat encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZe	ej                  df   dz  ed<   dZ
e	ej                  df   dz  ed<   dZe	ej                  df   dz  ed<   y)DinatEncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r!   torchFloatTensor__annotations__r"   tupler#   r$    r   r   r    r    6   s}     37u((4/6:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr   r    zW
    Dinat model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)	DinatModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr!   pooler_output.r"   r#   r$   )r%   r&   r'   r(   r!   r)   r*   r+   r0   r"   r,   r#   r$   r-   r   r   r/   r/   L   s    	 37u((4/6.2M5$$t+2:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr   r/   z1
    Dinat outputs for image classification.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)	DinatImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlosslogits.r"   r#   r$   )r%   r&   r'   r(   r3   r)   r*   r+   r4   r"   r,   r#   r$   r-   r   r   r2   r2   e   s     &*D%

d
")'+FE$+:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr   r2   c                   f     e Zd ZdZ fdZdej                  dz  deej                     fdZ	 xZ
S )DinatEmbeddingsz6
    Construct the patch and position embeddings.
    c                     t         |           t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                        | _
        y r   )super__init__DinatPatchEmbeddingspatch_embeddingsr   	LayerNorm	embed_dimnormDropouthidden_dropout_probdropoutselfconfig	__class__s     r   r9   zDinatEmbeddings.__init__   sG     4V <LL!1!12	zz&"<"<=r   pixel_valuesNreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r;   r>   rA   )rC   rF   
embeddingss      r   forwardzDinatEmbeddings.forward   s4    **<8
YYz*
\\*-
r   )r%   r&   r'   r(   r9   r)   r*   r,   TensorrJ   __classcell__rE   s   @r   r6   r6      s4    >E$5$5$< u||AT r   r6   c                   `     e Zd ZdZ fdZdej                  dz  dej                  fdZ xZ	S )r:   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
    Transformer.
    c           
      P   t         |           |j                  }|j                  |j                  }}|| _        |dk(  rnt        d      t        j                  t        j                  | j                  |dz  ddd      t        j                  |dz  |ddd            | _	        y )N   z2Dinat only supports patch size of 4 at the moment.   r   r   rQ   rQ   r   r   )kernel_sizestridepadding)
r8   r9   
patch_sizenum_channelsr=   
ValueErrorr   
SequentialConv2d
projection)rC   rD   rX   rY   hidden_sizerE   s        r   r9   zDinatPatchEmbeddings.__init__   s    &&
$*$7$79I9Ik(? QRR--IId'')9vV\flmIIkQ&PV`fg
r   rF   NrG   c                     |j                   \  }}}}|| j                  k7  rt        d      | j                  |      }|j	                  dddd      }|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   rQ   r   r   )shaperY   rZ   r]   permute)rC   rF   _rY   heightwidthrI   s          r   rJ   zDinatPatchEmbeddings.forward   s`    )5););&<4,,,w  __\2
''1a3
r   )
r%   r&   r'   r(   r9   r)   r*   rK   rJ   rL   rM   s   @r   r:   r:      s/    
"	E$5$5$< 	 	r   r:   c                        e Zd ZdZej
                  fdedej                  ddf fdZde	j                  de	j                  fdZ xZS )	DinatDownsamplerz
    Convolutional Downsampling Layer.

    Args:
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    dim
norm_layerrG   Nc                     t         |           || _        t        j                  |d|z  dddd      | _         |d|z        | _        y )NrQ   rR   rS   rT   F)rU   rV   rW   bias)r8   r9   rg   r   r\   	reductionr>   )rC   rg   rh   rE   s      r   r9   zDinatDownsampler.__init__   sE    3CVF\binoq3w'	r   input_featurec                     | j                  |j                  dddd            j                  dddd      }| j                  |      }|S )Nr   r   r   rQ   )rk   ra   r>   )rC   rl   s     r   rJ   zDinatDownsampler.forward   sJ    }'<'<Q1a'HIQQRSUVXY[\]		-0r   )r%   r&   r'   r(   r   r<   intModuler9   r)   rK   rJ   rL   rM   s   @r   rf   rf      sJ     :< (C (RYY ($ (U\\ ell r   rf   input	drop_probtrainingrG   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

            r   r   )r   )dtypedevice)r`   ndimr)   randru   rv   floor_div)rp   rq   rr   	keep_probr`   random_tensoroutputs          r   	drop_pathr~      s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
DinatDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nrq   rG   c                 0    t         |           || _        y r   )r8   r9   rq   )rC   rq   rE   s     r   r9   zDinatDropPath.__init__   s    "r   r"   c                 D    t        || j                  | j                        S r   )r~   rq   rr   rC   r"   s     r   rJ   zDinatDropPath.forward   s    FFr   c                      d| j                    S )Nzp=)rq   rC   s    r   
extra_reprzDinatDropPath.extra_repr   s    DNN#$$r   r   )r%   r&   r'   r(   floatr9   r)   rK   rJ   strr   rL   rM   s   @r   r   r      sG    b#%$, #$ #GU\\ Gell G%C %r   r   c                   j     e Zd Z fdZ	 ddej
                  dedz  deej
                     fdZ xZ	S )NeighborhoodAttentionc                 *   t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        || _        || _        t        j                  t        j                  |d| j                  z  dz
  d| j                  z  dz
              | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j&                  |j(                        | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()rQ   r   )rj   )r8   r9   rZ   num_attention_headsrn   attention_head_sizeall_head_sizerU   dilationr   	Parameterr)   zerosrpbLinearqkv_biasquerykeyvaluer?   attention_probs_dropout_probrA   rC   rD   rg   	num_headsrU   r   rE   s         r   r9   zNeighborhoodAttention.__init__   sD   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP&  <<ID<L<L8Lq8PTUX\XhXhThklTl noYYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr   r"   output_attentionsNrG   c                    |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }|t        j                  | j                        z  }t        ||| j                  | j                  | j                        }	t        j                  j!                  |	d      }
| j#                  |
      }
t%        |
|| j                  | j                        }|j'                  ddddd      j)                         }|j+                         d d | j,                  fz   }|j                  |      }|r||
f}|S |f}|S )	Nr   rQ   )rg   r   r   rP   )r`   r   viewr   r   	transposer   r   mathsqrtr   r   rU   r   r   
functionalsoftmaxrA   r   ra   
contiguoussizer   )rC   r"   r   
batch_size
seq_lengthrb   query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                 r   rJ   zNeighborhoodAttention.forward  s   
 %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 "DIId.F.F$GG )i4K[K[]a]j]jk --//0@b/I ,,7"?KAQAQSWS`S`a%--aAq!<GGI"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r   F
r%   r&   r'   r9   r)   rK   boolr,   rJ   rL   rM   s   @r   r   r      s@    G2 */,||,  $;, 
u||		,r   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )NeighborhoodAttentionOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y r   )r8   r9   r   r   denser?   r   rA   rC   rD   rg   rE   s      r   r9   z$NeighborhoodAttentionOutput.__init__7  s6    YYsC(
zz&"E"EFr   r"   input_tensorrG   c                 J    | j                  |      }| j                  |      }|S r   r   rA   )rC   r"   r   s      r   rJ   z#NeighborhoodAttentionOutput.forward<  s$    

=1]3r   r%   r&   r'   r9   r)   rK   rJ   rL   rM   s   @r   r   r   6  s2    G
U\\  RWR^R^ r   r   c                   j     e Zd Z fdZ	 ddej
                  dedz  deej
                     fdZ xZ	S )NeighborhoodAttentionModulec                 l    t         |           t        |||||      | _        t	        ||      | _        y r   )r8   r9   r   rC   r   r}   r   s         r   r9   z$NeighborhoodAttentionModule.__init__D  s0    )&#y+xX	1&#>r   r"   r   NrG   c                 f    | j                  ||      }| j                  |d   |      }|f|dd  z   }|S Nr   r   )rC   r}   )rC   r"   r   self_outputsattention_outputr   s         r   rJ   z#NeighborhoodAttentionModule.forwardI  sC    
 yy0AB;;|AF#%QR(88r   r   r   rM   s   @r   r   r   C  s?    ? */||  $; 
u||		r   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DinatIntermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r8   r9   r   r   rn   	mlp_ratior   
isinstance
hidden_actr   r   intermediate_act_fnr   s      r   r9   zDinatIntermediate.__init__U  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r   r"   rG   c                 J    | j                  |      }| j                  |      }|S r   )r   r   r   s     r   rJ   zDinatIntermediate.forward]  s&    

=100?r   r   rM   s   @r   r   r   T  s#    9U\\ ell r   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DinatOutputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y r   )
r8   r9   r   r   rn   r   r   r?   r@   rA   r   s      r   r9   zDinatOutput.__init__d  sF    YYs6#3#3c#9:C@
zz&"<"<=r   r"   rG   c                 J    | j                  |      }| j                  |      }|S r   r   r   s     r   rJ   zDinatOutput.forwardi  s$    

=1]3r   r   rM   s   @r   r   r   c  s#    >
U\\ ell r   r   c            	            e Zd Zd fd	Zd Z	 d	dej                  dedz  deej                  ej                  f   fdZ	 xZ
S )

DinatLayerc                    t         |           |j                  | _        |j                  | _        || _        | j                  | j                  z  | _        t        j                  ||j                        | _	        t        |||| j                  | j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t!        ||      | _        t%        ||      | _        |j(                  dkD  r?t        j*                  |j(                  t-        j.                  d|f      z  d      | _        y d | _        y )Neps)rU   r   rt   r   rQ   T)requires_grad)r8   r9   chunk_size_feed_forwardrU   r   window_sizer   r<   layer_norm_epslayernorm_beforer   	attentionr   Identityr~   layernorm_afterr   intermediater   r}   layer_scale_init_valuer   r)   oneslayer_scale_parameters)rC   rD   rg   r   r   drop_path_raterE   s         r   r9   zDinatLayer.__init__p  s(   '-'E'E$!-- ++dmm; "Sf6K6K L4C0@0@4==
 ;I3:N~6TVT_T_Ta!||CV5J5JK-fc:!&#. ,,q0 LL66QH9MM]ab 	#  	#r   c                     | j                   }d}||k  s||k  rJdx}}t        d||z
        }t        d||z
        }	dd||||	f}t        j                  j	                  ||      }||fS )N)r   r   r   r   r   r   r   )r   maxr   r   pad)
rC   r"   rc   rd   r   
pad_valuespad_lpad_tpad_rpad_bs
             r   	maybe_padzDinatLayer.maybe_pad  s    &&'
K5;#6EE;./E;/0EQueU;JMM--mZHMj((r   r"   r   NrG   c                    |j                         \  }}}}|}| j                  |      }| j                  |||      \  }}|j                  \  }	}
}}	| j	                  ||      }|d   }|d   dkD  xs |d   dkD  }|r|d d d |d |d d f   j                         }| j                  | j                  d   |z  }|| j                  |      z   }| j                  |      }| j                  | j                  |            }| j                  | j                  d   |z  }|| j                  |      z   }|r	||d   f}|S |f}|S )N)r   r   r      r   )r   r   r   r`   r   r   r   r~   r   r}   r   )rC   r"   r   r   rc   rd   channelsshortcutr   rb   
height_pad	width_padattention_outputsr   
was_paddedlayer_outputlayer_outputss                    r   rJ   zDinatLayer.forward  s|   
 /<.@.@.B+
FE8 --m<$(NN=&%$P!z&3&9&9#:y! NN=L]N^,Q/]Q&;*Q-!*;
/7F7FUFA0EFQQS&&2#::1=@PP 4>>2B#CC++M:{{4#4#4\#BC&&266q9LHL$t~~l'CC@Q'8';< YeWfr   )rt   r   )r%   r&   r'   r9   r   r)   rK   r   r,   rJ   rL   rM   s   @r   r   r   o  sM    
(	) */$||$  $;$ 
u||U\\)	*	$r   r   c                   j     e Zd Z fdZ	 ddej
                  dedz  deej
                     fdZ xZ	S )
DinatStagec                 <   t         	|           || _        || _        t	        j
                  t        |      D cg c]  }t        |||||   ||          c}      | _        |% ||t        j                        | _
        d| _        y d | _
        d| _        y c c}w )N)rD   rg   r   r   r   )rg   rh   F)r8   r9   rD   rg   r   
ModuleListranger   layersr<   
downsamplepointing)
rC   rD   rg   depthr   	dilationsr   r   irE   s
            r   r9   zDinatStage.__init__  s    mm u	  !'&q\#1!#4	
 !(SR\\JDO  #DO%	s   Br"   r   NrG   c                     |j                         \  }}}}t        | j                        D ]  \  }} |||      }|d   } |}	| j                  | j                  |	      }||	f}
|r|
dd  z  }
|
S r   )r   	enumerater   r   )rC   r"   r   rb   rc   rd   r   layer_moduler   !hidden_states_before_downsamplingstage_outputss              r   rJ   zDinatStage.forward  s    
 ,00265!(5 	-OA|(8IJM)!,M	- -:)??& OO,MNM&(IJ]12..Mr   r   r   rM   s   @r   r   r     s?    8 */||  $; 
u||		r   r   c                   z     e Zd Z fdZ	 	 	 	 d
dej
                  dedz  dedz  dedz  dedz  deez  fd	Z	 xZ
S )DinatEncoderc                    t         |           t        |j                        | _        || _        t        j                  d|j                  t        |j                        d      D cg c]  }|j                          }}t        j                  t        | j                        D cg c]  }t        |t        |j                   d|z  z        |j                  |   |j"                  |   |j$                  |   |t        |j                  d |       t        |j                  d |dz           || j                  dz
  k  rt&        nd        c}      | _        y c c}w c c}w )Nr   cpu)rv   rQ   r   )rD   rg   r   r   r   r   r   )r8   r9   lendepths
num_levelsrD   r)   linspacer   sumitemr   r   r   r   rn   r=   r   r   rf   levels)rC   rD   xdpri_layerrE   s        r   r9   zDinatEncoder.__init__  s,   fmm,!&63H3H#fmmJ\ej!klAqvvxllmm  %T__5  !F,,q'z9: --0$..w7$..w7#&s6=='+B'Cc&--XeZadeZeJfFg#h4;dooPQ>Q4Q/X\
 ms   )E(B#Er"   r   Noutput_hidden_states(output_hidden_states_before_downsamplingreturn_dictrG   c                    |rdnd }|rdnd }|rdnd }|r |j                  dddd      }	||fz  }||	fz  }t        | j                        D ]l  \  }
} |||      }|d   }|d   }|r#|r!|j                  dddd      }	||fz  }||	fz  }n$|r"|s |j                  dddd      }	||fz  }||	fz  }|se||dd  z  }n |st        d |||fD              S t	        ||||      S )Nr-   r   r   r   rQ   c              3   &   K   | ]	  }||  y wr   r-   ).0vs     r   	<genexpr>z'DinatEncoder.forward.<locals>.<genexpr>!  s     mq_`_lms   )r!   r"   r#   r$   )ra   r  r  r,   r    )rC   r"   r   r  r  r  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsreshaped_hidden_stater   r  r   r  s                 r   rJ   zDinatEncoder.forward  s]    #7BD+?RT"$5b4$1$9$9!Q1$E!-!11&+@*BB&(5 	9OA|(8IJM)!,M0=a0@-#(P(I(Q(QRSUVXY[\(]%!&G%II!*/D.FF*%.V(5(=(=aAq(I%!m%55!*/D.FF* #}QR'88#%	9( m]4EGZ$[mmm!++*#=	
 	
r   )FFFT)r%   r&   r'   r9   r)   rK   r   r,   r    rJ   rL   rM   s   @r   r  r    so    
. */,1@E#'.
||.
  $;.
 #Tk	.

 37+.
 D[.
 
#	#.
r   r  c                   $    e Zd ZU eed<   dZdZdZy)DinatPreTrainedModelrD   dinatrF   )imageN)r%   r&   r'   r   r+   base_model_prefixmain_input_nameinput_modalitiesr-   r   r   r!  r!  +  s    $O!r   r!  c                        e Zd Zd
 fd	Zd Ze	 	 	 	 ddej                  dz  dedz  dedz  dedz  de	e
z  f
d	       Z xZS )
DinatModelc                    t         |   |       t        | dg       || _        t	        |j
                        | _        t        |j                  d| j                  dz
  z  z        | _	        t        |      | _        t        |      | _        t        j                  | j                  |j                         | _        |rt        j$                  d      nd| _        | j)                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        nattenrQ   r   r   N)r8   r9   r   rD   r
  r  r  rn   r=   num_featuresr6   rI   r  encoderr   r<   r   	layernormAdaptiveAvgPool1dpooler	post_init)rC   rD   add_pooling_layerrE   s      r   r9   zDinatModel.__init__5  s    
 	 $
+fmm, 0 0119L3M MN)&1#F+d&7&7V=R=RS1Bb**1- 	r   c                 .    | j                   j                  S r   rI   r;   r   s    r   get_input_embeddingszDinatModel.get_input_embeddingsK      ///r   NrF   r   r  r  rG   c                 R   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  |      }| j                  ||||      }|d   }| j                  |      }d }	| j                  G| j                  |j                  dd      j                  dd            }	t        j                  |	d      }	|s||	f|dd  z   }
|
S t        ||	|j                  |j                  |j                        S )Nz You have to specify pixel_valuesr   r  r  r   r   rQ   )r!   r0   r"   r#   r$   )rD   r   r  use_return_dictrZ   rI   r,  r-  r/  flattenr   r)   r/   r"   r#   r$   )rC   rF   r   r  r  r   embedding_outputencoder_outputssequence_outputpooled_outputr}   s              r   rJ   zDinatModel.forwardN  sA    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,/!5#	 ' 
 *!,..9;;" KK(?(?1(E(O(OPQST(UVM!MM-;M%}58KKFM-')77&11#2#I#I
 	
r   )T)NNNN)r%   r&   r'   r9   r4  r   r)   r*   r   r,   r/   rJ   rL   rM   s   @r   r(  r(  3  s|    ,0  26)-,0#'-
''$.-
  $;-
 #Tk	-

 D[-
 
!	!-
 -
r   r(  z
    Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    c                        e Zd Z fdZe	 	 	 	 	 d
dej                  dz  dej                  dz  dedz  dedz  dedz  de	e
z  fd	       Z xZS )DinatForImageClassificationc                 X   t         |   |       t        | dg       |j                  | _        t	        |      | _        |j                  dkD  r4t        j                  | j
                  j                  |j                        nt        j                         | _
        | j                          y )Nr*  r   )r8   r9   r   
num_labelsr(  r"  r   r   r+  r   
classifierr0  rB   s     r   r9   z$DinatForImageClassification.__init__  s     $
+ ++'
 FLEVEVYZEZBIIdjj--v/@/@A`b`k`k`m 	
 	r   NrF   labelsr   r  r  rG   c                 T   ||n| j                   j                  }| j                  ||||      }|d   }| j                  |      }	d}
|| j	                  ||	| j                         }
|s|	f|dd z   }|
|
f|z   S |S t        |
|	|j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr7  r   rQ   )r3   r4   r"   r#   r$   )	rD   r8  r"  rB  loss_functionr2   r"   r#   r$   )rC   rF   rC  r   r  r  r   r   r=  r4   r3   r}   s               r   rJ   z#DinatForImageClassification.forward  s      &1%<k$++B]B]**/!5#	  
  
/%%ffdkkBDY,F)-)9TGf$EvE)!//))#*#A#A
 	
r   )NNNNN)r%   r&   r'   r9   r   r)   r*   
LongTensorr   r,   r2   rJ   rL   rM   s   @r   r?  r?    s       26*.)-,0#'*
''$.*
   4'*
  $;	*

 #Tk*
 D[*
 
+	+*
 *
r   r?  zL
    NAT backbone, to be used with frameworks like DETR and MaskFormer.
    c                   x     e Zd Z fdZd Ze	 	 	 d
dej                  dedz  dedz  dedz  de	f
d	       Z
 xZS )DinatBackbonec           	         t         |   |       t        | dg       t        |      | _        t        |      | _        |j                  gt        t        |j                              D cg c]  }t        |j                  d|z  z         c}z   | _        i }t        | j                  | j                        D ]  \  }}t!        j"                  |      ||<    t!        j$                  |      | _        | j)                          y c c}w )Nr*  rQ   )r8   r9   r   r6   rI   r  r,  r=   r   r
  r  rn   r+  zipout_featuresr   r   r<   
ModuleDicthidden_states_normsr0  )rC   rD   r   rM  stagerY   rE   s         r   r9   zDinatBackbone.__init__  s     $
+)&1#F+#--.X]^abhbobo^pXq1rST#f6F6FA6M2N1rr !#&t'8'8$--#H 	DE<)+l)C&	D#%==1D#E  	 2ss   *"Dc                 .    | j                   j                  S r   r3  r   s    r   r4  z"DinatBackbone.get_input_embeddings  r5  r   NrF   r  r   r  rG   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }| j                  ||ddd      }|j                  }d}	t        | j                  |      D ]  \  }
}|
| j                  v s|j                  \  }}}}|j                  dddd      j                         }|j                  |||z  |      } | j                  |
   |      }|j                  ||||      }|j                  dddd      j                         }|	|fz  }	 |s|	f}|r||j                  fz  }|S t!        |	|r|j                  nd|j"                  	      S )
a  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 512, 7, 7]
        ```NT)r   r  r  r  r-   r   rQ   r   r   )feature_mapsr"   r#   )rD   r8  r  r   rI   r,  r$   rJ  stage_namesrK  r`   ra   r   r   rM  r"   r   r#   )rC   rF   r  r   r  r   r:  r   r"   rQ  rN  hidden_stater   rY   rc   rd   r}   s                    r   rJ   zDinatBackbone.forward  s   H &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq??<8,,/!%59  
  66#&t'7'7#G 	0E<))):F:L:L7
L&%+33Aq!Q?JJL+00Ve^\Z>t77>|L+00VULY+33Aq!Q?JJL/	0 "_F#70022M%3G'//T))
 	
r   )NNN)r%   r&   r'   r9   r4  r   r)   rK   r   r   rJ   rL   rM   s   @r   rH  rH    ss    $0  -1)-#'J
llJ
 #TkJ
  $;	J

 D[J
 
J
 J
r   rH  )r?  r(  r!  rH  )rt   F)6r(   r   dataclassesr   r)   r   activationsr   backbone_utilsr   modeling_outputsr   modeling_utilsr	   utilsr
   r   r   r   r   r   configuration_dinatr   natten.functionalr   r   
get_loggerr%   loggerr    r/   r2   ro   r6   r:   rf   rK   r   r   r~   r   r   r   r   r   r   r   r   r  r!  r(  r?  rH  __all__r-   r   r   <module>r_     s{   @  !   ! + . -  - ;;// 
		H	% 
H H H  
H{ H H& 
H H H*bii ,!299 !Hryy 0U\\ e T V[VbVb  %BII %CBII CL
")) 
")) "		 	")) 	D DN, ,^C
299 C
L "? " " H
% H
 H
V <
"6 <
<
~ 
a
M#7 a

a
H ar   