
    i!                       d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z
 ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'  e$jP                  e)      Z*dejV                  de,de,fdZ- G d dej\                        Z/ G d dej`                        Z1 G d de      Z2 G d de      Z3 G d dej`                        Z4 G d d ej`                        Z5e# G d! d"e!             Z6 G d# d$e6      Z7 G d% d&e6      Z8e# G d' d(e6             Z9 e#d)*       G d+ d,e6e             Z: e#d-*       G d. d/e6             Z;e# G d0 d1e6             Z< G d2 d3e6      Z= G d4 d5e6e      Z>g d6Z?y)7zPyTorch MVP model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringloggingtorch_compilable_check   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r   r    shifted_input_idss       `/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/mvp/modeling_mvp.pyshift_tokens_rightr*   .   s}     "++IOO<(CRC0668ae4adLMM""#4#<lK    c                   v     e Zd ZdZdedef fdZ	 d
dej                  dedej                  dz  f fd	Z xZ	S )MvpLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y N   )offsetsuper__init__)selfr.   r/   	__class__s      r)   r5   z&MvpLearnedPositionalEmbedding.__init__D   s$     $++5}Er+   Nr   past_key_values_lengthposition_idsc                 $   |a|j                   dd \  }}t        j                  |||z   t        j                  | j                  j
                        j                  |d      }n|j                  d      }t        | %  || j                  z         S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr2   )dtypedevicer"   r   )r$   torcharangelongweightr<   expand	unsqueezer4   forwardr3   )r6   r   r8   r9   bszseq_lenr7   s         r)   rC   z%MvpLearnedPositionalEmbedding.forwardJ   s    
 $??2A.LC <<&(>(HPUPZPZcgcncncucufS"o  (11!4Lw|dkk9::r+   )r   N)
__name__
__module____qualname____doc__intr5   r=   TensorrC   __classcell__r7   s   @r)   r-   r-   ?   sW    Fs F3 F mq;;?B;V[VbVbeiVi; ;r+   r-   c                   p    e Zd ZdZ	 	 	 	 ddedededz  dedz  dedz  dedz  f fd	Z	 	 	 	 	 	 dd
ej                  dej                  dz  de
dz  dej                  dz  dej                  dz  dedej                  dz  deej                  ej                  dz  eej                     dz  f   fdZ xZS )MvpAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _        || _	        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rT   )r4   r5   rP   rQ   rR   head_dimr&   scalingrS   rU   r   Lineark_projv_projq_projout_proj)r6   rP   rQ   rR   rS   rT   rU   r7   s          r)   r5   zMvpAttention.__init__]   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBr+   hidden_stateskey_value_statespast_key_valuesattention_maskattn_promptoutput_attentionscache_positionreturnc                    |du}|j                         \  }	}
}| j                  |      | j                  z  }d}|St        |t              rA|j
                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|I|rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j                  |      }|j                  |	d| j                   | j"                        j%                  dd      }|j                  |	d| j                   | j"                        j%                  dd      }|T|s|nd}j'                  ||| j                  d|i      \  }}|r)t        |t              rd|j
                  | j                  <   |t)        j*                  |d   j-                  |	ddd      |gd	      }t)        j*                  |d   j-                  |	ddd      |gd	      }|\t)        j.                  |	d|
|d   j                  d            j1                  |j2                        }t)        j*                  ||gd	      }|	| j                   z  d| j"                  f}|j                  |	|
| j                   | j"                        j%                  dd      } |j4                  | } |j4                  | } |j4                  | }|j                  d      }t)        j6                  ||j%                  dd            }|j                         |	| j                   z  |
|fk7  r/t9        d
|	| j                   z  |
|f d|j                                |{|j                         |	d|
|fk7  r#t9        d|	d|
|f d|j                                |j                  |	| j                   |
|      |z   }|j                  |	| j                   z  |
|      }t:        j<                  j?                  |d	      }|r?|j                  |	| j                   |
|      }|j                  |	| j                   z  |
|      }nd}t:        j<                  jA                  || j@                  | jB                        }t)        j6                  ||      }|j                         |	| j                   z  |
| j"                  fk7  r7t9        d|	| j                   |
| j"                  f d|j                                |j                  |	| j                   |
| j"                        }|j%                  dd      }|j5                  |	|
| jD                        }| jG                  |      }||fS )z#Input shape: Batch x Time x ChannelNFr"   r   r2   re   Tr   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size ptrainingz `attn_output` should be of size )$sizer]   rY   
isinstancer   
is_updatedgetrU   cross_attention_cacheself_attention_cachelayerskeysvaluesr[   r\   viewrQ   rX   	transposeupdater=   catrA   zerostor<   reshapebmmr&   r   
functionalsoftmaxrR   rl   rP   r^   )r6   r_   r`   ra   rb   rc   rd   re   is_cross_attentionrD   tgt_len_query_statesro   curr_past_key_valuescurrent_states
key_statesvalue_statesprompt_mask
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                            r)   rC   zMvpAttention.forwardz   sN    .T9',,.Wa {{=1DLL@
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL*7It+?+F+Fdnn?OQ_>`,(
L &*_FY*ZAEO..t~~>"KN$9$9#r2r$JJ#W]^_J 99k!n&;&;CR&Ll%[abcL)#kk#q';q>;N;Nq;QRUUVdVkVkl!&K+Hr!SDNN*B>
#((gt~~t}}U__`acde+|++Z8'Z''4
+|++Z8//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK0111r+   )g        FTN)NNNNFN)rF   rG   rH   rI   rJ   floatboolr5   r=   rK   r
   tuplerC   rL   rM   s   @r)   rO   rO   Z   s.   G !$"' !%CC C 	C
 4KC TkC $;C@ 15(,.2+/"'.2s2||s2  ,,-s2 	s2
 t+s2 \\D(s2  s2 t+s2 
u||U\\D0%2E2LL	Ms2r+   rO   c                        e Zd Zdef fdZ	 d
dej                  dej                  dej                  dedz  deej                  ej                  dz  f   f
d	Z	 xZ
S )MvpEncoderLayerconfigc                 f   t         |           |j                  | _        t	        | j                  |j
                  |j                        | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)rP   rQ   rR   )r4   r5   d_modelrP   rO   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrR   r	   activation_functionactivation_fnactivation_dropoutrZ   encoder_ffn_dimfc1fc2final_layer_normr6   r   r7   s     r)   r5   zMvpEncoderLayer.__init__   s    %nn44,,

 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r+   r_   rb   self_attn_promptrd   Nrf   c                    |}| j                  ||||      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|j                  t        j                  k(  rt        j                  |      j                         s#t        j                   |      j                         rEt        j"                  |j                        j$                  dz
  }t        j&                  || |      }||fS )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r_   rb   rc   rd   rj   i  )minmax)r   r   r~   rR   rl   r   r   r   r   r   r   r;   r=   float16isinfanyisnanfinfor   clamp)r6   r_   rb   r   rd   residualr   clamp_values           r)   rC   zMvpEncoderLayer.forward  s|   $ !&*nn')(/	 '5 '
#| --mt||VZVcVc-d =011-@ **488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<%--/KK&**,M0J0N0N0P++m&9&9:>>EK!KKK<[YMl**r+   F)rF   rG   rH   r   r5   r=   FloatTensorr   r   rC   rL   rM   s   @r)   r   r      sz    =y =* */++((++ ))++  ++	++
  $;++ 
u  %"3"3d"::	;++r+   r   c                       e Zd Zddef fdZ	 	 	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  dej                  dz  de	ej                  e	ej                  ej                  f   dz  f   fdZ xZS )MvpDecoderLayerNr   c                    t         |           |j                  | _        t	        | j                  |j
                  |j                  d|      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t	        | j                  |j
                  |j                  d|      | _        t        j                  | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                  | j                        | _        y )NT)rP   rQ   rR   rS   rU   )rR   rS   rU   )r4   r5   r   rP   rO   decoder_attention_headsr   r   rR   r	   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normrZ   decoder_ffn_dimr   r   r   )r6   r   rU   r7   s      r)   r5   zMvpDecoderLayer.__init__0  s   %nn44,,
 ~~#F$>$>?"(";";$&LL$@!(NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r+   r_   rb   encoder_hidden_statesencoder_attention_maskr   cross_attn_promptra   rd   	use_cachere   rf   c                 0   |}| j                  ||||||
      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }d}|h|}| j                  ||||||      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|f}|r|||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r_   ra   rb   rc   rd   re   rj   N)r_   r`   rb   rc   ra   rd   )r   r   r~   rR   rl   r   r   r   r   r   r   r   r   )r6   r_   rb   r   r   r   r   ra   rd   r   re   r   self_attn_weightscross_attn_weightsoutputss                  r)   rC   zMvpDecoderLayer.forwardL  s   > ! ,0>>'+)(/) ,: ,
(( --mt||VZVcVc-d =011-@ " ,$H040A0A+!65- /"3 1B 1-M- MM11-4<<Z^ZgZg1hM$}4M 88GM !**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m< ")+=>>Gr+   N)	NNNNNNFTN)rF   rG   rH   r   r5   r=   rK   r
   r   r   r   rC   rL   rM   s   @r)   r   r   /  s   =y => /3596:0415(,).!%.2M||M t+M  %||d2	M
 !&t 3M  ,,-M !<<$.M M  $;M $;M t+M 
u  %(9(95;L;L(L"MPT"TT	UMr+   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z	 xZ
S )
MvpClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                     t         |           t        j                  ||      | _        t        j
                  |      | _        t        j                  ||      | _        y )Nrk   )r4   r5   r   rZ   denseDropoutrR   r^   )r6   r   r   r   r   r7   s        r)   r5   zMvpClassificationHead.__init__  sD     	YYy)4
zzN3		)[9r+   r_   rf   c                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r   )rR   r   r=   tanhr^   )r6   r_   s     r)   rC   zMvpClassificationHead.forward  sN    ]3

=1

=1]3m4r+   )rF   rG   rH   rI   rJ   r   r5   r=   rK   rC   rL   rM   s   @r)   r   r     sL    7
:
: 
: 	
:
 
:U\\ ell r+   r   c                   `     e Zd ZdZ fdZdej                  deej                     fdZ xZ	S )	MvpPromptz)Layer-wise prompt for encoder or decoder.c           	      8   t         |           |j                  | _        || _        || _        |j
                  |z  | _        t        j                  |j                        | _	        t        j                  |j                  |j
                        | _        t        j                  t        j                  |j
                  |j                        t        j                         t        j                  |j                  |dz  |j
                  z              | _        y )Nr   r2   )r4   r5   prompt_length
num_layersrQ   r   rX   r   r   rR   	Embeddingprompt_embedding
SequentialrZ   prompt_mid_dimGELUprompt_trans)r6   r   r   rQ   r7   s       r)   r5   zMvpPrompt.__init__  s    #11$")3zzFNN3 "V-A-A6>> RMMIIfnnf&;&;<GGIIIf++Z!^fnn-LM
r+   
prompt_idsrf   c                 *   | j                  | j                  |            }|j                  | j                  | j                  dz  | j
                  | j                        }| j                  |      }|j                  g d      j                  d      }|S )Nr2   )r   r2   r   r   )
r   r   rv   r   r   rQ   rX   rR   permutesplit)r6   r   prompts      r)   rC   zMvpPrompt.forward  sw    ""4#8#8#DET//11DdnnVZVcVcdf%-33A6r+   )
rF   rG   rH   rI   r5   r=   rK   r   rC   rL   rM   s   @r)   r   r     s+    3
%,, 53F r+   r   c                   B     e Zd ZU eed<   dZdZ fdZed        Z	 xZ
S )MvpPreTrainedModelr   modelTc                     t         |   |       t        |t              r t	        j
                  |j                         y y r   )r4   _init_weightsrn   MvpForConditionalGenerationinitzeros_final_logits_bias)r6   moduler7   s     r)   r   z MvpPreTrainedModel._init_weights  s2    f%f9:KK001 ;r+   c                     | j                   j                  }t        j                  g ddddd|gg| j                        }|j                  |      |d}|S )N)r      
      r2   r         r2   r<   )rb   r   )r   r   r=   tensorr<   ne)r6   	pad_tokenr   dummy_inputss       r)   r   zMvpPreTrainedModel.dummy_inputs  sW    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"
 r+   )rF   rG   rH   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   propertyr   rL   rM   s   @r)   r   r     s.    &*#2
  r+   r   c                        e Zd ZdZddedej                  dz  dedz  f fdZ	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
edz  dedz  dedz  deez  fdZ xZS )
MvpEncodera  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    Nr   embed_tokens
use_promptc                    t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _	        |j                  rt        j                  |      nd| _        t        j                  |j                   || j                        | _        t%        |j                  |      | _        t        j(                  t+        |j,                        D cg c]  }t/        |       c}      | _        t        j2                  |      | _        || _        |r7|j8                  | _        t;        ||j,                  |j<                        | _        d| _         | jC                          y c c}w )N      ?F)"r4   r5   rR   encoder_layerdrop	layerdropr   r   padding_idxmax_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler   r   
vocab_sizer   r-   embed_positions
ModuleListrangeencoder_layersr   rs   r   layernorm_embeddingr   r   r   r   r   gradient_checkpointing	post_init)r6   r   r   r   rP   r   r7   s         r)   r5   zMvpEncoder.__init__  s4    ~~11NN	!..$*$B$B!393I3I499Y/sLL):):ItGWGWX<** 
 mmeFLaLaFb$c_V%<$cd#%<<	#: $!'!5!5D$-%%..%D! ',# %ds   :F
r   rb   inputs_embedsrd   output_hidden_statesreturn_dictrf   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |$|}|j
                  }	|j                  d|	d         }n-| |j                         dd }	|dddddf   }nt	        d      || j                  |      | j                  z  }| j                  |      }
||
z   }| j                  |      }t        j                  j                  || j                  | j                        }| j                   rIt#        j$                  | j&                        j)                  | j*                        }| j-                  |      }|t/        | j                   ||      }|rdnd}|rdnd}t1        | j2                        D ]t  \  }}|r||fz   }d}| j                  r&t#        j4                  g       }|| j6                  k  rd	}|rd
}n" |||| j                   r|   nd|      }|d   }|sl||d   fz   }v |r||fz   }|st9        d |||fD              S t;        |||      S )a8  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer"   z5You have to specify either input_ids or inputs_embedsrj   )r   r  rb    FT)NN)r   rd   r   r   c              3   &   K   | ]	  }||  y wr   r  .0vs     r)   	<genexpr>z%MvpEncoder.forward.<locals>.<genexpr>  s     eqWXWdes   last_hidden_stater_   
attentions)r   rd   r  use_return_dictr&   r$   rv   rm   r   r  r  r  r   r~   rR   rl   r   r=   r>   r   r{   r<   r   r   	enumeraters   randr   r   r   )r6   r   rb   r  rd   r  r  kwargsinputinput_shape	embed_posr_   r   r   encoder_statesall_attentionsidxencoder_layerto_dropdropout_probabilitylayer_outputss                        r)   rC   zMvpEncoder.forward  s   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  ]%>cdd"E++K!r;r?;I&',,.s3K!!Q(+ETUU  --i84;K;KKM((/	%	100?--mt||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ %6{{+-N  40d"+DKK"8 	FC#!/=2B!BG}}&+jjn#&7"G , -!"?C&6s&;TX&7	! !.a 0 !/=3C2E!E/	F2  +}.>>Ne]NN$Seee+>Vd
 	
r+   NF)NNNNNN)rF   rG   rH   rI   r   r   r   r   r5   r=   
LongTensorrK   r   r   r   rC   rL   rM   s   @r)   r   r     s    y t8K `dgk`k F .2.226)-,0#'t
##d*t
 t+t
 ((4/	t

  $;t
 #Tkt
 D[t
 
	 t
r+   r   c                   D    e Zd ZdZddededz  f fdZ	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fdZ xZS )
MvpDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    r   r   Nc           	         t         |   |       |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  rt        j                  |j                        nd| _        t        j                  |j                   |j                  | j                        | _        t%        |j                  |j                        | _        t        j(                  t+        |j,                        D cg c]  }t/        ||       c}      | _        t        j2                  |j                        | _        || _        |r]|j8                  | _        t;        ||j,                  |j<                        | _        t;        ||j,                  |j<                        | _         d| _!        | jE                          y c c}w )Nr   )rU   F)#r4   r5   rR   decoder_layerdropr   r   r   r   max_target_positionsr  r  r  r   r  r   r   r  r   r-   r  r  r	  decoder_layersr   rs   r   r  r   r   r   r   r   r   r  r  )r6   r   r   ir7   s       r)   r5   zMvpDecoder.__init__  sd    ~~11!..$*$B$B!8>8N8N499V^^4TWLL):):FNNDL\L\]<**NN 
 mmSXY_YnYnSo$pa_Vq%I$pq#%<<#? $!'!5!5D$-%%..%D!
 &/%%..&D" ',#' %qs   Gr   rb   r   r   ra   r  r   rd   r  r  re   rf   c                 b   ||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }|
|
n| j                   j                  }
||t        d      |$|}|j                  }|j                  d|d         }n-| |j                         dd }|dddddf   }nt        d      || j                  |      | j                  z  }| j                  r%| j                  r|rt        j                  d       d}|rd|b|| j                   j                  r4t!        t#        | j                         t#        | j                               nt#        | j                         }||j%                         nd}|2t'        j(                  |||j                  d	   z   |j*                  
      }t-        | j                   ||||      }||t/        | j                   |||      }| j1                  ||      }||z   }| j3                  |      }t4        j6                  j9                  || j8                  | j                        }| j:                  rZt'        j(                  | j<                        j?                  | j*                        }| jA                  |      }| jC                  |      }|	rdnd}|rdnd}|r|dnd}tE        | jF                        D ]  \  }}|	r||fz  }| j                  r%t'        jH                  g       }|| jJ                  k  r? |||||| j:                  r|   nd| j:                  r|   nd||||
      }|d   }|s{||d	   fz  }|||d   fz  } |	r||fz  }|
stM        d |||||fD              S tO        |||||      S )aU  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer"   zEYou have to specify either decoder_input_ids or decoder_inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r   r   r   r   )r   r  rb   re   ra   )r   r  rb   r   rj   r  )r   r   r   ra   rd   r   re   r2   c              3   $   K   | ]  }|| 
 y wr   r  r  s     r)   r  z%MvpDecoder.forward.<locals>.<genexpr>d  s      = s   )r  ra   r_   r  cross_attentions)(r   rd   r  r   r  r&   r$   rv   rm   r   r  r  rl   loggerwarning_onceis_encoder_decoderr   r   get_seq_lengthr=   r>   r<   r   r   r  r  r   r~   rR   r   r   r{   r   r   r  rs   r  r   r   r   )r6   r   rb   r   r   ra   r  r   rd   r  r  re   r  r  r   r8   	positionsr_   r   r   r   all_hidden_statesall_self_attnsall_cross_attentionsr$  decoder_layerr'  r(  s                               r)   rC   zMvpDecoder.forward  s	   B 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>stt"E#//K!r;r?;I&',,.s3K!!Q(+Edee  --i84;K;KKM&&4==##p "	0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg!"\\&(>ATATUVAW(W`m`t`tN ,;;'))+
 !,1G1S%>{{+5&;	&" ((0FG	%	100?--mt||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ $ 6 6z B #7BD0d&7<Q<]rdh"+DKK"8 	@C#!m%55!}}&+jjn#&7)%'=;???"23"7PT=A__#4S#9RV /"3#-M *!,M =#3"55(4(]1-=,??(5	@:  -!11 ':K^]qr  
 9+++%1
 	
r+   r   )NNNNNNNNNNN)rF   rG   rH   rI   r   r   r5   r=   r*  rK   r   r
   r   r   rC   rL   rM   s   @r)   r,  r,    s#    y  dTk  H .2.2:>:>(,26!%)-,0#'.2|
##d*|
 t+|
  %0047	|

 !& 0 04 7|
 |
 ((4/|
 $;|
  $;|
 #Tk|
 D[|
 t+|
 
:	:|
r+   r,  c                       e Zd ZdgZdddZdef fdZd Zd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  deej                     d	z  ded	z  dej                  d	z  dej                  d	z  ded	z  ded	z  ded	z  ded	z  dej                  d	z  deez  fd       Z xZS )MvpModelr   zshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr   c                 N   t         |   |       |j                  |j                  }}|j                  | _        t        j                  ||j                  |      | _        t        ||j                        | _
        t        ||j                        | _        | j                          y r   )r4   r5   r   r  r   r   r   r   sharedr   encoderr,  decoderr  )r6   r   r   r  r7   s       r)   r5   zMvpModel.__init__z  s~     "("5"5v7H7HZ ++ll:v~~{K!&&*;*;<!&&*;*;< 	r+   c                     | j                   S r   )rA  r6   s    r)   get_input_embeddingszMvpModel.get_input_embeddings  s    {{r+   c                 ~    || _         | j                   | j                  _        | j                   | j                  _        y r   )rA  rB  r   rC  r6   values     r)   set_input_embeddingszMvpModel.set_input_embeddings  s)    $(KK!$(KK!r+   c                 *   | j                   sJ d       | j                  d       | j                  j                  j                  d       | j                  j                  j                  d       | j                  j
                  j                  d       y )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r   requires_grad_rB  r   rC  r   rE  s    r)   set_lightweight_tuningzMvpModel.set_lightweight_tuning  sj    j jjE"%%44T:%%44T:&&55d;r+   Nr   rb   decoder_input_idsdecoder_attention_maskencoder_outputsra   r  decoder_inputs_embedsr   rd   r  r  re   rf   c                 6   |D|B|t        d      t        || j                  j                  | j                  j                        }|
|
n| j                  j
                  }
||n| j                  j                  }|	|	n| j                  j                  }	||n| j                  j                  }|| j                  ||||
||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }| j                  |||d   ||||	|
|||      }|s||z   S t        |j                  |j                   |j"                  |j$                  |j&                  |j                  |j"                  |j$                  	      S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   rb   r  rd   r  r  r   r   r2   r  )r   rb   r   r   ra   r  r   rd   r  r  re   )r  ra   decoder_hidden_statesdecoder_attentionsr4  encoder_last_hidden_stater   encoder_attentions)r&   r*   r   r   r    rd   r  r   r  rB  rn   r   lenrC  r   r  ra   r_   r  r4  )r6   r   rb   rN  rO  rP  ra   r  rQ  r   rd   r  r  re   r  decoder_outputss                   r)   rC   zMvpModel.forward  s   V $)>)F  U  !34;;33T[[5W5W! 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO ,,'1"1!"4#1+//!5#) ' 
 "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r+   NNNNNNNNNNNNN)rF   rG   rH   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r5   rF  rJ  rM  r   r=   r*  rK   listr   r
   r   r   r   rC   rL   rM   s   @r)   r?  r?  r  s   *=)>&'6'6
y 0
<  .2.259:>:>(,26:>!%)-,0#'.2i
##d*i
 t+i
 !++d2	i

 !& 0 04 7i
 e//047i
 i
 ((4/i
  %0047i
 $;i
  $;i
 #Tki
 D[i
 t+i
  
#	#!i
 i
r+   r?  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )custom_introc            !           e Zd ZddiZdef fdZ	 ddededz  ded	ej                  f fd
Z
ded	dfdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                   dz  dej                  dz  dej                  dz  deej$                     dz  dedz  dej$                  dz  dej$                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                   dz  d	eez  fd       Zdej                   fdZ xZS )r   lm_head.weightzmodel.shared.weightr   c                 x   t         |   |       t        |      | _        | j	                  dt        j                  d| j                  j                  j                  f             t        j                  |j                  | j                  j                  j                  d      | _        | j                          y )Nr   r   FrW   )r4   r5   r?  r   register_bufferr=   rz   rA  r.   r   rZ   r   lm_headr  r   s     r)   r5   z$MvpForConditionalGeneration.__init__  s     f%
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r+   Nnew_num_tokenspad_to_multiple_ofmean_resizingrf   c                 L    t         |   |||      }| j                  |       |S r   )r4   resize_token_embeddings_resize_final_logits_bias)r6   rc  rd  re  new_embeddingsr7   s        r)   rg  z3MvpForConditionalGeneration.resize_token_embeddings  s.     8I[]jk&&~6r+   c                 6   | j                   j                  d   }||k  r| j                   d d d |f   }nSt        j                  d||z
  f| j                   j                        }t        j
                  | j                   |gd      }| j                  d|       y )Nr"   r   r   rh   r   )r   r$   r=   rz   r<   ry   ra  )r6   rc  old_num_tokensnew_bias
extra_biass        r)   rh  z5MvpForConditionalGeneration._resize_final_logits_bias  s    //55b9^+--a..@AHa.)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r+   c                 n    | j                   j                          | j                  j                  d       y r)  r   rM  rb  rL  rE  s    r)   rM  z2MvpForConditionalGeneration.set_lightweight_tuning'  $    

))+##E*r+   r   rb   rN  rO  rP  ra   r  rQ  labelsr   rd   r  r  re   c                    ||n| j                   j                  }|	R|
rt        j                  d       d}
|7|5t	        |	| j                   j
                  | j                   j                        }| j                  |||||||||
||||      }| j                  |d         | j                  z   }d}|	Ft               } ||j                  d| j                   j                        |	j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  	      S )	a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rb   rN  rP  rO  ra   r  rQ  r   rd   r  r  re   r   r"   r   	losslogitsra   rS  rT  r4  rU  r   rV  )r   r  r5  warningr*   r   r    r   rb  r   r   rv   r  r   ra   rS  rT  r4  rU  r   rV  )r6   r   rb   rN  rO  rP  ra   r  rQ  rq  r   rd   r  r  re   r  r   	lm_logitsmasked_lm_lossloss_fctoutputs                        r)   rC   z#MvpForConditionalGeneration.forward+  s   T &1%<k$++B]B]klI (-B-J$6DKK44dkk6X6X%! **)/+#9+'"7/!5#)  
 LL,t/E/EE	')H%innR9O9O&PRXR]R]^`RabN\GABK/F3A3M^%.YSYY#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r+   c                 l    t        || j                  j                  | j                  j                        S r   )r*   r   r   r    )r6   rq  s     r)   %prepare_decoder_input_ids_from_labelszAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels  s%    !&$++*B*BDKKDfDfggr+   )NT)NNNNNNNNNNNNNN)rF   rG   rH   r[  r   r5   rJ   r   r   r   rg  rh  rM  r   r=   r*  rK   r\  r   r
   r   r   rC   r|  rL   rM   s   @r)   r   r     s    	/y  ae!7:TzY]	< < <+  .2.259:>:>(,26:>*.!%)-,0#'.2x
##d*x
 t+x
 !++d2	x

 !& 0 04 7x
 e//047x
 x
 ((4/x
  %0047x
   4'x
 $;x
  $;x
 #Tkx
 D[x
 t+x
" 
	 #x
 x
thELL hr+   r   z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                   ~    e Zd Zdef fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
ej                     dz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  deez  fd       Z xZS )MvpForSequenceClassificationr   c                     t        |   |fi | t        |      | _        t	        |j
                  |j
                  |j                  |j                        | _        | j                          y r   )
r4   r5   r?  r   r   r   
num_labelsclassifier_dropoutclassification_headr  )r6   r   r  r7   s      r)   r5   z%MvpForSequenceClassification.__init__  sZ    *6*f%
#8NNNN%%	$
  	r+   c                 n    | j                   j                          | j                  j                  d       y r)  )r   rM  r  rL  rE  s    r)   rM  z3MvpForSequenceClassification.set_lightweight_tuning  s&    

))+  //6r+   Nr   rb   rN  rO  rP  r  rQ  rq  r   rd   r  r  rf   c                    ||n| j                   j                  }|d}	|$|"t        d| j                  j                         | j                  ||||||||	|
||      }|d   }|j                  | j                   j                        j                  |j                        }t        t        j                  |j                  d            j                         dk(  d       ||ddf   j                  |j!                  d      d|j!                  d            dddddf   }| j#                  |      }d}|| j                   j$                  | j                   j&                  dk(  rd	| j                   _        nv| j                   j&                  dkD  rL|j(                  t        j*                  k(  s|j(                  t        j,                  k(  rd
| j                   _        nd| j                   _        | j                   j$                  d	k(  rSt/               }| j                   j&                  dk(  r& ||j1                         |j1                               }n |||      }n| j                   j$                  d
k(  rGt3               } ||j                  d| j                   j&                        |j                  d            }n,| j                   j$                  dk(  rt5               } |||      }|s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                  |j>                  |j@                  |jB                  |jD                  	      S )a
  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
        NFz8Passing input embeddings is currently not supported for 
rb   rN  rO  rP  r  rQ  r   rd   r  r  r   r   z7All examples must have the same number of <eos> tokens.r"   
regressionsingle_label_classificationmulti_label_classificationrs  )#r   r  NotImplementedErrorr7   rF   r   eqeos_token_idr{   r<   r   r=   unique_consecutivesumnumelrv   rm   r  problem_typer  r;   r?   rJ   r   squeezer   r   r   ra   rS  rT  r4  rU  r   rV  )r6   r   rb   rN  rO  rP  r  rQ  rq  r   rd   r  r  r  r   r_   eos_masksentence_representationru  rt  ry  rz  s                         r)   rC   z$MvpForSequenceClassification.forward  s	   J &1%<k$++B]B]I!:%J4>>KbKbJcd  **)/#9+'"7/!5#  
  
<< 8 89<<]=Q=QR$$X\\!_5;;=BE	
 #0!"<"A"A-BTBTUVBWY[]j]o]opr]s"tr1H#
 ))*AB{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r+   )NNNNNNNNNNNN)rF   rG   rH   r   r5   rM  r   r=   r*  rK   r\  r   r   r   r   rC   rL   rM   s   @r)   r~  r~    sN   y 7  .2.259:>:>26:>*.!%)-,0#'K
##d*K
 t+K
 !++d2	K

 !& 0 04 7K
 e//047K
 ((4/K
  %0047K
   4'K
 $;K
  $;K
 #TkK
 D[K
 
0	0K
 K
r+   r~  c                       e Zd Z fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de	ej                     dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  deez  fd       Z xZS )MvpForQuestionAnsweringc                     t         |   |       d|_        |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r1   )
r4   r5   r  r?  r   r   rZ   hidden_size
qa_outputsr  r   s     r)   r5   z MvpForQuestionAnswering.__init__S  s[      ++f%
))F$6$68I8IJ 	r+   c                 n    | j                   j                          | j                  j                  d       y r)  )r   rM  r  rL  rE  s    r)   rM  z.MvpForQuestionAnswering.set_lightweight_tuning_  s$    

))+&&u-r+   Nr   rb   rN  rO  rP  start_positionsend_positionsr  rQ  r   rd   r  r  rf   c                    ||n| j                   j                  }||d}
| j                  |||||||	|
|||      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   d	z  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  |j                  |j                  |j                   |j"                  |j$                  

      S )a
  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.

        Example:

        Fine-tuning a model for extrative question answering, and our model also supports generative question answering
        using `BartForConditionalGeneration`
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
        NFr  r   r   r"   rh   )ignore_indexr2   )
rt  start_logits
end_logitsra   rS  rT  r4  rU  r   rV  )r   r  r   r  r   r  
contiguousrW  rm   r   r   r   ra   rS  rT  r4  rU  r   rV  )r6   r   rb   rN  rO  rP  r  r  r  rQ  r   rd   r  r  r  r   sequence_outputru  r  r  
total_lossignored_indexry  
start_lossend_lossrz  s                             r)   rC   zMvpForQuestionAnswering.forwardc  s   V &1%<k$++B]B]&=+DI**)/#9+'"7/!5#  
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J F 0:/EZMF*Q6Q2%!#33")"?"?&99$55&-&G&G")"?"?&99
 	
r+   rY  )rF   rG   rH   r5   rM  r   r=   rK   r*  r\  r   r   r   r   rC   rL   rM   s   @r)   r  r  Q  s^   
.  *..259:>:>371526:>!%)-,0#'F
<<$&F
 t+F
 !++d2	F

 !& 0 04 7F
 e//047F
 ))D0F
 ''$.F
 ((4/F
  %0047F
 $;F
  $;F
 #TkF
 D[F
  
4	4!F
 F
r+   r  c                   (     e Zd ZdZ fdZd Z xZS )MvpDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                 d    t         |   |       t        |      | _        | j	                          y r   )r4   r5   r,  rC  r  r   s     r)   r5   zMvpDecoderWrapper.__init__  s&     !&)r+   c                 &     | j                   |i |S r   )rC  )r6   argsr  s      r)   rC   zMvpDecoderWrapper.forward  s    t||T,V,,r+   )rF   rG   rH   rI   r5   rC   rL   rM   s   @r)   r  r    s    

-r+   r  c                       e Zd ZddiZ fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dedz  dedz  dedz  de	j                  dz  dee	j                  z  deez  fd       Z xZS )MvpForCausalLMr_  z!model.decoder.embed_tokens.weightc                     d|_         d|_        t        |   |       t	        |      | _        t        j                  |j                  |j                  d      | _
        | j                          y )NTFrW   )rS   r7  r4   r5   r  r   r   rZ   r  r  rb  r  r   s     r)   r5   zMvpForCausalLM.__init__   sX     $)! &v.
yy!3!3V5F5FUS 	r+   c                 B    | j                   j                  j                  S r   r   rC  r   rE  s    r)   rF  z#MvpForCausalLM.get_input_embeddings  s    zz!!...r+   c                 :    || j                   j                  _        y r   r  rH  s     r)   rJ  z#MvpForCausalLM.set_input_embeddings  s    */

'r+   c                 n    | j                   j                          | j                  j                  d       y r)  ro  rE  s    r)   rM  z%MvpForCausalLM.set_lightweight_tuning  rp  r+   Nr   rb   r   r   ra   r  rq  r   rd   r  r  re   logits_to_keeprf   c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }| j                  j                  ||||||||	|
|
      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|Ft               } ||j                  d| j                   j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                   |j"                        S )ap  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
        ```N)
r   rb   r   r   ra   r  r   rd   r  r  r   r"   r   )rt  ru  ra   r_   r  r4  )r   rd   r  r  r   rC  rn   rJ   slicerb  r   rv   r  r   ra   r_   r  r4  )r6   r   rb   r   r   ra   r  rq  r   rd   r  r  re   r  r  r   r_   slice_indicesru  rt  ry  rz  s                         r)   rC   zMvpForCausalLM.forward  sl   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **$$)"7#9+'/!5# % 
  
8B>SV8W~ot4]kmA}a,?@A')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r+   )NNNNNNNNNNNNr   )rF   rG   rH   r[  r5   rF  rJ  rM  r   r=   r*  rK   r   r
   r   rJ   r   r   rC   rL   rM   s   @r)   r  r    sf   *,OP	/0+  .2.2:>;?(,26*.!%)-,0#'.2-.P
##d*P
 t+P
  %0047	P

 !& 1 1D 8P
 P
 ((4/P
   4'P
 $;P
  $;P
 #TkP
 D[P
 t+P
 ell*P
  
2	2!P
 P
r+   r  )r  r   r  r~  r?  r   )@rI   r  r=   r   torch.nnr   r   r    r   r   activationsr	   cache_utilsr
   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   configuration_mvpr   
get_loggerrF   r5  rK   rJ   r*   r   r-   ModulerO   r   r   r   r   r   r   r,  r?  r   r~  r  r  r  __all__r  r+   r)   <module>r     s       A A & ! C C ) J 9   . D D ( 
		H	%%,, c [^ ";BLL ;6S2299 S2l<+0 <+~j0 j\BII 0		 2   *`
# `
Fh
# h
V N
! N
 N
b 
^h"4o ^h
^hB ^
#5 ^
^
B X
0 X
 X
x-* -i
' i
Xr+   