
    #i                        d dl mZ d dlZd dlmZ d dlmZmZmZ ddlm	Z
 ddlmZmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3  e,jh                  e5      Z6 G d dejn                        Z8	 	 dJdejn                  dejr                  dejr                  dejr                  dejr                  dz  de:dz  de:de&e*   fdZ; G d d ejn                        Z< G d! d"ejn                        Z= G d# d$ejn                        Z> G d% d&ejn                        Z? G d' d(ejn                        Z@ G d) d*ejn                        ZA G d+ d,e      ZBe+ G d- d.e$             ZC G d/ d0ejn                        ZD G d1 d2ejn                        ZE e+d34       G d5 d6eC             ZF e+d74       G d8 d9eCe             ZGe+ G d: d;eC             ZH G d< d=ejn                        ZI e+d>4       G d? d@eC             ZJe+ G dA dBeC             ZKe+ G dC dDeC             ZL G dE dFejn                        ZMe+ G dG dHeC             ZNg dIZOy)K    )CallableN)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNgelu)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )RobertaConfigc                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
ed        Zedd       Z xZS )RobertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 T   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       | j                  dt!        j(                  | j*                  j-                         t         j.                        d       |j                  | _        t        j                  |j$                  |j
                  | j0                        | _        y )	N)padding_idxepsposition_idsr$   F)
persistenttoken_type_ids)dtype)super__init__nn	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangemax_position_embeddingsexpandzerosr,   sizelongr)   position_embeddingsselfconfig	__class__s     h/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/roberta/modeling_roberta.pyr3   zRobertaEmbeddings.__init__;   s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
 "..#%<<**F,>,>DL\L\$
     N	input_idsr0   r,   inputs_embedspast_key_values_lengthreturnc                    |<|| j                  || j                  |      }n| j                  || j                        }||j                         }n|j                         d d }|\  }}|t	        | d      rT| j
                  j                  |j                  d   d      }	t        j                  |	d|      }	|	j                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j!                  |      }||z   }| j#                  |      }| j%                  |      }|S )Nr.   r0   r   r$   )dimindexr1   device)"create_position_ids_from_input_idsr)   &create_position_ids_from_inputs_embedsrG   hasattrr0   rE   shaperB   gatherrF   rH   r,   rX   r9   r;   rI   r<   r@   )rK   rP   r0   r,   rQ   rR   input_shape
batch_size
seq_lengthbuffered_token_type_idsr;   
embeddingsrI   s                rN   forwardzRobertaEmbeddings.forwardO   sn    $#FFt//1G   $JJ=Z^ZjZjk #..*K',,.s3K!,
J
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
rO   c                     | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr.   r$   rW   r   )rG   rB   rC   rH   rX   	unsqueezerE   )rQ   r)   r^   sequence_lengthr,   s        rN   rZ   z8RobertaEmbeddings.create_position_ids_from_inputs_embeds   sp     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<<rO   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r$   rU   )neintrB   cumsumtype_asrH   )rP   r)   rR   maskincremental_indicess        rN   rY   z4RobertaEmbeddings.create_position_ids_from_input_ids   sW     ||K(,,.$||Da8@@FI__cgg"'')K77rO   )NNNNr   )r   )__name__
__module____qualname____doc__r3   rB   
LongTensorFloatTensorrj   Tensorrc   staticmethodrZ   rY   __classcell__rM   s   @rN   r'   r'   8   s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8rO   r'   modulequerykeyvalueattention_maskscalingr@   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr.            r   rh   )ptrainingr$   )
rG   rB   matmul	transposer4   
functionalsoftmaxr@   r   
contiguous)
ry   rz   r{   r|   r}   r~   r@   r   attn_weightsattn_outputs
             rN   eager_attention_forwardr      s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$rO   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dedz  dej
                  dz  dee	   de
ej
                     fd	Z xZS )RobertaSelfAttentionNc                 @   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        |j&                  | _        || _        || _        y Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   )r2   r3   r7   num_attention_headsr[   
ValueErrorrL   rj   attention_head_sizeall_head_sizer~   r4   Linearrz   r{   r|   r>   attention_probs_dropout_probr@   
is_decoder	is_causal	layer_idxrK   rL   r   r   rM   s       rN   r3   zRobertaSelfAttention.__init__   sP    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""rO   hidden_statesr}   past_key_valuescache_positionr   rS   c                    |j                   d d }g |d| j                  } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }	 | j                  |      j                  | j	                  dd      }
|A|}t        |t              r|j                  }|j                  |	|
| j                  d|i      \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  sdn| j$                  j&                  | j(                  d|\  }} |j*                  g |d j-                         }||fS )Nr.   r$   r   r           r@   r~   )r\   r   rz   viewr   r{   r|   
isinstancer   self_attention_cacheupdater   r   get_interfacerL   _attn_implementationr   r   r@   r   r~   reshaper   )rK   r   r}   r   r   r   r^   hidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer   r   s                  rN   rc   zRobertaSelfAttention.forward   s    $))#2.CCbC$*B*BC 5djj/44lCMMaQRS0DHH]+00,?II!QO	4djj/44lCMMaQRS&&5#/+>?*9*N*N' &=%C%C!>2	&"I{ )@(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ *k));;;;FFHL((rO   FNNNNro   rp   rq   r3   rB   ru   rt   r   r   r   tuplerc   rw   rx   s   @rN   r   r      s}    #6 48(,.2-)||-) ))D0-) 	-)
 t+-) +,-) 
u||	-)rO   r   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dej                  dz  dedz  dee	   de
ej
                     fd	Z xZS )RobertaCrossAttentionNc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        || _        || _        y r   )r2   r3   r7   r   r[   r   rL   rj   r   r   r~   r4   r   rz   r{   r|   r>   r   r@   r   r   r   s       rN   r3   zRobertaCrossAttention.__init__  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""rO   r   encoder_hidden_statesr}   r   r   rS   c                 V   |j                   d d \  }}|j                   d   }||d| j                  f}	||d| j                  f}
 | j                  |      j                  |	 j	                  dd      }|%|j
                  j                  | j                        nd}|]|r[|j                  j                  | j                     j                  }|j                  j                  | j                     j                  }n | j                  |      j                  |
 j	                  dd      } | j                  |      j                  |
 j	                  dd      }|C|j                  j                  ||| j                        \  }}d|j
                  | j                  <   t        j                   | j"                  j$                  t&              } || ||||f| j(                  sdn| j*                  j,                  | j.                  d|\  }}|j1                  ||d      j3                         }||fS )Nr.   r$   r   FTr   r   )r\   r   rz   r   r   
is_updatedgetr   cross_attention_cachelayerskeysvaluesr{   r|   r   r   r   rL   r   r   r   r@   r   r~   r   r   )rK   r   r   r}   r   r   bsztgt_lensrc_lenq_input_shapekv_input_shaper   r   r   r   r   r   r   s                     rN   rc   zRobertaCrossAttention.forward  s    %**3B/W'--a0gr4+C+CDwD,D,DE 5djj/44mDNNqRSTGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]K<!67<<nMWWXY[\]I@$**%:;@@.Q[[\]_`aK*)8)N)N)U)U{DNN*&	; >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ "))#w;FFHL((rO   r   r   )ro   rp   rq   r3   rB   ru   rt   r   r   r   r   rc   rw   rx   s   @rN   r   r     s    #4 ;?376:2)||2)  %00472) ))D0	2)
 -t32) +,2) 
u||	2)rO   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )RobertaSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr*   )r2   r3   r4   r   r7   denser<   r=   r>   r?   r@   rJ   s     rN   r3   zRobertaSelfOutput.__init__U  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rO   r   input_tensorrS   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S Nr   r@   r<   rK   r   r   s      rN   rc   zRobertaSelfOutput.forward[  7    

=1]3}|'CDrO   ro   rp   rq   r3   rB   ru   rc   rw   rx   s   @rN   r   r   T  1    >U\\  RWR^R^ rO   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dej
                  dz  d	ee	   d
e
ej
                     fdZ xZS )RobertaAttentionNc                     t         |           || _        |rt        nt        } ||||      | _        t        |      | _        y )Nr   r   )r2   r3   is_cross_attentionr   r   rK   r   output)rK   rL   r   r   r   attention_classrM   s         rN   r3   zRobertaAttention.__init__c  s=    "43E/K_#Fi9U	'/rO   r   r}   r   encoder_attention_maskr   r   r   rS   c                     | j                   s|n|} | j                  |f||||d|\  }}	| j                  ||      }||	fS )N)r   r}   r   r   )r   rK   r   )
rK   r   r}   r   r   r   r   r   attention_outputr   s
             rN   rc   zRobertaAttention.forwardj  sg     04/F/FLb)2*
"7)+)*
 *
&,  ;;'7G--rO   )FNFNNNNNr   rx   s   @rN   r   r   b  s    0 48:>;?(,.2.||. ))D0.  %0047	.
 !& 1 1D 8. . t+. +,. 
u||	.rO   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RobertaIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r2   r3   r4   r   r7   intermediate_sizer   r   
hidden_actstrr	   intermediate_act_fnrJ   s     rN   r3   zRobertaIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rO   r   rS   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )rK   r   s     rN   rc   zRobertaIntermediate.forward  s&    

=100?rO   r   rx   s   @rN   r   r     s#    9U\\ ell rO   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )RobertaOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r2   r3   r4   r   r   r7   r   r<   r=   r>   r?   r@   rJ   s     rN   r3   zRobertaOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rO   r   r   rS   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rN   rc   zRobertaOutput.forward  r   rO   r   rx   s   @rN   r   r     r   rO   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dej
                  dz  d	ee	   d
e
ej
                     fdZd Z xZS )RobertaLayerNc                    t         |           |j                  | _        d| _        t	        ||j
                  |      | _        |j
                  | _        |j                  | _        | j                  r.| j
                  st        |  d      t	        |d|d      | _	        t        |      | _        t        |      | _        y )Nr$   r   z> should be used as a decoder model if cross attention is addedFT)r   r   r   )r2   r3   chunk_size_feed_forwardseq_len_dimr   r   	attentionadd_cross_attentionr   crossattentionr   intermediater   r   )rK   rL   r   rM   s      rN   r3   zRobertaLayer.__init__  s    '-'E'E$)&F<M<MYbc ++#)#=#= ##?? D6)g!hii"2##'	#D 07#F+rO   r   r}   r   r   r   r   r   rS   c                 "    | j                   ||f||d|\  }}	|}
| j                  r:|8t        | d      st        d|  d       | j                  |d ||fd|i|\  }}	|}
t        | j                  | j                  | j                  |
      }|S )N)r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   r   r[   r   r   r   feed_forward_chunkr   r   )rK   r   r}   r   r   r   r   r   self_attention_output_r   cross_attention_outputlayer_outputs                rN   rc   zRobertaLayer.forward  s     $24>>$
 ,)	$

 $
 q 1??4@4!12 =dV DD D 
 )<(;(;%%&	)
 !0) )%"A  60##T%A%A4CSCSUe
 rO   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )rK   r   intermediate_outputr   s       rN   r   zRobertaLayer.feed_forward_chunk  s,    "//0@A{{#68HIrO   r   r   )ro   rp   rq   r3   rB   ru   rt   r   r   r   r   rc   r   rw   rx   s   @rN   r   r     s    ,, 48:>;?(,.2'||' ))D0'  %0047	'
 !& 1 1D 8' ' t+' +,' 
u||	'RrO   r   c                   n     e Zd ZeZdZdZdZdZdZ	dZ
eeedZ ej                           fd       Z xZS )RobertaPreTrainedModelrobertaT)r   
attentionscross_attentionsc                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              ryt	        j                  |j                  t        j                  |j                  j                  d         j                  d             t	        j
                  |j                         yy)zInitialize the weightsr.   r-   N)r2   _init_weightsr   RobertaLMHeadinitzeros_biasr'   copy_r,   rB   rC   r\   rE   r0   )rK   ry   rM   s     rN   r   z$RobertaPreTrainedModel._init_weights  s     	f%fm,KK$ 12JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 3rO   )ro   rp   rq   r%   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   r   _can_record_outputsrB   no_gradr   rw   rx   s   @rN   r   r     sX     L!&*#N"&%*1 U]]_/ /rO   r   c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	ej
                  dz  d
e	e
   deej
                     ez  fdZ xZS )RobertaEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w )N)r   )	r2   r3   rL   r4   
ModuleListrangenum_hidden_layersr   layer)rK   rL   irM   s      rN   r3   zRobertaEncoder.__init__  sF    ]]uU[UmUmOn#o!L1$E#op
#os   ANr   r}   r   r   r   	use_cacher   r   rS   c                     t        | j                        D ]  \  }	}
 |
|||f|||d|} t        ||r|      S d       S )N)r   r   r   )last_hidden_stater   )	enumerater  r   )rK   r   r}   r   r   r   r  r   r   r  layer_modules              rN   rc   zRobertaEncoder.forward  sq      )4 		OA|(% (> /- M		 9+/8O
 	
>B
 	
rO   NNNNNN)ro   rp   rq   r3   rB   ru   rt   r   boolr   r   r   r   rc   rw   rx   s   @rN   r  r    s    q 48:>;?(,!%.2
||
 ))D0
  %0047	

 !& 1 1D 8
 
 $;
 t+
 +,
 
u||	H	H
rO   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RobertaPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r2   r3   r4   r   r7   r   Tanh
activationrJ   s     rN   r3   zRobertaPooler.__init__  s9    YYv1163E3EF
'')rO   r   rS   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   r!  )rK   r   first_token_tensorpooled_outputs       rN   rc   zRobertaPooler.forward#  s6     +1a40

#566rO   r   rx   s   @rN   r  r    s#    $
U\\ ell rO   r  a
  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    )custom_introc                       e Zd ZddgZd fd	Zd Zd Zeee		 	 	 	 	 	 	 	 	 	 dde
j                  dz  de
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  dedz  de
j                  dz  dee   dee
j                     ez  fd                     Zd Z xZS )RobertaModelr'   r   c                     t         |   |       || _        d| _        t	        |      | _        t        |      | _        |rt        |      nd| _	        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        FN)r2   r3   rL   gradient_checkpointingr'   rb   r  encoderr  pooler	post_init)rK   rL   add_pooling_layerrM   s      rN   r3   zRobertaModel.__init__;  sU    
 	 &+#+F3%f-/@mF+d 	rO   c                 .    | j                   j                  S r   rb   r9   rK   s    rN   get_input_embeddingsz!RobertaModel.get_input_embeddingsL  s    ...rO   c                 &    || j                   _        y r   r0  )rK   r|   s     rN   set_input_embeddingsz!RobertaModel.set_input_embeddingsO  s    */'rO   NrP   r}   r0   r,   rQ   r   r   r   r  r   r   rS   c                 L   | j                   j                  r|	|	n| j                   j                  }	nd}	|	rd|b|| j                   j                  r4t	        t        | j                         t        | j                               nt        | j                         }|d u |d uz  rt        d      ||j                  }|j                  d   }n|j                  }|j                  d   }||j                         nd}|
t        j                  |||z   |      }
| j                  |||||      }| j                  |||||
|      \  }} | j                  |f|||||	|
|d	|}|j                  }| j                   | j!                  |      nd }t#        |||j$                  
      S )NF)rL   z:You must specify exactly one of input_ids or inputs_embedsr$   r   )rX   )rP   r,   r0   rQ   rR   )r}   r   embedding_outputr   r   r   )r}   r   r   r   r  r   r,   )r  pooler_outputr   )rL   r   r  is_encoder_decoderr   r   r   rX   r\   get_seq_lengthrB   rC   rb   _create_attention_masksr+  r  r,  r   r   )rK   rP   r}   r0   r,   rQ   r   r   r   r  r   r   rX   r`   rR   r6  encoder_outputssequence_outputr%  s                      rN   rc   zRobertaModel.forwardR  s   " ;;!!%.%:	@U@UII0 )48V8V $L$DlZ^ZeZeFfg!5  -t";<YZZ %%F"+J"))F&,,Q/JETE`!?!?!Afg!"\\*@BX[eBentuN??%)'#9 + 
 261M1M)#9-"7)+ 2N 2
.. '$,,

)"7#9+)%

 

 *;;8<8OO4UY;-'+;;
 	
rO   c                     | j                   j                  rt        | j                   ||||      }nt        | j                   ||      }|t        | j                   |||      }||fS )N)rL   rQ   r}   r   r   )rL   rQ   r}   )rL   rQ   r}   r   )rL   r   r   r   )rK   r}   r   r6  r   r   r   s          rN   r:  z$RobertaModel._create_attention_masks  sx     ;;!!/{{.-- /N 7{{.-N "-%>{{.5&;	&" 555rO   )T)
NNNNNNNNNN)ro   rp   rq   _no_split_modulesr3   r2  r4  r"   r#   r   rB   ru   r   r  r   r   r   r   rc   r:  rw   rx   s   @rN   r(  r(  ,  sR    -n="/0   *..2.2,0-1596:(,!%.2K
<<$&K
 t+K
 t+	K

 llT)K
 ||d*K
  %||d2K
 !&t 3K
 K
 $;K
 t+K
 +,K
 
u||	K	KK
    K
Z 6rO   r(  zS
    RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.
    c                        e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  deee	j                        dz  dedz  de	j                  dz  dee	j                  z  dee   dee	j                     ez  fd              Z xZS )RobertaForCausalLM)roberta.embeddings.word_embeddings.weightlm_head.biaszlm_head.decoder.weightzlm_head.decoder.biasc                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzOIf you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`Fr.  
r2   r3   r   loggerwarningr(  r   r   lm_headr-  rJ   s     rN   r3   zRobertaForCausalLM.__init__  sL       NNlm#FeD$V, 	rO   c                 .    | j                   j                  S r   rI  decoderr1  s    rN   get_output_embeddingsz(RobertaForCausalLM.get_output_embeddings      ||###rO   c                 &    || j                   _        y r   rK  rK   new_embeddingss     rN   set_output_embeddingsz(RobertaForCausalLM.set_output_embeddings      -rO   NrP   r}   r0   r,   rQ   r   r   labelsr   r  r   logits_to_keepr   rS   c                    |d}
 | j                   |f|||||||	|
|dd
|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )am  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
        >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
        >>> config.is_decoder = True
        >>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NFT)
r}   r0   r,   rQ   r   r   r   r  r   return_dict)logitsrT  r6   )lossrX  r   r   r   r    )r   r  r   rj   slicerI  loss_functionrL   r6   r   r   r   r   r   )rK   rP   r}   r0   r,   rQ   r   r   rT  r   r  r   rU  r   outputsr   slice_indicesrX  rY  s                      rN   rc   zRobertaForCausalLM.forward  s    b I@LA
))%'"7#9+)A
 A
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD0#33!//))$55
 	
rO   )NNNNNNNNNNNr   )ro   rp   rq   _tied_weights_keysr3   rM  rR  r!   r   rB   rs   rt   r   r  ru   rj   r   r   r   rc   rw   rx   s   @rN   r@  r@    s    #N .

$.  .237260426:>;?*.BF!%.2-.Q
##d*Q
 ))D0Q
 ((4/	Q

 &&-Q
 ((4/Q
  %0047Q
 !& 1 1D 8Q
   4'Q
 uU%6%6784?Q
 $;Q
 t+Q
 ell*Q
 +,Q
 
u||	@	@Q
  Q
rO   r@  c                       e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Z xZS )RobertaForMaskedLMrA  rB  rC  c                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NznIf you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.FrE  rF  rJ   s     rN   r3   zRobertaForMaskedLM.__init__?  sR     NN1
 $FeD$V, 	rO   c                 .    | j                   j                  S r   rK  r1  s    rN   rM  z(RobertaForMaskedLM.get_output_embeddingsN  rN  rO   c                 &    || j                   _        y r   rK  rP  s     rN   rR  z(RobertaForMaskedLM.set_output_embeddingsQ  rS  rO   NrP   r}   r0   r,   rQ   r   r   rT  r   rS   c	                 t    | j                   |f||||||dd|	}
|
d   }| j                  |      }d}|a|j                  |j                        }t	               } ||j                  d| j                  j                        |j                  d            }t        |||
j                  |
j                        S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        T)r}   r0   r,   rQ   r   r   rW  r   Nr.   rY  rX  r   r   )r   rI  torX   r   r   rL   r6   r   r   r   )rK   rP   r}   r0   r,   rQ   r   r   rT  r   r]  r<  prediction_scoresmasked_lm_lossloss_fcts                  rN   rc   zRobertaForMaskedLM.forwardT  s    : $,,

))%'"7#9

 

 "!* LL9YY0778F')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
rO   )NNNNNNNN)ro   rp   rq   r_  r3   rM  rR  r!   r   rB   rs   rt   r   r   r   ru   r   rc   rw   rx   s   @rN   ra  ra  8  s'    #N .
$.  .237260426:>;?*.5
##d*5
 ))D05
 ((4/	5

 &&-5
 ((4/5
  %00475
 !& 1 1D 85
   4'5
 +,5
 
u||	~	-5
  5
rO   ra  c                   (     e Zd ZdZ fdZd Z xZS )r   z*Roberta Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        y r   )r2   r3   r4   r   r7   r   r<   r=   
layer_normr6   rL  	ParameterrB   rF   r  rJ   s     rN   r3   zRobertaLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	rO   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r
   rm  rL  rK   featuresr   xs       rN   rc   zRobertaLMHead.forward  s;    JJx GOOA LLOrO   ro   rp   rq   rr   r3   rc   rw   rx   s   @rN   r   r     s    4ArO   r   z
    RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eej                     ez  fd              Z xZS ) RobertaForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          y NFrE  )	r2   r3   
num_labelsrL   r(  r   RobertaClassificationHead
classifierr-  rJ   s     rN   r3   z)RobertaForSequenceClassification.__init__  sJ      ++#FeD3F; 	rO   NrP   r}   r0   r,   rQ   rT  r   rS   c           	          | j                   |f||||dd|}|d   }	| j                  |	      }
d}||j                  |
j                        }| j                  j
                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j
                  dk(  rIt               }| j                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                  j
                  dk(  r=t               } ||
j                  d	| j                        |j                  d	            }n,| j                  j
                  dk(  rt               } ||
|      }t!        ||
|j"                  |j$                  
      S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Tr}   r0   r,   rQ   rW  r   Nr$   
regressionsingle_label_classificationmulti_label_classificationr.   rf  )r   rz  rg  rX   rL   problem_typerx  r1   rB   rH   rj   r   squeezer   r   r   r   r   r   rK   rP   r}   r0   r,   rQ   rT  r   r]  r<  rX  rY  rj  s                rN   rc   z(RobertaForSequenceClassification.forward  s   6 $,,
))%'
 
 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rO   r  )ro   rp   rq   r3   r!   r   rB   rs   rt   r   r   r   ru   r   rc   rw   rx   s   @rN   ru  ru    s    	  .237260426*.C
##d*C
 ))D0C
 ((4/	C

 &&-C
 ((4/C
   4'C
 +,C
 
u||	7	7C
  C
rO   ru  c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eej                     ez  fd              Z xZS )RobertaForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr$   )r2   r3   r(  r   r4   r>   r?   r@   r   r7   rz  r-  rJ   s     rN   r3   z!RobertaForMultipleChoice.__init__   sV     #F+zz&"<"<=))F$6$6: 	rO   NrP   r0   r}   rT  r,   rQ   r   rS   c           	      "   ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}	|!|j                  d|j                  d            nd}
|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |	f|
|||dd|}|d   }| j	                  |      }| j                  |      }|j                  d|      }d}|.|j                  |j                        }t               } |||      }t        |||j                  |j                        S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr$   r.   T)r,   r0   r}   rQ   rW  rf  )r\   r   rG   r   r@   rz  rg  rX   r   r   r   r   )rK   rP   r0   r}   rT  r,   rQ   r   num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr]  r%  rX  reshaped_logitsrY  rj  s                       rN   rc   z RobertaForMultipleChoice.forward
  s   V -6,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 $,,
*..,
 
  
]3/ ++b+6YY556F')HOV4D("!//))	
 	
rO   r  )ro   rp   rq   r3   r!   r   rB   rs   rt   r   r   r   ru   r   rc   rw   rx   s   @rN   r  r    s      .22637*.0426P
##d*P
 ((4/P
 ))D0	P

   4'P
 &&-P
 ((4/P
 +,P
 
u||	8	8P
  P
rO   r  c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eej                     ez  fd              Z xZS )RobertaForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y rw  )r2   r3   rx  r(  r   classifier_dropoutr?   r4   r>   r@   r   r7   rz  r-  rK   rL   r  rM   s      rN   r3   z&RobertaForTokenClassification.__init__a  s      ++#FeD)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rO   NrP   r}   r0   r,   rQ   rT  r   rS   c           	      ~    | j                   |f||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|W|j                  |
j                        }t               } ||
j                  d| j                        |j                  d            }t        ||
|j                  |j                        S )a-  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Tr|  r   Nr.   rf  )r   r@   rz  rg  rX   r   r   rx  r   r   r   r  s                rN   rc   z%RobertaForTokenClassification.forwardo  s    2 $,,
))%'
 
 "!*,,71YYv}}-F')HFKKDOO<fkk"oND$!//))	
 	
rO   r  )ro   rp   rq   r3   r!   r   rB   rs   rt   r   r   r   ru   r   rc   rw   rx   s   @rN   r  r  _  s      .237260426*.2
##d*2
 ))D02
 ((4/	2

 &&-2
 ((4/2
   4'2
 +,2
 
u||	4	42
  2
rO   r  c                   (     e Zd ZdZ fdZd Z xZS )ry  z-Head for sentence-level classification tasks.c                 Z   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        y r   )r2   r3   r4   r   r7   r   r  r?   r>   r@   rx  out_projr  s      rN   r3   z"RobertaClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrO   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r#  )r@   r   rB   tanhr  rp  s       rN   rc   z!RobertaClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rO   rs  rx   s   @rN   ry  ry    s    7IrO   ry  c                   J    e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	e
   deej                     ez  fd              Z xZS )RobertaForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y rw  )
r2   r3   rx  r(  r   r4   r   r7   
qa_outputsr-  rJ   s     rN   r3   z$RobertaForQuestionAnswering.__init__  sU      ++#FeD))F$6$68I8IJ 	rO   NrP   r}   r0   r,   rQ   start_positionsend_positionsr   rS   c           	          | j                   |f||||dd|}	|	d   }
| j                  |
      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   d	z  }t        ||||	j                  |	j                  
      S )a[  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        Tr|  r   r$   r.   rh   N)ignore_indexr   )rY  start_logits
end_logitsr   r   )r   r  splitr  r   lenrG   clampr   r   r   r   )rK   rP   r}   r0   r,   rQ   r  r  r   r]  r<  rX  r  r  
total_lossignored_indexrj  
start_lossend_losss                      rN   rc   z#RobertaForQuestionAnswering.forward  s   0 $,,
))%'
 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
rO   )NNNNNNN)ro   rp   rq   r3   r!   r   rB   rs   rt   r   r   r   ru   r   rc   rw   rx   s   @rN   r  r    s      .2372604263715>
##d*>
 ))D0>
 ((4/	>

 &&->
 ((4/>
 ))D0>
 ''$.>
 +,>
 
u||	;	;>
  >
rO   r  )r@  ra  r  r  ru  r  r(  r   )Nr   )Pcollections.abcr   rB   torch.nnr4   r   r   r    r   r  activationsr	   r
   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r    utils.genericr!   r"   utils.output_capturingr#   configuration_robertar%   
get_loggerro   rG  Moduler'   ru   floatr   r   r   r   r   r   r   r   r   r  r  r(  r@  ra  r   ru  r  r  ry  r  __all__rZ  rO   rN   <module>r     s  , %   A A & ' C C ) J 9	 	 	 G & 6 @ @ I 5 0 
		H	%g8		 g8` !%II%<<% 
% <<	%
 LL4'% T\% % '(%8F)299 F)RJ)BII J)Z		 .ryy .>")) BII @- @F /_ / /2
RYY 
DBII  	J6) J6J6Z 
k
/ k

k
\ R
/ R
 R
jBII , Q
'= Q
Q
h ]
5 ]
 ]
@ C
$: C
 C
L		 , K
"8 K
 K
\	rO   