
    #i{                        d Z ddlZddlmZ ddlmZmZmZ ddlmZ	 ddl
mZ ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  ejP                  e)      Z* G d de"      Z+ G d de%      Z, G d de!      Z- G d de#      Z.e G d de             Z/ G d de$      Z0 ed       G d d e/e             Z1e G d! d"e/             Z2 G d# d$ejf                        Z4 ed%       G d& d'e/             Z5e G d( d)e/             Z6e G d* d+e/             Z7 G d, d-ejf                        Z8e G d. d/e/             Z9g d0Z:y)1zPyTorch RoBERTa model.    N)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)gelu)GenerationMixin),BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )BertCrossAttentionBertEmbeddings	BertLayer	BertModelBertSelfAttention   )RobertaConfigc                        e Zd Z fdZ	 	 	 	 	 ddej
                  dz  dej
                  dz  dej
                  dz  dej                  dz  def
dZe	d	        Z
e	dd
       Z xZS )RobertaEmbeddingsc                     t         |   |       | `| `|j                  | _        t        j                  |j                  |j                  | j                        | _        y )N)padding_idx)	super__init__pad_token_idposition_embeddingsr"   nn	Embeddingmax_position_embeddingshidden_sizeselfconfig	__class__s     g/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/roberta/modular_roberta.pyr$   zRobertaEmbeddings.__init__-   sT     $!..#%<<**F,>,>DL\L\$
     N	input_idstoken_type_idsposition_idsinputs_embedspast_key_values_lengthc                    |<|| j                  || j                  |      }n| j                  || j                        }||j                         }n|j                         d d }|\  }}|t	        | d      rT| j
                  j                  |j                  d   d      }	t        j                  |	d|      }	|	j                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j!                  |      }||z   }| j#                  |      }| j%                  |      }|S )Nr2   r   r   )dimindexdtypedevice)"create_position_ids_from_input_idsr"   &create_position_ids_from_inputs_embedssizehasattrr2   expandshapetorchgatherzeroslongr3   r<   word_embeddingstoken_type_embeddingsr&   	LayerNormdropout)r,   r1   r2   r3   r4   r5   input_shape
batch_size
seq_lengthbuffered_token_type_idsrH   
embeddingsr&   s                r/   forwardzRobertaEmbeddings.forward8   sn    $#FFt//1G   $JJ=Z^ZjZjk #..*K',,.s3K!,
J
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
r0   c                     | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr7   r   r:   r   )r?   rC   arangerF   r<   	unsqueezerA   )r4   r"   rK   sequence_lengthr3   s        r/   r>   z8RobertaEmbeddings.create_position_ids_from_inputs_embedsh   sp     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<<r0   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   r8   )neintrC   cumsumtype_asrF   )r1   r"   r5   maskincremental_indicess        r/   r=   z4RobertaEmbeddings.create_position_ids_from_input_idsz   sW     ||K(,,.$||Da8@@FI__cgg"'')K77r0   )NNNNr   )r   )__name__
__module____qualname__r$   rC   
LongTensorFloatTensorrX   rP   staticmethodr>   r=   __classcell__r.   s   @r/   r    r    ,   s    	
 .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$.` = =" 8 8r0   r    c                       e Zd Zy)RobertaSelfAttentionNr]   r^   r_    r0   r/   rf   rf          r0   rf   c                       e Zd Zy)RobertaCrossAttentionNrg   rh   r0   r/   rk   rk      ri   r0   rk   c                       e Zd Zy)RobertaLayerNrg   rh   r0   r/   rm   rm      ri   r0   rm   c                   n     e Zd ZeZdZdZdZdZdZ	dZ
eeedZ ej                           fd       Z xZS )RobertaPreTrainedModelrobertaT)hidden_states
attentionscross_attentionsc                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              ryt	        j                  |j                  t        j                  |j                  j                  d         j                  d             t	        j
                  |j                         yy)zInitialize the weightsr7   )r   r7   N)r#   _init_weights
isinstanceRobertaLMHeadinitzeros_biasr    copy_r3   rC   rR   rB   rA   r2   )r,   moduler.   s     r/   ru   z$RobertaPreTrainedModel._init_weights   s     	f%fm,KK$ 12JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 3r0   )r]   r^   r_   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendrm   rf   rk   _can_record_outputsrC   no_gradru   rc   rd   s   @r/   ro   ro      sX     L!&*#N"&%*1 U]]_/ /r0   ro   c                         e Zd Zd fd	Z xZS )RobertaModelc                 &    t         |   | |       y N)r#   r$   )r,   r-   add_pooling_layerr.   s      r/   r$   zRobertaModel.__init__   s    v&r0   )T)r]   r^   r_   r$   rc   rd   s   @r/   r   r      s    ' 'r0   r   zS
    RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                        e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  deee	j                        dz  dedz  de	j                  dz  dee	j                  z  dee   dee	j                     ez  fd              Z xZS )RobertaForCausalLM)roberta.embeddings.word_embeddings.weightlm_head.biaszlm_head.decoder.weightzlm_head.decoder.biasc                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzOIf you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`Fr   
r#   r$   
is_decoderloggerwarningr   rp   rw   lm_head	post_initr+   s     r/   r$   zRobertaForCausalLM.__init__   sL       NNlm#FeD$V, 	r0   c                 .    | j                   j                  S r   r   decoderr,   s    r/   get_output_embeddingsz(RobertaForCausalLM.get_output_embeddings       ||###r0   c                 &    || j                   _        y r   r   r,   new_embeddingss     r/   set_output_embeddingsz(RobertaForCausalLM.set_output_embeddings       -r0   Nr1   attention_maskr2   r3   r4   encoder_hidden_statesencoder_attention_masklabelspast_key_values	use_cachecache_positionlogits_to_keepkwargsreturnc                    |d}
 | j                   |f|||||||	|
|dd
|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )am  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
        >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
        >>> config.is_decoder = True
        >>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NFT)
r   r2   r3   r4   r   r   r   r   r   return_dict)logitsr   
vocab_size)lossr   r   rq   rr   rs   rh   )rp   last_hidden_staterv   rX   slicer   loss_functionr-   r   r   r   rq   rr   rs   )r,   r1   r   r2   r3   r4   r   r   r   r   r   r   r   r   outputsrq   slice_indicesr   r   s                      r/   rP   zRobertaForCausalLM.forward   s    b I@LA
))%'"7#9+)A
 A
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD0#33!//))$55
 	
r0   )NNNNNNNNNNNr   )r]   r^   r_   _tied_weights_keysr$   r   r   r   r   rC   r`   ra   tupleboolTensorrX   r   r   r   rP   rc   rd   s   @r/   r   r      s    #N .

$.  .237260426:>;?*.BF!%.2-.Q
##d*Q
 ))D0Q
 ((4/	Q

 &&-Q
 ((4/Q
  %0047Q
 !& 1 1D 8Q
   4'Q
 uU%6%6784?Q
 $;Q
 t+Q
 ell*Q
 +,Q
 
u||	@	@Q
  Q
r0   r   c                       e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Z xZS )RobertaForMaskedLMr   r   r   c                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NznIf you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr   r   r+   s     r/   r$   zRobertaForMaskedLM.__init__0  sR     NN1
 $FeD$V, 	r0   c                 .    | j                   j                  S r   r   r   s    r/   r   z(RobertaForMaskedLM.get_output_embeddings?  r   r0   c                 &    || j                   _        y r   r   r   s     r/   r   z(RobertaForMaskedLM.set_output_embeddingsB  r   r0   Nr1   r   r2   r3   r4   r   r   r   r   r   c	                 t    | j                   |f||||||dd|	}
|
d   }| j                  |      }d}|a|j                  |j                        }t	               } ||j                  d| j                  j                        |j                  d            }t        |||
j                  |
j                        S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        T)r   r2   r3   r4   r   r   r   r   Nr7   r   r   rq   rr   )rp   r   tor<   r   viewr-   r   r   rq   rr   )r,   r1   r   r2   r3   r4   r   r   r   r   r   sequence_outputprediction_scoresmasked_lm_lossloss_fcts                  r/   rP   zRobertaForMaskedLM.forwardE  s    : $,,

))%'"7#9

 

 "!* LL9YY0778F')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
r0   )NNNNNNNN)r]   r^   r_   r   r$   r   r   r   r   rC   r`   ra   r   r   r   r   r   rP   rc   rd   s   @r/   r   r   )  s'    #N .
$.  .237260426:>;?*.5
##d*5
 ))D05
 ((4/	5

 &&-5
 ((4/5
  %00475
 !& 1 1D 85
   4'5
 +,5
 
u||	~	-5
  5
r0   r   c                   (     e Zd ZdZ fdZd Z xZS )rw   z*Roberta Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        y )N)eps)r#   r$   r'   Linearr*   denserI   layer_norm_eps
layer_normr   r   	ParameterrC   rE   rz   r+   s     r/   r$   zRobertaLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	r0   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   r,   featuresr   xs       r/   rP   zRobertaLMHead.forward  s;    JJx GOOA LLOr0   r]   r^   r_   __doc__r$   rP   rc   rd   s   @r/   rw   rw     s    4Ar0   rw   z
    RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eej                     ez  fd              Z xZS ) RobertaForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          y NFr   )	r#   r$   
num_labelsr-   r   rp   RobertaClassificationHead
classifierr   r+   s     r/   r$   z)RobertaForSequenceClassification.__init__  sJ      ++#FeD3F; 	r0   Nr1   r   r2   r3   r4   r   r   r   c           	          | j                   |f||||dd|}|d   }	| j                  |	      }
d}||j                  |
j                        }| j                  j
                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j
                  dk(  rIt               }| j                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                  j
                  dk(  r=t               } ||
j                  d	| j                        |j                  d	            }n,| j                  j
                  dk(  rt               } ||
|      }t!        ||
|j"                  |j$                  
      S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Tr   r2   r3   r4   r   r   Nr   
regressionsingle_label_classificationmulti_label_classificationr7   r   )rp   r   r   r<   r-   problem_typer   r;   rC   rF   rX   r   squeezer   r   r   r   rq   rr   r,   r1   r   r2   r3   r4   r   r   r   r   r   r   r   s                r/   rP   z(RobertaForSequenceClassification.forward  s   6 $,,
))%'
 
 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r0   NNNNNN)r]   r^   r_   r$   r   r   rC   r`   ra   r   r   r   r   r   rP   rc   rd   s   @r/   r   r     s    	  .237260426*.C
##d*C
 ))D0C
 ((4/	C

 &&-C
 ((4/C
   4'C
 +,C
 
u||	7	7C
  C
r0   r   c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eej                     ez  fd              Z xZS )RobertaForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr   )r#   r$   r   rp   r'   Dropouthidden_dropout_probrJ   r   r*   r   r   r+   s     r/   r$   z!RobertaForMultipleChoice.__init__  sV     #F+zz&"<"<=))F$6$6: 	r0   Nr1   r2   r   r   r3   r4   r   r   c           	      "   ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}	|!|j                  d|j                  d            nd}
|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |	f|
|||dd|}|d   }| j	                  |      }| j                  |      }|j                  d|      }d}|.|j                  |j                        }t               } |||      }t        |||j                  |j                        S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr   r7   T)r3   r2   r   r4   r   r   )rB   r   r?   rp   rJ   r   r   r<   r   r   rq   rr   )r,   r1   r2   r   r   r3   r4   r   num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   pooled_outputr   reshaped_logitsr   r   s                       r/   rP   z RobertaForMultipleChoice.forward  s   V -6,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 $,,
*..,
 
  
]3/ ++b+6YY556F')HOV4D("!//))	
 	
r0   r   )r]   r^   r_   r$   r   r   rC   r`   ra   r   r   r   r   r   rP   rc   rd   s   @r/   r   r     s      .22637*.0426P
##d*P
 ((4/P
 ))D0	P

   4'P
 &&-P
 ((4/P
 +,P
 
u||	8	8P
  P
r0   r   c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eej                     ez  fd              Z xZS )RobertaForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r   )r#   r$   r   r   rp   classifier_dropoutr   r'   r   rJ   r   r*   r   r   r,   r-   r   r.   s      r/   r$   z&RobertaForTokenClassification.__init__R  s      ++#FeD)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	r0   Nr1   r   r2   r3   r4   r   r   r   c           	      ~    | j                   |f||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|W|j                  |
j                        }t               } ||
j                  d| j                        |j                  d            }t        ||
|j                  |j                        S )a-  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Tr   r   Nr7   r   )rp   rJ   r   r   r<   r   r   r   r   rq   rr   r   s                r/   rP   z%RobertaForTokenClassification.forward`  s    2 $,,
))%'
 
 "!*,,71YYv}}-F')HFKKDOO<fkk"oND$!//))	
 	
r0   r   )r]   r^   r_   r$   r   r   rC   r`   ra   r   r   r   r   r   rP   rc   rd   s   @r/   r   r   P  s      .237260426*.2
##d*2
 ))D02
 ((4/	2

 &&-2
 ((4/2
   4'2
 +,2
 
u||	4	42
  2
r0   r   c                   (     e Zd ZdZ fdZd Z xZS )r   z-Head for sentence-level classification tasks.c                 Z   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        y r   )r#   r$   r'   r   r*   r   r   r   r   rJ   r   out_projr   s      r/   r$   z"RobertaClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHr0   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S )Nr   )rJ   r   rC   tanhr   r   s       r/   rP   z!RobertaClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!r0   r   rd   s   @r/   r   r     s    7Ir0   r   c                   J    e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	e
   deej                     ez  fd              Z xZS )RobertaForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r#   r$   r   r   rp   r'   r   r*   
qa_outputsr   r+   s     r/   r$   z$RobertaForQuestionAnswering.__init__  sU      ++#FeD))F$6$68I8IJ 	r0   Nr1   r   r2   r3   r4   start_positionsend_positionsr   r   c           	          | j                   |f||||dd|}	|	d   }
| j                  |
      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   d	z  }t        ||||	j                  |	j                  
      S )a[  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        Tr   r   r   r7   rV   N)ignore_indexr   )r   start_logits
end_logitsrq   rr   )rp   r  splitr   
contiguouslenr?   clampr   r   rq   rr   )r,   r1   r   r2   r3   r4   r  r  r   r   r   r   r	  r
  
total_lossignored_indexr   
start_lossend_losss                      r/   rP   z#RobertaForQuestionAnswering.forward  s   0 $,,
))%'
 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
r0   )NNNNNNN)r]   r^   r_   r$   r   r   rC   r`   ra   r   r   r   r   r   rP   rc   rd   s   @r/   r  r    s      .2372604263715>
##d*>
 ))D0>
 ((4/	>

 &&->
 ((4/>
 ))D0>
 ''$.>
 +,>
 
u||	;	;>
  >
r0   r  )r   r   r   r  r   r   r   ro   );r   rC   torch.nnr'   r   r   r    r   rx   activationsr   
generationr	   modeling_outputsr
   r   r   r   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   bert.modeling_bertr   r   r   r   r   configuration_robertar   
get_loggerr]   r   r    rf   rk   rm   ro   r   r   r   Modulerw   r   r   r   r   r  __all__rh   r0   r/   <module>r!     s      A A &  )   . & @ @ - l l 0 
		H	%\8 \8~	, 		. 		9 	 /_ / /2'9 '
 
k
/ k

k
\ R
/ R
 R
jBII , Q
'= Q
Q
h ]
5 ]
 ]
@ C
$: C
 C
L		 , K
"8 K
 K
\	r0   