
    iG                     "   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7  e0jp                  e9      Z: G d dejv                        Z<	 	 d\dejv                  dejz                  dejz                  dejz                  dejz                  dz  de>dz  de>d e)e.   fd!Z? G d" d#ejv                        Z@ G d$ d%ejv                        ZA G d& d'ejv                        ZB G d( d)ejv                        ZC G d* d+ejv                        ZD G d, d-ejv                        ZE G d. d/e      ZF G d0 d1ejv                        ZG G d2 d3ejv                        ZH G d4 d5ejv                        ZI G d6 d7ejv                        ZJ G d8 d9ejv                        ZK G d: d;ejv                        ZL G d< d=ejv                        ZMe/ G d> d?e'             ZNe e/d@A       G dB dCe-                    ZO e/dDA       G dE dFeN             ZP e/dGA       G dH dIeN             ZQ e/dJA       G dK dLeNe             ZRe/ G dM dNeN             ZS e/dOA       G dP dQeN             ZT e/dRA       G dS dTeN             ZUe/ G dU dVeN             ZVe/ G dW dXeN             ZWe/ G dY dZeN             ZXg d[ZYy)]zPyTorch BERT model.    )Callable)	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)	)BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )
BertConfigc                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
 xZS )BertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxepsposition_idsr'   F)
persistenttoken_type_ids)dtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr/   sizelongselfconfig	__class__s     b/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.pyr6   zBertEmbeddings.__init__8   s   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    N	input_idsr3   r/   inputs_embedspast_key_values_lengthreturnc                    ||j                         }n|j                         d d }|\  }}|| j                  d d |||z   f   }|t        | d      rT| j                  j	                  |j
                  d   d      }	t        j                  |	d|      }	|	j	                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  |      }||z   }| j                  |      }| j                  |      }|S )Nr1   r3   r   r'   )dimindex)r4   device)rJ   r/   hasattrr3   rH   shaperF   gatherrI   rK   rY   r;   r?   r=   r@   rD   )rM   rR   r3   r/   rS   rT   input_shape
batch_size
seq_lengthbuffered_token_type_idsr?   
embeddingsr=   s                rP   forwardzBertEmbeddings.forwardH   sP     #..*K',,.s3K!,
J,,Q0FVlIl0l-lmL
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
rQ   )NNNNr   )__name__
__module____qualname____doc__r6   rF   
LongTensorFloatTensorintTensorrb   __classcell__rO   s   @rP   r*   r*   5   s    Q
$ .2260426&'(##d*( ((4/( &&-	(
 ((4/( !$( 
(rQ   r*   modulequerykeyvalueattention_maskscalingrD   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr1            r	   rW   )ptrainingr'   )
rJ   rF   matmul	transposer   
functionalsoftmaxrD   ry   
contiguous)
rm   rn   ro   rp   rq   rr   rD   rs   attn_weightsattn_outputs
             rP   eager_attention_forwardr   s   s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$rQ   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dedz  dej
                  dz  dee	   de
ej
                     fd	Z xZS )BertSelfAttentionNc                 @   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        |j&                  | _        || _        || _        y Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()ru   )r5   r6   r9   num_attention_headsrZ   
ValueErrorrN   ri   attention_head_sizeall_head_sizerr   r   Linearrn   ro   rp   rB   attention_probs_dropout_probrD   
is_decoder	is_causal	layer_idxrM   rN   r   r   rO   s       rP   r6   zBertSelfAttention.__init__   sP    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""rQ   hidden_statesrq   past_key_valuescache_positionrs   rU   c                    |j                   d d }g |d| j                  } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }	 | j                  |      j                  | j	                  dd      }
|A|}t        |t              r|j                  }|j                  |	|
| j                  d|i      \  }	}
t        j                  | j                  j                  t               } || ||	|
|f| j"                  sdn| j$                  j&                  | j(                  d|\  }} |j*                  g |d j-                         }||fS )Nr1   r'   rv   r           rD   rr   )r[   r   rn   viewr{   ro   rp   
isinstancer   self_attention_cacheupdater   r   get_interfacerN   _attn_implementationr   ry   rD   rx   rr   reshaper~   )rM   r   rq   r   r   rs   r]   hidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer   r   s                  rP   rb   zBertSelfAttention.forward   s    $))#2.CCbC$*B*BC 5djj/44lCMMaQRS0DHH]+00,?II!QO	4djj/44lCMMaQRS&&5#/+>?*9*N*N' &=%C%C!>2	&"I{ )@(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ *k));;;;FFHL((rQ   FNNNNrc   rd   re   r6   rF   rj   rh   r   r   r!   tuplerb   rk   rl   s   @rP   r   r      s}    #6 48(,.2-)||-) ))D0-) 	-)
 t+-) +,-) 
u||	-)rQ   r   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dej                  dz  dedz  dee	   de
ej
                     fd	Z xZS )BertCrossAttentionNc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        | j                  dz  | _
        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        || _        || _        y r   )r5   r6   r9   r   rZ   r   rN   ri   r   r   rr   r   r   rn   ro   rp   rB   r   rD   r   r   r   s       rP   r6   zBertCrossAttention.__init__   sC    : ::a?PVXhHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""rQ   r   encoder_hidden_statesrq   r   rs   rU   c                 V   |j                   d d \  }}|j                   d   }||d| j                  f}	||d| j                  f}
 | j                  |      j                  |	 j	                  dd      }|%|j
                  j                  | j                        nd}|]|r[|j                  j                  | j                     j                  }|j                  j                  | j                     j                  }n | j                  |      j                  |
 j	                  dd      } | j                  |      j                  |
 j	                  dd      }|C|j                  j                  ||| j                        \  }}d|j
                  | j                  <   t        j                   | j"                  j$                  t&              } || ||||f| j(                  sdn| j*                  j,                  | j.                  d|\  }}|j1                  ||d      j3                         }||fS )Nr1   r'   rv   FTr   r   )r[   r   rn   r   r{   
is_updatedgetr   cross_attention_cachelayerskeysvaluesro   rp   r   r   r   rN   r   r   ry   rD   rx   rr   r   r~   )rM   r   r   rq   r   rs   bsztgt_lensrc_lenq_input_shapekv_input_shaper   r   r   r   r   r   r   s                     rP   rb   zBertCrossAttention.forward   s    %**3B/W'--a0gr4+C+CDwD,D,DE 5djj/44mDNNqRSTGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]K<!67<<nMWWXY[\]I@$**%:;@@.Q[[\]_`aK*)8)N)N)U)U{DNN*&	; >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
!\ "))#w;FFHL((rQ   r   r   )rc   rd   re   r6   rF   rj   rh   r   r   r!   r   rb   rk   rl   s   @rP   r   r      s    #4 ;?376:2)||2)  %00472) ))D0	2)
 -t32) +,2) 
u||	2)rQ   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )BertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr-   )r5   r6   r   r   r9   denser@   rA   rB   rC   rD   rL   s     rP   r6   zBertSelfOutput.__init__&  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rQ   r   input_tensorrU   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S Nr   rD   r@   rM   r   r   s      rP   rb   zBertSelfOutput.forward,  7    

=1]3}|'CDrQ   rc   rd   re   r6   rF   rj   rb   rk   rl   s   @rP   r   r   %  1    >U\\  RWR^R^ rQ   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dej
                  dz  d	ee	   d
e
ej
                     fdZ xZS )BertAttentionNc                     t         |           || _        |rt        nt        } ||||      | _        t        |      | _        y )Nr   r   )r5   r6   is_cross_attentionr   r   rM   r   output)rM   rN   r   r   r   attention_classrO   s         rP   r6   zBertAttention.__init__4  s=    "40B,HY#Fi9U	$V,rQ   r   rq   r   encoder_attention_maskr   r   rs   rU   c                     | j                   s|n|} | j                  |f||||d|\  }}	| j                  ||      }||	fS )N)r   rq   r   r   )r   rM   r   )
rM   r   rq   r   r   r   r   rs   attention_outputr   s
             rP   rb   zBertAttention.forward;  sg     04/F/FLb)2*
"7)+)*
 *
&,  ;;'7G--rQ   )FNFNNNNNr   rl   s   @rP   r   r   3  s    - 48:>;?(,.2.||. ))D0.  %0047	.
 !& 1 1D 8. . t+. +,. 
u||	.rQ   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BertIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r5   r6   r   r   r9   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnrL   s     rP   r6   zBertIntermediate.__init__S  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rQ   r   rU   c                 J    | j                  |      }| j                  |      }|S r   )r   r   rM   r   s     rP   rb   zBertIntermediate.forward[  s&    

=100?rQ   r   rl   s   @rP   r   r   R  s#    9U\\ ell rQ   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )
BertOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r5   r6   r   r   r   r9   r   r@   rA   rB   rC   rD   rL   s     rP   r6   zBertOutput.__init__b  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rQ   r   r   rU   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rP   rb   zBertOutput.forwardh  r   rQ   r   rl   s   @rP   r   r   a  r   rQ   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dej
                  dz  d	ee	   d
e
ej
                     fdZd Z xZS )	BertLayerNc                    t         |           |j                  | _        d| _        t	        ||j
                  |      | _        |j
                  | _        |j                  | _        | j                  r.| j
                  st        |  d      t	        |d|d      | _	        t        |      | _        t        |      | _        y )Nr'   r   z> should be used as a decoder model if cross attention is addedFT)r   r   r   )r5   r6   chunk_size_feed_forwardseq_len_dimr   r   	attentionadd_cross_attentionr   crossattentionr   intermediater   r   )rM   rN   r   rO   s      rP   r6   zBertLayer.__init__p  s    '-'E'E$&v9J9JV_` ++#)#=#= ##?? D6)g!hii"/##'	#D -V4 (rQ   r   rq   r   r   r   r   rs   rU   c                 "    | j                   ||f||d|\  }}	|}
| j                  r:|8t        | d      st        d|  d       | j                  |d ||fd|i|\  }}	|}
t        | j                  | j                  | j                  |
      }|S )N)r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   r   rZ   r   r   r   feed_forward_chunkr   r   )rM   r   rq   r   r   r   r   rs   self_attention_output_r   cross_attention_outputlayer_outputs                rP   rb   zBertLayer.forward  s     $24>>$
 ,)	$

 $
 q 1??4@4!12 =dV DD D 
 )<(;(;%%&	)
 !0) )%"A  60##T%A%A4CSCSUe
 rQ   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )rM   r   intermediate_outputr   s       rP   r   zBertLayer.feed_forward_chunk  s,    "//0@A{{#68HIrQ   r   r   )rc   rd   re   r6   rF   rj   rh   r   r   r!   r   rb   r   rk   rl   s   @rP   r   r   o  s    ), 48:>;?(,.2'||' ))D0'  %0047	'
 !& 1 1D 8' ' t+' +,' 
u||	'RrQ   r   c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	ej
                  dz  d
e	e
   deej
                     ez  fdZ xZS )BertEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w )N)r   )	r5   r6   rN   r   
ModuleListrangenum_hidden_layersr   layer)rM   rN   irO   s      rP   r6   zBertEncoder.__init__  sF    ]]ERXRjRjLk#lqIf$B#lm
#ls   ANr   rq   r   r   r   	use_cacher   rs   rU   c                     t        | j                        D ]  \  }	}
 |
|||f|||d|} t        ||r|      S d       S )N)r   r   r   )last_hidden_stater   )	enumerater   r   )rM   r   rq   r   r   r   r   r   rs   r   layer_modules              rP   rb   zBertEncoder.forward  sq      )4 		OA|(% (> /- M		 9+/8O
 	
>B
 	
rQ   NNNNNN)rc   rd   re   r6   rF   rj   rh   r   boolr   r!   r   r   rb   rk   rl   s   @rP   r   r     s    n 48:>;?(,!%.2
||
 ))D0
  %0047	

 !& 1 1D 8
 
 $;
 t+
 +,
 
u||	H	H
rQ   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
BertPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r5   r6   r   r   r9   r   Tanh
activationrL   s     rP   r6   zBertPooler.__init__  s9    YYv1163E3EF
'')rQ   r   rU   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r  )rM   r   first_token_tensorpooled_outputs       rP   rb   zBertPooler.forward  s6     +1a40

#566rQ   r   rl   s   @rP   r   r     s#    $
U\\ ell rQ   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BertPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )r5   r6   r   r   r9   r   r   r   r   r   transform_act_fnr@   rA   rL   s     rP   r6   z$BertPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STrQ   r   rU   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r	  r@   r   s     rP   rb   z#BertPredictionHeadTransform.forward  s4    

=1--m<}5rQ   r   rl   s   @rP   r  r    s$    UU\\ ell rQ   r  c                   $     e Zd Z fdZd Z xZS )BertLMPredictionHeadc                    t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        y )NT)bias)r5   r6   r  	transformr   r   r9   r8   decoder	ParameterrF   rI   r  rL   s     rP   r6   zBertLMPredictionHead.__init__  s[    4V< yy!3!3V5F5FTRLLV->->!?@	rQ   c                 J    | j                  |      }| j                  |      }|S r   )r  r  r   s     rP   rb   zBertLMPredictionHead.forward  s$    }5]3rQ   rc   rd   re   r6   rb   rk   rl   s   @rP   r  r    s    ArQ   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BertOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r5   r6   r  predictionsrL   s     rP   r6   zBertOnlyMLMHead.__init__  s    /7rQ   sequence_outputrU   c                 (    | j                  |      }|S r   )r  )rM   r  prediction_scoress      rP   rb   zBertOnlyMLMHead.forward	  s     ,,_=  rQ   r   rl   s   @rP   r  r    s#    8!u|| ! !rQ   r  c                   $     e Zd Z fdZd Z xZS )BertOnlyNSPHeadc                 l    t         |           t        j                  |j                  d      | _        y Nrv   )r5   r6   r   r   r9   seq_relationshiprL   s     rP   r6   zBertOnlyNSPHead.__init__  s'     "		&*<*<a @rQ   c                 (    | j                  |      }|S r   )r  )rM   r  seq_relationship_scores      rP   rb   zBertOnlyNSPHead.forward  s    !%!6!6}!E%%rQ   r  rl   s   @rP   r  r    s    A&rQ   r  c                   $     e Zd Z fdZd Z xZS )BertPreTrainingHeadsc                     t         |           t        |      | _        t	        j
                  |j                  d      | _        y r  )r5   r6   r  r  r   r   r9   r  rL   s     rP   r6   zBertPreTrainingHeads.__init__  s4    /7 "		&*<*<a @rQ   c                 N    | j                  |      }| j                  |      }||fS r   )r  r  )rM   r  r  r  r!  s        rP   rb   zBertPreTrainingHeads.forward  s0     ,,_=!%!6!6}!E "888rQ   r  rl   s   @rP   r#  r#    s    A
9rQ   r#  c                   n     e Zd ZeZdZdZdZdZdZ	dZ
eeedZ ej                           fd       Z xZS )BertPreTrainedModelbertT)r   
attentionscross_attentionsc                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              ryt	        j                  |j                  t        j                  |j                  j                  d         j                  d             t	        j
                  |j                         yy)zInitialize the weightsr1   r0   N)r5   _init_weightsr   r  initzeros_r  r*   copy_r/   rF   rG   r[   rH   r3   )rM   rm   rO   s     rP   r,  z!BertPreTrainedModel._init_weights3  s     	f%f23KK$/JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 0rQ   )rc   rd   re   r(   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   r   _can_record_outputsrF   no_gradr,  rk   rl   s   @rP   r'  r'  $  sX    L&*#N"&"'. U]]_/ /rQ   r'  z0
    Output type of [`BertForPreTraining`].
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   y)BertForPreTrainingOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    Nlossprediction_logitsseq_relationship_logitsr   r)  )rc   rd   re   rf   r<  rF   rh   __annotations__r=  r>  r   r   r)   rQ   rP   r;  r;  >  s~    	 &*D%

d
")26u((4/68<U..5<59M5**+d2926Je''(4/6rQ   r;  a
  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    c                       e Zd ZddgZd fd	Zd Zd Zeee		 	 	 	 	 	 	 	 	 	 dde
j                  dz  de
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  dedz  de
j                  dz  dee   dee
j                     ez  fd                     Zd Z xZS )	BertModelr*   r   c                     t         |   |       || _        d| _        t	        |      | _        t        |      | _        |rt        |      nd| _	        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        FN)r5   r6   rN   gradient_checkpointingr*   ra   r   encoderr   pooler	post_init)rM   rN   add_pooling_layerrO   s      rP   r6   zBertModel.__init__f  sU    
 	 &+#(0"6*,=j(4 	rQ   c                 .    | j                   j                  S r   ra   r;   rM   s    rP   get_input_embeddingszBertModel.get_input_embeddingsw  s    ...rQ   c                 &    || j                   _        y r   rJ  )rM   rp   s     rP   set_input_embeddingszBertModel.set_input_embeddingsz  s    */'rQ   NrR   rq   r3   r/   rS   r   r   r   r   r   rs   rU   c                 L   | j                   j                  r|	|	n| j                   j                  }	nd}	|	rd|b|| j                   j                  r4t	        t        | j                         t        | j                               nt        | j                         }|d u |d uz  rt        d      ||j                  }|j                  d   }n|j                  }|j                  d   }||j                         nd}|
t        j                  |||z   |      }
| j                  |||||      }| j                  |||||
|      \  }} | j                  |f|||||	|
|d	|}|j                  }| j                   | j!                  |      nd }t#        |||j$                  
      S )NF)rN   z:You must specify exactly one of input_ids or inputs_embedsr'   r   )rY   )rR   r/   r3   rS   rT   )rq   r   embedding_outputr   r   r   )rq   r   r   r   r   r   r/   )r   pooler_outputr   )rN   r   r   is_encoder_decoderr   r   r   rY   r[   get_seq_lengthrF   rG   ra   _create_attention_masksrE  r   rF  r   r   )rM   rR   rq   r3   r/   rS   r   r   r   r   r   rs   rY   r_   rT   rP  encoder_outputsr  r  s                      rP   rb   zBertModel.forward}  s   " ;;!!%.%:	@U@UII0 )48V8V $L$DlZ^ZeZeFfg!5  -t";<YZZ %%F"+J"))F&,,Q/JETE`!?!?!Afg!"\\*@BX[eBentuN??%)'#9 + 
 261M1M)#9-"7)+ 2N 2
.. '$,,

)"7#9+)%

 

 *;;8<8OO4UY;-'+;;
 	
rQ   c                     | j                   j                  rt        | j                   ||||      }nt        | j                   ||      }|t        | j                   |||      }||fS )N)rN   rS   rq   r   r   )rN   rS   rq   )rN   rS   rq   r   )rN   r   r   r   )rM   rq   r   rP  r   r   r   s          rP   rT  z!BertModel._create_attention_masks  sx     ;;!!/{{.-- /N 7{{.-N "-%>{{.5&;	&" 555rQ   )T)
NNNNNNNNNN)rc   rd   re   _no_split_modulesr6   rL  rN  r%   r&   r"   rF   rj   r   r   r   r!   r   r   rb   rT  rk   rl   s   @rP   rB  rB  W  sR    *;7"/0   *..2.2,0-1596:(,!%.2K
<<$&K
 t+K
 t+	K

 llT)K
 ||d*K
  %||d2K
 !&t 3K
 K
 $;K
 t+K
 +,K
 
u||	K	KK
    K
Z 6rQ   rB  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                   `    e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Z xZS )BertForPreTraining&bert.embeddings.word_embeddings.weightcls.predictions.biaszcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r5   r6   rB  r(  r#  clsrG  rL   s     rP   r6   zBertForPreTraining.__init__  s4     f%	'/ 	rQ   c                 B    | j                   j                  j                  S r   r^  r  r  rK  s    rP   get_output_embeddingsz(BertForPreTraining.get_output_embeddings      xx##+++rQ   c                     || j                   j                  _        |j                  | j                   j                  _        y r   r^  r  r  r  rM   new_embeddingss     rP   set_output_embeddingsz(BertForPreTraining.set_output_embeddings  ,    '5$$2$7$7!rQ   NrR   rq   r3   r/   rS   labelsnext_sentence_labelrs   rU   c           	          | j                   |f||||dd|}	|	dd \  }
}| j                  |
|      \  }}d}|u|st               } ||j                  d| j                  j
                        |j                  d            } ||j                  dd      |j                  d            }||z   }t        ||||	j                  |	j                        S )am  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        Trq   r3   r/   rS   return_dictNrv   r1   )r<  r=  r>  r   r)  )	r(  r^  r   r   rN   r8   r;  r   r)  )rM   rR   rq   r3   r/   rS   ri  rj  rs   outputsr  r  r  r!  
total_lossloss_fctmasked_lm_lossnext_sentence_losss                     rP   rb   zBertForPreTraining.forward  s   R $))
))%'
 
 *1!&48HH_m4\11
"5"A')H%&7&<&<RAWAW&XZ`ZeZefhZijN!)*@*E*Eb!*LNaNfNfgiNj!k'*<<J'/$:!//))
 	
rQ   NNNNNNN)rc   rd   re   _tied_weights_keysr6   ra  rg  r$   r"   rF   rj   r   r!   r   r;  rb   rk   rl   s   @rP   rY  rY    s    +S(>
,8  *..2.2,0-1&*37A
<<$&A
 t+A
 t+	A

 llT)A
 ||d*A
 t#A
 #\\D0A
 +,A
 
u||	7	7A
  A
rQ   rY  zP
    Bert Model with a `language modeling` head on top for CLM fine-tuning.
    c                        e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dedz  dedz  de	j                  dz  dee	j                  z  dee   dee	j                     ez  fd              Z xZS )BertLMHeadModelrZ  r[  r\  c                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzLIf you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`FrH  
r5   r6   r   loggerwarningrB  r(  r  r^  rG  rL   s     rP   r6   zBertLMHeadModel.__init__]  sL       NNijf>	"6* 	rQ   c                 B    | j                   j                  j                  S r   r`  rK  s    rP   ra  z%BertLMHeadModel.get_output_embeddingsi  rb  rQ   c                     || j                   j                  _        |j                  | j                   j                  _        y r   rd  re  s     rP   rg  z%BertLMHeadModel.set_output_embeddingsl  rh  rQ   NrR   rq   r3   r/   rS   r   r   ri  r   r   r   logits_to_keeprs   rU   c                    |d}
 | j                   |f|||||||	|
|dd
|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
        NFT)
rq   r3   r/   rS   r   r   r   r   r   rm  )logitsri  r8   )r<  r  r   r   r)  r*  r@  )r(  r   r   ri   slicer^  loss_functionrN   r8   r   r   r   r)  r*  )rM   rR   rq   r3   r/   rS   r   r   ri  r   r   r   r~  rs   rn  r   slice_indicesr  r<  s                      rP   rb   zBertLMHeadModel.forwardp  s    0 I@I		A
))%'"7#9+)A
 A
  118B>SV8W~ot4]k-=!(;<=%4%%pVFt{{OeOepiopD0#33!//))$55
 	
rQ   )NNNNNNNNNNNr   )rc   rd   re   rt  r6   ra  rg  r$   r"   rF   rj   r   r   ri   r   r!   r   r   rb   rk   rl   s   @rP   rv  rv  R  sf    +S(>

,8  *..2.2,0-1596:&*(,!%.2-.8
<<$&8
 t+8
 t+	8

 llT)8
 ||d*8
  %||d28
 !&t 38
 t#8
 8
 $;8
 t+8
 ell*8
 +,8
 
u||	@	@8
  8
rQ   rv  c                       e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Z xZS )BertForMaskedLMrZ  r[  r\  c                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzkIf you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Frx  ry  rL   s     rP   r6   zBertForMaskedLM.__init__  sR     NN1
 f>	"6* 	rQ   c                 B    | j                   j                  j                  S r   r`  rK  s    rP   ra  z%BertForMaskedLM.get_output_embeddings  rb  rQ   c                     || j                   j                  _        |j                  | j                   j                  _        y r   rd  re  s     rP   rg  z%BertForMaskedLM.set_output_embeddings  rh  rQ   NrR   rq   r3   r/   rS   r   r   ri  rs   rU   c	                 >    | j                   |f||||||dd|	}
|
d   }| j                  |      }d}|Ft               } ||j                  d| j                  j
                        |j                  d            }t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        T)rq   r3   r/   rS   r   r   rm  r   Nr1   r<  r  r   r)  )	r(  r^  r   r   rN   r8   r   r   r)  )rM   rR   rq   r3   r/   rS   r   r   ri  rs   rn  r  r  rq  rp  s                  rP   rb   zBertForMaskedLM.forward  s    ( $))

))%'"7#9

 

 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
rQ   )NNNNNNNN)rc   rd   re   rt  r6   ra  rg  r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r  r    s    +S(>
,8  *..2.2,0-1596:&*+
<<$&+
 t++
 t+	+

 llT)+
 ||d*+
  %||d2+
 !&t 3+
 t#+
 +,+
 
u||	~	-+
  +
rQ   r  zT
    Bert Model with a `next sentence prediction (classification)` head on top.
    c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )BertForNextSentencePredictionc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r5   r6   rB  r(  r  r^  rG  rL   s     rP   r6   z&BertForNextSentencePrediction.__init__   s4     f%	"6* 	rQ   NrR   rq   r3   r/   rS   ri  rs   rU   c           	          | j                   |f||||dd|}|d   }	| j                  |	      }
d}|2t               } ||
j                  dd      |j                  d            }t	        ||
|j
                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        Trl  r'   Nr1   rv   r  )r(  r^  r   r   r   r   r)  )rM   rR   rq   r3   r/   rS   ri  rs   rn  r  seq_relationship_scoresrr  rp  s                rP   rb   z%BertForNextSentencePrediction.forward	  s    N $))
))%'
 
  
"&((="9!')H!)*A*F*Fr1*Mv{{[]!_*#*!//))	
 	
rQ   r   )rc   rd   re   r6   r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r  r    s      *..2.2,0-1&*=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 t#=
 +,=
 
u||	:	:=
  =
rQ   r  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )BertForSequenceClassificationc                 n   t         |   |       |j                  | _        || _        t	        |      | _        |j                  |j                  n|j                  }t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y r   )r5   r6   
num_labelsrN   rB  r(  classifier_dropoutrC   r   rB   rD   r   r9   
classifierrG  rM   rN   r  rO   s      rP   r6   z&BertForSequenceClassification.__init__R  s      ++f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rQ   NrR   rq   r3   r/   rS   ri  rs   rU   c           	          | j                   |f||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|| j                  j                  | j
                  dk(  rd| j                  _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt               }| j
                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                  j                  dk(  r=t               } ||
j                  d| j
                        |j                  d            }n,| j                  j                  dk(  rt               } ||
|      }t        ||
|j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Trl  r'   N
regressionsingle_label_classificationmulti_label_classificationr1   r  )r(  rD   r  rN   problem_typer  r4   rF   rK   ri   r   squeezer   r   r   r   r   r)  )rM   rR   rq   r3   r/   rS   ri  rs   rn  r  r  r<  rp  s                rP   rb   z%BertForSequenceClassification.forwarda  s   $ $))
))%'
 
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rQ   r   )rc   rd   re   r6   r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r  r  K  s      *..2.2,0-1&*;
<<$&;
 t+;
 t+	;

 llT);
 ||d*;
 t#;
 +,;
 
u||	7	7;
  ;
rQ   r  c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )BertForMultipleChoicec                 *   t         |   |       t        |      | _        |j                  |j                  n|j
                  }t        j                  |      | _        t        j                  |j                  d      | _        | j                          y )Nr'   )r5   r6   rB  r(  r  rC   r   rB   rD   r   r9   r  rG  r  s      rP   r6   zBertForMultipleChoice.__init__  su     f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$6: 	rQ   NrR   rq   r3   r/   rS   ri  rs   rU   c           	         ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |f||||dd|}	|	d   }
| j	                  |
      }
| j                  |
      }|j                  d|      }d}|t               } |||      }t        |||	j                  |	j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr'   r1   Trl  r  )
r[   r   rJ   r(  rD   r  r   r   r   r)  )rM   rR   rq   r3   r/   rS   ri  rs   num_choicesrn  r  r  reshaped_logitsr<  rp  s                  rP   rb   zBertForMultipleChoice.forward  s   T -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 $))
))%'
 
  
]3/ ++b+6')HOV4D("!//))	
 	
rQ   r   )rc   rd   re   r6   r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r  r    s      *..2.2,0-1&*N
<<$&N
 t+N
 t+	N

 llT)N
 ||d*N
 t#N
 +,N
 
u||	8	8N
  N
rQ   r  c                   *    e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee	   d
e
ej                     ez  fd              Z xZS )BertForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y NFrx  )r5   r6   r  rB  r(  r  rC   r   rB   rD   r   r9   r  rG  r  s      rP   r6   z#BertForTokenClassification.__init__  s      ++f>	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rQ   NrR   rq   r3   r/   rS   ri  rs   rU   c           	      H    | j                   |f||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|<t               } ||
j	                  d| j
                        |j	                  d            }t        ||
|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Trl  r   Nr1   r  )	r(  rD   r  r   r   r  r   r   r)  )rM   rR   rq   r3   r/   rS   ri  rs   rn  r  r  r<  rp  s                rP   rb   z"BertForTokenClassification.forward  s      $))
))%'
 
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
rQ   r   )rc   rd   re   r6   r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r  r    s      *..2.2,0-1&*'
<<$&'
 t+'
 t+	'

 llT)'
 ||d*'
 t#'
 +,'
 
u||	4	4'
  '
rQ   r  c                   J    e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ee	   de
ej                     ez  fd              Z xZS )BertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r  )
r5   r6   r  rB  r(  r   r   r9   
qa_outputsrG  rL   s     rP   r6   z!BertForQuestionAnswering.__init__A  sU      ++f>	))F$6$68I8IJ 	rQ   NrR   rq   r3   r/   rS   start_positionsend_positionsrs   rU   c           	          | j                   |f||||dd|}	|	d   }
| j                  |
      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }t        ||||	j                  |	j                  	      S )
NTrl  r   r'   r1   rw   )ignore_indexrv   )r<  start_logits
end_logitsr   r)  )r(  r  splitr  r~   lenrJ   clampr   r   r   r)  )rM   rR   rq   r3   r/   rS   r  r  rs   rn  r  r  r  r  ro  ignored_indexrp  
start_lossend_losss                      rP   rb   z BertForQuestionAnswering.forwardK  s    $))
))%'
 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
rQ   rs  )rc   rd   re   r6   r$   r"   rF   rj   r   r!   r   r   rb   rk   rl   s   @rP   r  r  ?  s      *..2.2,0-1/3-13
<<$&3
 t+3
 t+	3

 llT)3
 ||d*3
 ,3
 ||d*3
 +,3
 
u||	;	;3
  3
rQ   r  )r  r  r  rY  r  r  r  r   rv  rB  r'  )Nr   )Zrf   collections.abcr   dataclassesr   rF   r   torch.nnr   r   r    r
   r-  activationsr   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr    r!   r"   r#   utils.genericr$   r%   utils.output_capturingr&   configuration_bertr(   
get_loggerrc   rz  Moduler*   rj   floatr   r   r   r   r   r   r   r   r   r   r  r  r  r  r#  r'  r;  rB  rY  rv  r  r  r  r  r  r  __all__r@  rQ   rP   <module>r     s    $ !   A A & ! C C ) J 9
 
 
 G & 6 M M I 5 * 
		H	%;RYY ;H !%II%<<% 
% <<	%
 LL4'% T\% % '(%8F)		 F)RJ) J)ZRYY .BII .>ryy  @* @F
")) 
D ")) "299  !bii !&bii &	9299 	9 // / /2 
7{ 7 7& 	J6# J6J6Z Y
, Y
Y
x 
S
)? S

S
l I
) I
 I
X 
I
$7 I

I
X M
$7 M
M
` ^
/ ^
 ^
B 8
!4 8
 8
v @
2 @
 @
FrQ   