
    i,                        d Z ddlZddlmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZmZ ddl m!Z!  ejD                  e#      Z$ G d dejJ                        Z&e G d de             Z' G d dejJ                        Z( G d dejJ                        Z) G d dejJ                        Z* G d dejJ                        Z+ G d dejJ                        Z, G d dejJ                        Z- G d  d!ejJ                        Z. G d" d#e      Z/ G d$ d%ejJ                        Z0 G d& d'ejJ                        Z1 G d( d)ejJ                        Z2e G d* d+e'             Z3 G d, d-ejJ                        Z4e G d. d/e'             Z5 G d0 d1ejJ                        Z6 ed23       G d4 d5e'             Z7e G d6 d7e'             Z8e G d8 d9e'             Z9e G d: d;e'             Z:g d<Z;y)=zPyTorch ConvBERT model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNget_activation)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging   )ConvBertConfigc                        e Zd ZdZ fdZ	 	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  f
d	Z xZ	S )ConvBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxepsposition_idsr   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr   sizelongselfconfig	__class__s     j/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/convbert/modeling_convbert.pyr'   zConvBertEmbeddings.__init__1   s   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`"f&;&;AVAVWzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    N	input_idsr#   r   inputs_embedsreturnc                 2   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	| j                  |      }
||	z   |
z   }| j                  |      }| j                  |      }|S )Nr!   r   r#   r   r%   device)r;   r   hasattrr#   r9   r7   r:   r<   rH   r,   r.   r0   r1   r5   )r>   rC   r#   r   rD   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr.   r0   
embeddingss               rA   forwardzConvBertEmbeddings.forwardA   s,     #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M"66|D $ : :> J"%88;PP
^^J/
\\*-
rB   )NNNN)
__name__
__module____qualname____doc__r'   r7   
LongTensorFloatTensorrO   __classcell__r@   s   @rA   r   r   .   s    Q
$ .2260426$##d*$ ((4/$ &&-	$
 ((4/$ 
		$rB   r   c                   Z     e Zd ZU eed<   dZdZ ej                          fd       Z	 xZ
S )ConvBertPreTrainedModelr?   convbertTc                 b   t         |   |       t        |t              r t	        j
                  |j                         yt        |t              rVt	        j                  |j                  d| j                  j                         t	        j
                  |j                         yt        |t              ryt	        j                  |j                  t        j                   |j                  j"                  d         j%                  d             t	        j
                  |j&                         yy)zInitialize the weights        meanstdr!   r    N)r&   _init_weights
isinstanceSeparableConv1Dinitzeros_biasGroupedLinearLayernormal_weightr?   initializer_ranger   copy_r   r7   r8   shaper9   r#   )r>   moduler@   s     rA   r`   z%ConvBertPreTrainedModel._init_weightsn   s     	f%fo.KK$ 23LLSdkk6S6STKK$ 23JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 4rB   )rP   rQ   rR   r   __annotations__base_model_prefixsupports_gradient_checkpointingr7   no_gradr`   rV   rW   s   @rA   rY   rY   h   s/    "&*#U]]_
/ 
/rB   rY   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )rb   zSThis class implements separable convolution, i.e. a depthwise and a pointwise layerc                    t         |           t        j                  |||||dz  d      | _        t        j                  ||dd      | _        t        j                  t        j                  |d            | _	        | j                  j                  j                  j                  d|j                         | j
                  j                  j                  j                  d|j                         y )N   F)kernel_sizegroupspaddingre   r   )rt   re   r\   r]   )r&   r'   r   Conv1d	depthwise	pointwise	Parameterr7   r:   re   rh   datarg   ri   )r>   r?   input_filtersoutput_filtersrt   kwargsr@   s         rA   r'   zSeparableConv1D.__init__   s    # 1$
 =.aV[\LL^Q!?@	""**9Q9Q*R""**9Q9Q*RrB   hidden_statesrE   c                 h    | j                  |      }| j                  |      }|| j                  z  }|S N)rx   ry   re   )r>   r   xs      rA   rO   zSeparableConv1D.forward   s0    NN=)NN1	TYYrB   	rP   rQ   rR   rS   r'   r7   TensorrO   rV   rW   s   @rA   rb   rb   |   s'    ]S U\\ ell rB   rb   c                        e Zd Z fdZ	 	 	 d	dej
                  dej                  dz  dej
                  dz  dedz  deej
                  ej
                  dz  f   f
dZ	 xZ
S )
ConvBertSelfAttentionc                 j   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  |j                  z  }|dk  r|j                  | _        d| _        n|| _        |j                  | _        |j                  | _        |j                  | j                  z  dk7  rt        d      |j                  | j                  z  dz  | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        ||j                  | j                  | j                        | _        t        j                  | j                  | j                  | j                  z        | _        t        j                  |j                  | j                        | _        t        j&                  | j                  dgt)        | j                  dz
  dz        dg	      | _        t        j,                  |j.                        | _        y )
Nr   r*   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsrs   )rt   rv   )r&   r'   hidden_sizenum_attention_headsrI   
ValueError
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   Linearquerykeyvaluerb   key_conv_attn_layerconv_kernel_layerconv_out_layerUnfoldintunfoldr3   attention_probs_dropout_probr5   )r>   r?   new_num_attention_headsr@   s      rA   r'   zConvBertSelfAttention.__init__   s>    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)"<"<@Q@Q"Q"Q&$88DO'(D$'>D$$//DO & 7 7 8 88A=UVV$*$6$6$:R:R$RWX#X !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
#2F&&(:(:D<Q<Q$
  "$4+=+=t?W?WZ^ZoZo?o!p ii(:(:D<N<NOii..2S$BWBWZ[B[_`A`=acd<e
 zz&"E"EFrB   Nr   attention_maskencoder_hidden_statesoutput_attentionsrE   c                 f   |j                   \  }}}|#| j                  |      }| j                  |      }	n"| j                  |      }| j                  |      }	| j                  |j	                  dd            }
|
j	                  dd      }
| j                  |      }|j                  |d| j                  | j                        j	                  dd      }|j                  |d| j                  | j                        j	                  dd      }|	j                  |d| j                  | j                        j	                  dd      }t        j                  |
|      }| j                  |      }t        j                  |d| j                  dg      }t        j                  |d      }| j                  |      }t        j                  ||d| j                   g      }|j	                  dd      j#                         j%                  d      }t&        j(                  j+                  || j                  dgd| j                  dz
  dz  dgd      }|j	                  dd      j                  |d| j                   | j                        }t        j                  |d| j                  | j                  g      }t        j,                  ||      }t        j                  |d| j                   g      }t        j,                  ||j	                  dd            }|t/        j0                  | j                        z  }|||z   }t&        j(                  j                  |d      }| j3                  |      }t        j,                  ||      }|j5                  dddd      j#                         }t        j                  ||d| j                  | j                  g      }t        j6                  ||gd      }|j9                         d d | j                  | j                  z  dz  fz   } |j                  | }|r||f}|S |f}|S )	Nr   rs   r!   dimr   )rt   dilationrv   strider   )rk   r   r   r   	transposer   viewr   r   r7   multiplyr   reshaper   softmaxr   r   
contiguous	unsqueezer   
functionalr   matmulmathsqrtr5   permutecatr;   )r>   r   r   r   r   
batch_sizerK   _mixed_key_layermixed_value_layermixed_key_conv_attn_layermixed_query_layerquery_layer	key_layervalue_layerconv_attn_layerr   r   attention_scoresattention_probscontext_layerconv_outnew_context_layer_shapeoutputss                           rA   rO   zConvBertSelfAttention.forward   s    %2$7$7!
J !,"hh'<=O $

+@ A"hh}5O $

= 9$($<$<]=T=TUVXY=Z$[!$=$G$G1$M! JJ}5',,D44d6N6N

)Aq/ 	 $((R9Q9QSWSkSklvvq
	 (,,D44d6N6N

)Aq/ 	  ..)BDUV 22?C!MM*;b$BWBWYZ=[\!MM*;C,,];~
BHZHZ7[\'11!Q7BBDNNrR--..2++a/A5q9 . 
 (11!Q7??D..0E0E
 ~D<T<TVZVkVk7lmn6GH~D<N<N7OP !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF==*b$BZBZ\`\t\t1uv		=(";Q? #0"4"4"6s";$$t'?'??!C?
 #
 +**,CD6G=/2 O\M]rB   NNFrP   rQ   rR   r'   r7   r   rU   booltuplerO   rV   rW   s   @rA   r   r      s    %GT 4859).Q||Q ))D0Q  %||d2	Q
  $;Q 
u||U\\D00	1QrB   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ConvBertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r&   r'   r   r   r   denser1   r2   r3   r4   r5   r=   s     rA   r'   zConvBertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rB   r   input_tensorrE   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r5   r1   r>   r   r   s      rA   rO   zConvBertSelfOutput.forward  7    

=1]3}|'CDrB   rP   rQ   rR   r'   r7   r   rO   rV   rW   s   @rA   r   r     s1    >U\\  RWR^R^ rB   r   c                        e Zd Z fdZ	 	 	 d	dej
                  dej                  dz  dej
                  dz  dedz  deej
                  ej                  dz  f   f
dZ	 xZ
S )
ConvBertAttentionc                 b    t         |           t        |      | _        t	        |      | _        y r   )r&   r'   r   r>   r   outputr=   s     rA   r'   zConvBertAttention.__init__!  s&    )&1	(0rB   Nr   r   r   r   rE   c                 j    | j                  ||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r>   r   )r>   r   r   r   r   self_outputsattention_outputr   s           rA   rO   zConvBertAttention.forward&  sN     yy!	
  ;;|AF#%QR(88rB   r   r   rW   s   @rA   r   r      s}    1 4859).|| ))D0  %||d2	
  $; 
u||U..55	6rB   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )rf   c                    t         |           || _        || _        || _        | j                  | j                  z  | _        | j                  | j                  z  | _        t        j                  t        j                  | j                  | j
                  | j                              | _        t        j                  t        j                  |            | _        y r   )r&   r'   
input_sizeoutput_size
num_groupsgroup_in_dimgroup_out_dimr   rz   r7   emptyrh   re   )r>   r   r   r   r@   s       rA   r'   zGroupedLinearLayer.__init__9  s    $&$ OOt>!--@ll5;;t@Q@QSWSeSe#fgLL[!9:	rB   r   rE   c                    t        |j                               d   }t        j                  |d| j                  | j
                  g      }|j                  ddd      }t        j                  || j                        }|j                  ddd      }t        j                  ||d| j                  g      }|| j                  z   }|S )Nr   r!   r   rs   )listr;   r7   r   r   r   r   r   rh   r   re   )r>   r   r   r   s       rA   rO   zGroupedLinearLayer.forwardC  s    -,,./2
MM-"doot?P?P)QRIIaALLDKK(IIaAMM!j"d.>.>?@		MrB   r   rW   s   @rA   rf   rf   8  s#    ;U\\ ell rB   rf   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ConvBertIntermediatec                    t         |           |j                  dk(  r0t        j                  |j
                  |j                        | _        n1t        |j
                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y )Nr   r   r   r   )r&   r'   r   r   r   r   intermediate_sizer   rf   ra   
hidden_actstrr
   intermediate_act_fnr=   s     rA   r'   zConvBertIntermediate.__init__O  s    !6#5#5v7O7OPDJ+!--6;S;S`f`q`qDJ f''-'-f.?.?'@D$'-'8'8D$rB   r   rE   c                 J    | j                  |      }| j                  |      }|S r   )r   r   r>   r   s     rA   rO   zConvBertIntermediate.forward\  s&    

=100?rB   r   rW   s   @rA   r   r   N  s#    9U\\ ell rB   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ConvBertOutputc                    t         |           |j                  dk(  r0t        j                  |j
                  |j                        | _        n1t        |j
                  |j                  |j                        | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        y )Nr   r   r   )r&   r'   r   r   r   r   r   r   rf   r1   r2   r3   r4   r5   r=   s     rA   r'   zConvBertOutput.__init__c  s    !6#;#;V=O=OPDJ+!33ASAS`f`q`qDJ f&8&8f>S>STzz&"<"<=rB   r   r   rE   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rA   rO   zConvBertOutput.forwardn  r   rB   r   rW   s   @rA   r   r   b  s1    	>U\\  RWR^R^ rB   r   c                        e Zd Z fdZ	 	 	 	 ddej
                  dej                  dz  dej
                  dz  dej
                  dz  dedz  deej
                  ej                  dz  f   fd	Z	d
 Z
 xZS )ConvBertLayerc                 b   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r*| j                  st        |  d      t	        |      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is added)r&   r'   chunk_size_feed_forwardseq_len_dimr   	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr   intermediater   r   r=   s     rA   r'   zConvBertLayer.__init__v  s    '-'E'E$*62 ++#)#=#= ##??4&(f ghh"3F";D08$V,rB   Nr   r   r   encoder_attention_maskr   rE   c                 :   | j                  |||      }|d   }|dd  }| j                  r>|<t        | d      st        d|  d      | j	                  ||||      }	|	d   }||	dd  z   }t        | j                  | j                  | j                  |      }
|
f|z   }|S )N)r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r   r   rI   AttributeErrorr   r   feed_forward_chunkr   r   )r>   r   r   r   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputs              rA   rO   zConvBertLayer.forward  s     "&/ "0 "

 2!4(,??4@4!12$=dV DD D  '+&9&9 &%!	'#  7q9 7 ;;G0##T%A%A4CSCSUe
  /G+rB   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )r>   r   intermediate_outputr  s       rA   r   z ConvBertLayer.feed_forward_chunk  s,    "//0@A{{#68HIrB   )NNNF)rP   rQ   rR   r'   r7   r   rU   r   r   rO   r   rV   rW   s   @rA   r   r   u  s    -" 48596:).#||# ))D0#  %||d2	#
 !&t 3#  $;# 
u||U..55	6#JrB   r   c                        e Zd Z fdZ	 	 	 	 	 	 ddej
                  dej                  dz  dej
                  dz  dej
                  dz  dedz  dedz  d	edz  d
ee	z  fdZ
 xZS )ConvBertEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r&   r'   r?   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r>   r?   r   r@   s      rA   r'   zConvBertEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#Nr   r   r   r   r   output_hidden_statesreturn_dictrE   c                 t   |rdnd }|rdnd }	|r| j                   j                  rdnd }
t        | j                        D ]J  \  }}|r||fz   } ||||||      }|d   }|s"|	|d   fz   }	| j                   j                  sB|
|d   fz   }
L |r||fz   }|st	        d |||	|
fD              S t        |||	|
      S )N r   r   rs   c              3   $   K   | ]  }|| 
 y wr   r  ).0vs     rA   	<genexpr>z*ConvBertEncoder.forward.<locals>.<genexpr>  s      = s   )last_hidden_stater   
attentionscross_attentions)r?   r   	enumerater
  r   r   )r>   r   r   r   r   r   r  r  all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_outputss                 rA   rO   zConvBertEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d(4 	VOA|#$58H$H!(%&!M *!,M &9]1=M<O&O#;;22+?=QRCSBU+U(	V"   1]4D D '):<OQef  
 2++*1	
 	
rB   )NNNFFT)rP   rQ   rR   r'   r7   r   rU   r   r   r   rO   rV   rW   s   @rA   r  r    s    , 48596:).,1#',
||,
 ))D0,
  %||d2	,

 !&t 3,
  $;,
 #Tk,
 D[,
 
3	3,
rB   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ConvBertPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )r&   r'   r   r   r   r   ra   r   r   r
   transform_act_fnr1   r2   r=   s     rA   r'   z(ConvBertPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STrB   r   rE   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r!  r1   r   s     rA   rO   z'ConvBertPredictionHeadTransform.forward  s4    

=1--m<}5rB   r   rW   s   @rA   r  r    s$    UU\\ ell rB   r  c                        e Zd ZdZdef fdZ	 d	dej                  dej                  dz  dej                  fdZ	 xZ
S )
ConvBertSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`ConvBertConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r?   c                 f   t         |           t        |dd      | _        | j                  dk(  rt        t        j                         | _        t        |d      rq|j                  ret        |d      r(|j                  r|j                  dkD  r|j                  }n|j                  }t        j                  |j                  |      | _        t        |dd       }|rt        |      nt        j                         | _        t        j                         | _        t        |d      r3|j"                  dkD  r$t        j$                  |j"                        | _        t        j                         | _        t        |d	      r5|j(                  dkD  r%t        j$                  |j(                        | _        y y y )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r&   r'   getattrr&  NotImplementedErrorr   IdentitysummaryrI   r)  r*  
num_labelsr   r   r   
activationfirst_dropoutr,  r3   last_dropoutr-  )r>   r?   num_classesactivation_stringr@   s       rA   r'   z ConvBertSequenceSummary.__init__  sU   #FNFC& &%{{}6-.63J3Jv78V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]6238T8TWX8X!#F,H,H!IDKKM612v7R7RUV7V "

6+F+F GD 8W2rB   Nr   	cls_indexrE   c                    | j                   dk(  r|dddf   }n| j                   dk(  r|dddf   }n| j                   dk(  r|j                  d      }n| j                   d	k(  r|At        j                  |d
ddddf   |j                  d   dz
  t        j
                        }nX|j                  d      j                  d      }|j                  d|j                         dz
  z  |j                  d      fz         }|j                  d|      j                  d      }n| j                   dk(  rt        | j                        }| j                  |      }| j                  |      }| j!                  |      }|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r'  Nr!   firstr   r^   r   r   r8  .r   r$   )r!   r(  )r&  r^   r7   	full_likerk   r<   r   r9   r   r;   gathersqueezer/  r4  r1  r3  r5  )r>   r   r8  r   s       rA   rO   zConvBertSequenceSummary.forward.  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*rB   r   )rP   rQ   rR   rS   r   r'   r7   rU   rT   rO   rV   rW   s   @rA   r$  r$    sQ    2H~ H< VZ)"..);@;K;Kd;R)			)rB   r$  c                       e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e
dz  de
dz  de
dz  deez  fd       Z xZS )ConvBertModelc                 "   t         |   |       t        |      | _        |j                  |j
                  k7  r/t        j                  |j                  |j
                        | _        t        |      | _
        || _        | j                          y r   )r&   r'   r   rN   r*   r   r   r   embeddings_projectr  encoderr?   	post_initr=   s     rA   r'   zConvBertModel.__init__\  sl     ,V4  F$6$66&(ii0E0EvGYGY&ZD#&v.rB   c                 .    | j                   j                  S r   rN   r,   r>   s    rA   get_input_embeddingsz"ConvBertModel.get_input_embeddingsh  s    ...rB   c                 &    || j                   _        y r   rE  )r>   r   s     rA   set_input_embeddingsz"ConvBertModel.set_input_embeddingsk  s    */'rB   NrC   r   r#   r   rD   r   r  r  rE   c	                 T   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  |
|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  ||
      }| j                  ||||      }t        | d      r| j#                  |      }| j%                  |||||	      }|S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer!   z5You have to specify either input_ids or inputs_embeds)rH   r#   rG   )rC   r   r#   rD   rA  )r   r   r  r  )r?   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr;   rH   r7   onesrI   rN   r#   r9   r:   r<   get_extended_attention_maskrA  rB  )r>   rC   r   r#   r   rD   r   r  r  r~   rJ   r   rK   rH   rL   rM   extended_attention_maskr   s                     rA   rO   zConvBertModel.forwardn  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZFCN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z"&"B"B>S^"_l>iv ( 
 4-. 33MBM2/!5# % 
 rB   )NNNNNNNN)rP   rQ   rR   r'   rG  rI  r   r7   rT   rU   r   r   r   rO   rV   rW   s   @rA   r?  r?  Z  s    
/0  .237260426)-,0#':##d*: ))D0: ((4/	:
 &&-: ((4/:  $;: #Tk: D[: 
3	3: :rB   r?  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )ConvBertGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                     t         |           t        d      | _        t	        j
                  |j                  |j                        | _        t	        j                  |j                  |j                        | _
        y )Ngelur   )r&   r'   r   r3  r   r1   r*   r2   r   r   r   r=   s     rA   r'   z%ConvBertGeneratorPredictions.__init__  sV    (0f&;&;AVAVWYYv1163H3HI
rB   generator_hidden_statesrE   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r3  r1   )r>   rT  r   s      rA   rO   z$ConvBertGeneratorPredictions.forward  s3    

#:;6}5rB   )	rP   rQ   rR   rS   r'   r7   rU   rO   rV   rW   s   @rA   rQ  rQ    s+    KJu/@/@ UEVEV rB   rQ  c                   4    e Zd ZddiZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  deez  fd       Z xZS )ConvBertForMaskedLMzgenerator_lm_head.weightz*convbert.embeddings.word_embeddings.weightc                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  |j                        | _
        | j                          y r   )r&   r'   r?  rZ   rQ  generator_predictionsr   r   r*   r)   generator_lm_headrC  r=   s     rA   r'   zConvBertForMaskedLM.__init__  sR     %f-%A&%I"!#6+@+@&BSBS!TrB   c                     | j                   S r   rZ  rF  s    rA   get_output_embeddingsz)ConvBertForMaskedLM.get_output_embeddings  s    %%%rB   c                     || _         y r   r\  )r>   r,   s     rA   set_output_embeddingsz)ConvBertForMaskedLM.set_output_embeddings  s
    !0rB   NrC   r   r#   r   rD   labelsr   r  r  rE   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|Pt        j                         } ||j                  d| j                   j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r!   r   losslogitsr   r  )r?   rK  rZ   rY  rZ  r   r   r   r)   r   r   r  )r>   rC   r   r#   r   rD   r`  r   r  r  r~   rT  generator_sequence_outputprediction_scoresrc  loss_fctr   s                    rA   rO   zConvBertForMaskedLM.forward  s   ( &1%<k$++B]B]"&-- 	#
 %<A$>! 667PQ 223DE**,H-222t{{7M7MNPVP[P[\^P_`D'),CAB,GGF)-)9TGf$EvE$1??.99	
 	
rB   	NNNNNNNNN)rP   rQ   rR   _tied_weights_keysr'   r]  r_  r   r7   rT   rU   r   r   r   rO   rV   rW   s   @rA   rW  rW    s   46bc&1  .237260426*.)-,0#'3
##d*3
 ))D03
 ((4/	3

 &&-3
 ((4/3
   4'3
  $;3
 #Tk3
 D[3
 
	3
 3
rB   rW  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )ConvBertClassificationHeadz-Head for sentence-level classification tasks.c                 h   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        || _        y r   )r&   r'   r   r   r   r   classifier_dropoutr4   r3   r5   r2  out_projr?   r>   r?   rm  r@   s      rA   r'   z#ConvBertClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrB   r   rE   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )r5   r   r
   r?   r   rn  )r>   r   r~   r   s       rA   rO   z"ConvBertClassificationHead.forward  se    !Q'"LLOJJqM4;;))*1-LLOMM!rB   r   rW   s   @rA   rk  rk  	  s&    7	U\\  rB   rk  z
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
z  fd       Z xZS )!ConvBertForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        | j                          y r   )	r&   r'   r2  r?   r?  rZ   rk  
classifierrC  r=   s     rA   r'   z*ConvBertForSequenceClassification.__init__(  sH      ++%f-4V< 	rB   NrC   r   r#   r   rD   r`  r   r  r  rE   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r#   r   rD   r   r  r  r   r   
regressionsingle_label_classificationmulti_label_classificationr!   rb  )r?   rK  rZ   ru  problem_typer2  r%   r7   r<   r   r   r=  r   r   r   r   r   r  r>   rC   r   r#   r   rD   r`  r   r  r  r~   r   sequence_outputrd  rc  rg  r   s                    rA   rO   z)ConvBertForSequenceClassification.forward2  s   ( &1%<k$++B]B]--))%'/!5#   	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rB   rh  )rP   rQ   rR   r'   r   r7   rT   rU   r   r   r   rO   rV   rW   s   @rA   rs  rs  !  s      .237260426*.)-,0#'C
##d*C
 ))D0C
 ((4/	C

 &&-C
 ((4/C
   4'C
  $;C
 #TkC
 D[C
 
)	)C
 C
rB   rs  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
z  fd       Z xZS )ConvBertForMultipleChoicec                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  d      | _	        | j                          y )Nr   )r&   r'   r?  rZ   r$  sequence_summaryr   r   r   ru  rC  r=   s     rA   r'   z"ConvBertForMultipleChoice.__init__{  sM     %f- 7 ?))F$6$6: 	rB   NrC   r   r#   r   rD   r`  r   r  r  rE   c
           
      J   |	|	n| j                   j                  }	||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:


            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r!   r   rw  r   rb  )r?   rK  rk   r   r;   rZ   r  ru  r   r   r   r  )r>   rC   r   r#   r   rD   r`  r   r  r  r~   num_choicesr   r}  pooled_outputrd  reshaped_logitsrc  rg  r   s                       rA   rO   z!ConvBertForMultipleChoice.forward  s   Z &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 --))%'/!5#   	
 "!*--o>/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rB   rh  )rP   rQ   rR   r'   r   r7   rT   rU   r   r   r   rO   rV   rW   s   @rA   r  r  y  s      .237260426*.)-,0#'X
##d*X
 ))D0X
 ((4/	X

 &&-X
 ((4/X
   4'X
  $;X
 #TkX
 D[X
 
*	*X
 X
rB   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
z  fd       Z xZS )ConvBertForTokenClassificationc                 `   t         |   |       |j                  | _        t        |      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r   )r&   r'   r2  r?  rZ   rm  r4   r   r3   r5   r   r   ru  rC  ro  s      rA   r'   z'ConvBertForTokenClassification.__init__  s      ++%f-)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rB   NrC   r   r#   r   rD   r`  r   r  r  rE   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nrw  r   r!   r   rb  )r?   rK  rZ   r5   ru  r   r   r2  r   r   r  r|  s                    rA   rO   z&ConvBertForTokenClassification.forward  s    $ &1%<k$++B]B]--))%'/!5#   	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rB   rh  )rP   rQ   rR   r'   r   r7   rT   rU   r   r   r   rO   rV   rW   s   @rA   r  r    s      .237260426*.)-,0#'1
##d*1
 ))D01
 ((4/	1

 &&-1
 ((4/1
   4'1
  $;1
 #Tk1
 D[1
 
&	&1
 1
rB   r  c                   @    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
z  fd       Z xZS )ConvBertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r&   r'   r2  r?  rZ   r   r   r   
qa_outputsrC  r=   s     rA   r'   z%ConvBertForQuestionAnswering.__init__(  sS      ++%f-))F$6$68I8IJ 	rB   NrC   r   r#   r   rD   start_positionsend_positionsr   r  r  rE   c           
      &   |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nrw  r   r   r!   r   )ignore_indexrs   )rc  start_logits
end_logitsr   r  )r?   rK  rZ   r  splitr=  r   lenr;   clampr   r   r   r  )r>   rC   r   r#   r   rD   r  r  r   r  r  r~   r   r}  rd  r  r  
total_lossignored_indexrg  
start_lossend_lossr   s                          rA   rO   z$ConvBertForQuestionAnswering.forward2  s    &1%<k$++B]B]--))%'/!5#   	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rB   )
NNNNNNNNNN)rP   rQ   rR   r'   r   r7   rT   rU   r   r   r   rO   rV   rW   s   @rA   r  r  &  s     .2372604263715)-,0#'=
##d*=
 ))D0=
 ((4/	=

 &&-=
 ((4/=
 ))D0=
 ''$.=
  $;=
 #Tk=
 D[=
 
-	-=
 =
rB   r  )rW  r  r  rs  r  r   r?  rY   )<rS   r   collections.abcr   r7   r   torch.nnr   r   r    r	   rc   activationsr
   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   configuration_convbertr   
get_loggerrP   loggerModuler   rY   rb   r   r   r   rf   r   r   r   r  r  r$  r?  rQ  rW  rk  rs  r  r  r  __all__r  rB   rA   <module>r     s'     $   A A & 1 9  . 6 3 
		H	%7 7t /o / /&bii 4yBII yx 		 0 ,299 (RYY &7. 7t3
bii 3
lbii $`bii `F N+ N Nb299 $ G
1 G
 G
T 0 O
(? O
O
d d
 7 d
 d
N A
%< A
 A
H I
#: I
 I
X	rB   