
    iW                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/  e+j`                  e1      Z2d Z3 G d dejh                        Z5 G d dejh                        Z6 G d de      Z7 G d dejh                        Z8e* G d de%             Z9e e*d !       G d" d#e)                    Z:e* G d$ d%e9             Z; e*d&!       G d' d(e9e             Z< e*d)!       G d* d+e9e             Z= e*d,!       G d- d.e9             Z>e* G d/ d0e9             Z?e* G d1 d2e9             Z@g d3ZAy)4zPyTorch OpenAI GPT-2 model.    N)Callable)	dataclass)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNget_activation)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Conv1D)ModelOutputauto_docstringlogging)maybe_autocast   )
GPT2Configc                 8   t        j                  ||j                  dd            }| j                  rB|t        j                  g |j                  d      dz  |j                  |j                        z  }| j                  r|t        | j                  dz         z  }|||z   }t        j                  j                  |d      }|j                  |j                        }| j                  |      }t        j                  ||      }|j                  dd      }||fS )N      ?dtypedevicer    dim   )torchmatmul	transposescale_attn_weightsfullsizer'   r(   scale_attn_by_inverse_layer_idxfloat	layer_idxr   
functionalsoftmaxtypeattn_dropout)modulequerykeyvalueattention_maskkwargsattn_weightsattn_outputs           b/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_gpt2.pyeager_attention_forwardrB   4   s    <<s}}R'<=L  #ejj

2#%\-?-?H[H['
 

 --#eF,<,<q,@&AA!#n4==((2(>L  $$U[[1L&&|4L,,|U3K''1-K$$    c                   0    e Zd Zd fd	ZddZ	 	 	 	 	 	 ddeej                     dz  dedz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  deej                  eej                     z  df   fdZ xZS )GPT2AttentionNc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      |j                  | _
        || _        |j                  | _        || _        |j                  | _        | j                  rNt        d| j                  z  | j                        | _        t        | j                  | j                        | _        n(t        d| j                  z  | j                        | _        t        | j                  | j                        | _        t'        j(                  |j*                        | _        t'        j(                  |j.                        | _        | | _        y )Nz=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r+   r	   )super__init__confighidden_size	embed_dimnum_attention_heads	num_headshead_dim
split_size
ValueErrorr/   is_cross_attentionr2   r4   reorder_and_upcast_attnr   c_attnq_attnc_projr   Dropout
attn_pdropr8   resid_pdropresid_dropout	is_causal)selfrI   rQ   r4   	__class__s       rA   rH   zGPT2Attention.__init__Q   sz   ++33$..8..==4>>)T^^;OPTP^P^O_ `NN#2' 
 #)";";"4 06/U/U,"'-'E'E$"" T^^!3T^^DDK @DK T^^!3T^^DDKT^^T^^<JJv'8'89ZZ(:(:;//rC   c                 &   |j                         \  }}}}|j                         \  }	}	}
}	t        j                  ||z  ||
t        j                  |j                        }d}| j
                  r |t        |j                  d            dz  z  }| j                  r|t        | j                  dz         z  }t        |j                  j                  d      5  |j                  d||      |j                  dd      j                  d||
      }}t        j                  ||j                         |j                         d	|
      }|j                  ||||
      }d d d        |||z   }t        j                  j!                  |d      }|j"                  t        j                  k7  rt%        d      |j                  |j"                        }| j'                  |      }t        j(                  ||      }|j                  dd      }||fS # 1 sw Y   xY w)Nr&   g      ?r#   r%   r    F)enabledr$   r   )betaalphar)   zDError with upcasting, attn_weights does not have dtype torch.float32r+   )r1   r,   emptyfloat32r(   r/   r3   r2   r4   r   r7   reshaper.   baddbmmr   r5   r6   r'   RuntimeErrorr8   r-   )r[   r:   r;   r<   r=   bszrM   	q_seq_lendk_	k_seq_lenr?   scale_factorqkr@   s                   rA   _upcast_and_reordered_attnz(GPT2Attention._upcast_and_reordered_attnq   s   (-

%Y	2 XXZ1i {{3?IyPUP]P]fkfrfrs ""E%**R.1S88L//E$..1"455L ELL--u= 	V==Y3S]]2r5J5R5RSUWY[d5eqA ==qwwy!'')RS[ghL'//Y	9UL	V
 %'.8L}},,\r,B .eff#((5((6ll<7!++Aq1L(()	V 	Vs   BHHhidden_statespast_key_valuescache_positionr=   encoder_hidden_statesencoder_attention_maskoutput_attentionsreturn.c                    |d u}	|St        |t              rA|j                  j                  | j                        }
|	r|j
                  }n|j                  }n|}|	rt        | d      st        d      | j                  |      }|}|J
rHj                  | j                     j                  }|j                  | j                     j                  }n$| j                  |      j                  | j                  d      \  }}g |j                   d d d| j"                  }|j%                  |      j'                  dd      }|j%                  |      j'                  dd      }n| j                  |      j                  | j                  d      \  }}}g |j                   d d d| j"                  }|j%                  |      j'                  dd      }|j%                  |      j'                  dd      }g |j                   d d d| j"                  }|j%                  |      j'                  dd      }||	r|H|	rF
sD|	s|nd }j)                  ||| j                  d|i      \  }}|	rd|j                  | j                  <   | j*                  j,                  d	k(  }t/        j0                  | j*                  j,                  t2              }|r$| j4                  r| j7                  ||||      \  }}n4 || ||||fd
| j8                  r| j:                  j<                  ndi|\  }} |j>                  g |j                   d d d jA                         }| jC                  |      }| jE                  |      }||fS )NrT   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`.r+   r)   r#   r    rq   Teagerdropout        r$   )#
isinstancer   
is_updatedgetr4   cross_attention_cacheself_attention_cachehasattrrP   rT   layerskeysvaluesrS   splitrO   shaperN   viewr.   updaterI   _attn_implementationr   get_interfacerB   rR   rn   trainingr8   prc   
contiguousrU   rY   )r[   ro   rp   rq   r=   rr   rs   rt   r>   rQ   r{   curr_past_key_valuesquery_states
key_statesvalue_statesshape_kvshape_qusing_eagerattention_interfacer@   r?   s                        rA   forwardzGPT2Attention.forward   s    3$>&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$4* p   ;;}5L3N *z188HMM
3::4>>JQQ+/;;7L+M+S+STXTcTcij+S+k(
LFZ--cr2FBFF'__X6@@AF
+00:DDQJ59[[5O5U5UVZVeVekl5U5m2L*lB))#2.BBDMMBH#2<<QBJ',,X6@@AFL?L&&s+?R??#((1;;AqA'0B',>z 4F^4N';'B'BL$..;K^:\($J "=A**4>>:kk66'A(?(M(MKK,,.E)
 477(,(G(Gj,)%K )<) 04}}))++#) )%K *k))F;+<+<Sb+AF2FQQSkk+.((5L((rC   )FNN)NNNNNF)__name__
__module____qualname__rH   rn   tupler,   FloatTensorr   
LongTensorTensorboolr   __classcell__r\   s   @rA   rE   rE   P   s    0@%)T )-263759;?).V)U../$6V) V) ((4/	V)
 ))D0V)  %||d2V) !& 1 1D 8V)  $;V) 
u||eELL1136	7V)rC   rE   c                   b     e Zd Z fdZdeej                     dz  dej                  fdZ xZS )GPT2MLPc                     t         |           |j                  }t        ||      | _        t        ||      | _        t        |j                     | _        t        j                  |j                        | _        y r   )rG   rH   rJ   r   c_fcrU   r   activation_functionactr   rV   rX   rx   )r[   intermediate_sizerI   rK   r\   s       rA   rH   zGPT2MLP.__init__   s_    &&	,i8	Y(9:&445zz&"4"45rC   ro   Nru   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   rU   rx   )r[   ro   s     rA   r   zGPT2MLP.forward   s@    		-0/M2]3rC   )	r   r   r   rH   r   r,   r   r   r   r   s   @rA   r   r      s1    6U5+<+<%=%D IZIZ rC   r   c                   X    e Zd Zd fd	Z	 	 	 	 	 	 	 ddeej                     dz  dedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  deej                     eej                  eej                  df   f   z  dz  fdZ xZS )	GPT2BlockNc                    t         |           |j                  }|j                  |j                  nd|z  }t	        j
                  ||j                        | _        t        ||      | _	        t	        j
                  ||j                        | _
        |j                  r9t        |d|      | _        t	        j
                  ||j                        | _        t        ||      | _        y )N   eps)rI   r4   T)rI   rQ   r4   )rG   rH   rJ   n_innerr   	LayerNormlayer_norm_epsilonln_1rE   attnln_2add_cross_attentioncrossattentionln_cross_attnr   mlp)r[   rI   r4   rJ   	inner_dimr\   s        rA   rH   zGPT2Block.__init__  s    ((&,nn&@FNNa+o	LL&2K2KL	!9E	LL&2K2KL	%%"/vRVbk"lD!#kv?X?X!YD9f-rC   ro   rp   rq   r=   rr   rs   	use_cachert   ru   .c	           	         |}
| j                  |      } | j                  |f|||||d|	\  }}||
z   }|Mt        | d      st        d|  d      |}
| j	                  |      }| j                  ||||||      \  }}|
|z   }|}
| j                  |      }| j                  |      }|
|z   }|f}|r||fz  }||fz  }|S )N)rp   rq   r=   r   rt   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)rp   r=   rr   rs   rt   )r   r   r   rP   r   r   r   r   )r[   ro   rp   rq   r=   rr   rs   r   rt   r>   residualr@   self_attn_weightscross_attn_outputcross_attn_weightsfeed_forward_hidden_statesoutputss                    rA   r   zGPT2Block.forward  s>    !		-0)2*
+))/*
 *
&& $h. ,4!12 =dV DZ Z  %H ..}=M484G4G /-&;'="3 5H 511 %'88M 		-0%)XXm%<" #== ")++G$0.00rC   r   )NNNNNFF)r   r   r   rH   r   r,   r   r   r   r   r   r   r   r   s   @rA   r   r     s    .$ )-263759;?!&).:U../$6: : ((4/	:
 ))D0:  %||d2: !& 1 1D 8: $;:  $;: 
u||	uU\\59J9JC9O3P%PQ	QTX	X:rC   r   c                        e Zd ZdZdef fdZ	 d	dej                  dej                  dz  dej                  fdZ	 xZ
S )
GPT2SequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`GPT2Config`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    rI   c                 f   t         |           t        |dd      | _        | j                  dk(  rt        t        j                         | _        t        |d      rq|j                  ret        |d      r(|j                  r|j                  dkD  r|j                  }n|j                  }t        j                  |j                  |      | _        t        |dd       }|rt        |      nt        j                         | _        t        j                         | _        t        |d      r3|j"                  dkD  r$t        j$                  |j"                        | _        t        j                         | _        t        |d	      r5|j(                  dkD  r%t        j$                  |j(                        | _        y y y )
Nsummary_typelastr   summary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)rG   rH   getattrr   NotImplementedErrorr   Identitysummaryr   r   r   
num_labelsrJ   Linearr   
activationfirst_dropoutr   rV   last_dropoutr   )r[   rI   num_classesactivation_stringr\   s       rA   rH   zGPT2SequenceSummary.__init__j  sU   #FNFC& &%{{}6-.63J3Jv78V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]6238T8TWX8X!#F,H,H!IDKKM612v7R7RUV7V "

6+F+F GD 8W2rC   Nro   	cls_indexru   c                    | j                   dk(  r|dddf   }n| j                   dk(  r|dddf   }n| j                   dk(  r|j                  d      }n| j                   d	k(  r|At        j                  |d
ddddf   |j                  d   dz
  t        j
                        }nX|j                  d      j                  d      }|j                  d|j                         dz
  z  |j                  d      fz         }|j                  d|      j                  d      }n| j                   dk(  rt        | j                        }| j                  |      }| j                  |      }| j!                  |      }|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r   Nr#   firstr   meanr    r)   r   .r$   )r'   r#   r   )r   r   r,   	full_liker   long	unsqueezeexpandr*   r1   gathersqueezer   r   r   r   r   )r[   ro   r   outputs       rA   r   zGPT2SequenceSummary.forward  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*rC   r   )r   r   r   __doc__r!   rH   r,   r   r   r   r   r   s   @rA   r   r   P  sQ    2Hz H< VZ)"..);@;K;Kd;R)			)rC   r   c                   p    e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZdZddgZ ej                         d        Zy	)
GPT2PreTrainedModelrI   transformerTr   rp   z	attn.biaszcrossattention.biasc           
      
   t        |t        j                  t        f      rct	        j
                  |j                  d| j                  j                         |j                  t	        j                  |j                         nt        |t        j                        rt	        j
                  |j                  d| j                  j                         |j                  t        |j                  dd      st	        j                  |j                  |j                            nXt        |t        j                        r>t	        j                  |j                         t	        j                  |j                         t        |t               rx|j#                         D ]d  \  }}|dk(  st	        j
                  |d| j                  j                  t%        j&                  d| j                  j(                  z        z         f yy)zInitialize the weights.ry   )r   stdN_is_hf_initializedFzc_proj.weightr+   )rz   r   r   r   initnormal_weightrI   initializer_rangebiaszeros_	Embeddingpadding_idxr   r   ones_r   named_parametersmathsqrtn_layer)r[   r9   namer   s       rA   _init_weightsz!GPT2PreTrainedModel._init_weights  sV    fryy&12LLSdkk6S6ST{{&FKK(-LLSdkk6S6ST!!-gfmmMach6iFMM&*<*<=>-KK$JJv}}% fo.!224 va?*LL$++2O2ORVR[R[\]`d`k`k`s`s\sRt2tuv /rC   N)r   r   r   r!   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_attention_backend_can_compile_fullgraph"_keys_to_ignore_on_load_unexpectedr,   no_gradr    rC   rA   r   r     sc    %&*#$"3N"&! +67L)M&U]]_v vrC   r   z^
    Base class for outputs of models predicting if two sentences are consecutive or not.
    )custom_introc                   "   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZedz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed	<   y)
GPT2DoubleHeadsModelOutputa\  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
        Multiple choice classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
        Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    Nlossmc_losslogits	mc_logitsrp   ro   
attentions)r   r   r   r   r  r,   r   r   r  r	  r
  rp   r   ro   r   r  r  rC   rA   r  r    s      &*D%

d
")(,GU%,'+FE$+*.Iu  4'.$(OUT\(59M5**+d2926Je''(4/6rC   r  c                       e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  de	dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  deez  fd       Z xZS )	GPT2Modelc           	         t         |   |       |j                  | _        t	        j
                  |j                  | j                        | _        t	        j
                  |j                  | j                        | _	        t	        j                  |j                        | _        t	        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t	        j$                  | j                  |j&                        | _        d| _        |j,                  | _        | j/                          y c c}w )N)r4   r   F)rG   rH   rJ   rK   r   r   
vocab_sizewtemax_position_embeddingswperV   
embd_pdropdrop
ModuleListrangenum_hidden_layersr   hr   r   ln_fgradient_checkpointingr   	post_init)r[   rI   ir\   s      rA   rH   zGPT2Model.__init__  s     ++<< 1 14>>B<< > >OJJv001	fNfNfHgh1	&A >hiLLV5N5NO	&+#$*$?$?! 	  is   
Ec                     | j                   S r   r  )r[   s    rA   get_input_embeddingszGPT2Model.get_input_embeddings  s    xxrC   c                     || _         y r   r  )r[   new_embeddingss     rA   set_input_embeddingszGPT2Model.set_input_embeddings  s	    !rC   N	input_idsrp   rq   r=   token_type_idsposition_idsinputs_embedsrr   rs   r   rt   output_hidden_statesreturn_dictru   c           	         ||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
||n| j                   j                  }||t        d      |G| j                  ||       |j                         }|j                  d|d         }|j                  d   }n0|#|j                         dd }|j                  d   }nt        d      ||j                  d|d         }| j                  r%| j                  r|
rt        j                  d       d}
|
r^|t        | j                         }| j                   j                  r0t!        |t"              s t#        |t        | j                               }|| j%                  |      }|F||j'                         nd}t)        j*                  |||j                  d	   z   |j,                  
      }||j/                  d      }| j1                  |      }||j3                  |j,                        z   }|!|j4                  dk  r|j                  |d      }t7        | j                   |||||      }d}	|t9        | j                   ||	|      }	|| j%                  |      }||z   }| j;                  |      }d|d	d z   |j                  d      fz   }|rdnd}|r| j                   j                  rdnd}|rdnd}t=        | j>                        D ]j  \  }}|r||fz   } ||| j                  r| j                  s|nd|||f|	|
||d|}|d   }|sB||d	   fz   }| j                   j                  sb||d   fz   }l | jA                  |      }|j                  |      }|r||fz   }|
r|nd}|stC        d |||||fD              S tE        |||||      S )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer#   r   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rI   r    r(   r   )rI   r&  r=   rq   rp   r%  )rI   r&  r=   rr   r   r  )rs   r   rt   r%  r+   c              3   $   K   | ]  }|| 
 y wr   r  ).0vs     rA   	<genexpr>z$GPT2Model.forward.<locals>.<genexpr>  s      = s   )last_hidden_staterp   ro   r  cross_attentions)#rI   rt   r'  r   use_return_dictrP   %warn_if_padding_and_no_attention_maskr1   r   r   r  r   loggerwarning_oncer   r   rz   r   r  get_seq_lengthr,   aranger(   r   r  tondimr   r   r  	enumerater  r  r   r   )r[   r#  rp   rq   r=   r$  r%  r&  rr   rs   r   rt   r'  r(  r>   input_shape
batch_sizepast_seen_tokensposition_embedsro   causal_masktoken_type_embedsoutput_shapeall_self_attentionsall_cross_attentionsall_hidden_statesr  blockr   s                                rA   r   zGPT2Model.forward  sL   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"66y.Q#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%+00[_EN&&4==##p "	 &".dkk"B{{..z/Sf7g"5o|[_[f[fGg"h  HHY/M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L((<0%(:(:=;O;O(PP %.*=*=*A+00R@N(;;'))+%
 "& ,%>{{+5&;	&" % $ 8),==M		-0{12.-2D2DR2H1JJ$5b4%64;;;Z;Zr`d"6BD!$&&) 	PHAu#$58H$H!(,(C(C\`% (>#"3) G $AJM &9WQZM&I#;;22+?71:-+O(-	P0 		-0%**<8 1]4D D-6/D ':KM`bvw   9+++*1
 	
rC   )NNNNNNNNNNNNN)r   r   r   rH   r  r"  r   r,   r   r   r   r   r   r   r   r   r   r   s   @rA   r  r    sY   $"  .2(,263726042659;?!%)-,0#'a
##d*a
 a
 ((4/	a

 ))D0a
 ((4/a
 &&-a
 ((4/a
  %||d2a
 !& 1 1D 8a
 $;a
  $;a
 #Tka
 D[a
  
:	:!a
 a
rC   r  z
    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c            #           e Zd ZddiZ fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  deej                  z  deez  f d       Z xZS )GPT2LMHeadModellm_head.weighttransformer.wte.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFr   )
rG   rH   r  r   r   r   n_embdr  lm_headr  r[   rI   r\   s     rA   rH   zGPT2LMHeadModel.__init__  sG     $V,yy0A0AN 	rC   Nr#  rp   rq   r=   r$  r%  r&  rr   rs   labelsr   rt   r'  r(  logits_to_keepru   c                    ||n| j                   j                  }| j                  |||||||||	||||      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|
* | j                  ||
fd| j                   j                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )ai  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)rp   r=   rq   r$  r%  r&  rr   rs   r   rt   r'  r(  r   r  r    )r  r	  rp   ro   r  r1  )rI   r2  r   rz   intslicerN  loss_functionr  r   rp   ro   r  r1  )r[   r#  rp   rq   r=   r$  r%  r&  rr   rs   rP  r   rt   r'  r(  rQ  r>   transformer_outputsro   slice_indicesr	  r  r   s                          rA   r   zGPT2LMHeadModel.forward  sA   L &1%<k$++B]B]"..+)))%'"7#9/!5# / 
 ,A.8B>SV8W~ot4]kmA}a,?@A%4%%  ;;11 	D Y!4QR!88F)-)9TGf$EvE0/??-;;*550AA
 	
rC   )NNNNNNNNNNNNNNr   )r   r   r   _tied_weights_keysrH   r   r,   r   r   r   r   r   rS  r   r   r   r   r   s   @rA   rG  rG    s    +,DE  .2(,263726042659;?*.!%)-,0#'-.!P
##d*P
 P
 ((4/	P

 ))D0P
 ((4/P
 &&-P
 ((4/P
  %||d2P
 !& 1 1D 8P
   4'P
 $;P
  $;P
 #TkP
 D[P
  ell*!P
$ 
2	2%P
 P
rC   rG  a  
        The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
    RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
    input embeddings, the classification head takes as input the input of a specified classification token index in the
    input sequence).
    c            !           e Zd ZddiZ fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  de
dz  de
dz  de
dz  deez  fd       Z xZS )GPT2DoubleHeadsModelrH  rI  c                     t         |   |       d|_        t        |      | _        t        j                  |j                  |j                  d      | _	        t        |      | _        | j                          y )Nr    FrL  )rG   rH   r   r  r   r   r   rM  r  rN  r   multiple_choice_headr  rO  s     rA   rH   zGPT2DoubleHeadsModel.__init__.  s\     $V,yy0A0AN$7$?! 	rC   Nr#  rp   rq   r=   r$  r%  r&  mc_token_idsrP  	mc_labelsr   rt   r'  r(  ru   c                 $   ||n| j                   j                  }| j                  |||||||||||      }|d   }| j                  |      }| j	                  ||      j                  d      }d}|
At               } ||j                  d|j                  d            |
j                  d            }d}|	|	j                  |j                        }	|dddddf   j                         }|	dddf   j                         }t               } ||j                  d|j                  d            |j                  d            }|s||f|dd z   }||f|z   }||f|z   S |S t        |||||j                  |j                  |j                        S )af  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
            Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -
            1]`.
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
            `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`
        mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
            where *num_choices* is the size of the second dimension of the input tensors. (see *input_ids* above)

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, GPT2DoubleHeadsModel

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")

        >>> # Add a [CLS] to the vocabulary (we should train it also!)
        >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
        >>> # Update the model embeddings with the new vocabulary size
        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))

        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

        >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
        >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1

        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
        >>> lm_logits = outputs.logits
        >>> mc_logits = outputs.mc_logits
        ```N)
rp   rq   r=   r$  r%  r&  r   rt   r'  r(  r   r#   .r    )r  r  r	  r
  rp   ro   r  )rI   r2  r   rN  r\  r   r   r   r1   r8  r(   r   r  rp   ro   r  )r[   r#  rp   rq   r=   r$  r%  r&  r]  rP  r^  r   rt   r'  r(  r>   rV  ro   	lm_logitsr
  r  loss_fctlm_lossshift_logitsshift_labelsr   s                             rA   r   zGPT2DoubleHeadsModel.forward8  s   H &1%<k$++B]B]"..+)))%'/!5# / 
 ,A.LL/	--m\JRRSUV	 ')Hy~~b)..2DEy~~VXGYZGYYy//0F$S#2#q[1<<>L!#qr'?557L')H|00\5F5Fr5JK\M^M^_aMbcG+.A!".EEF"!f,,3,?WJ'KVK)/??-;;*55
 	
rC   )NNNNNNNNNNNNNN)r   r   r   rX  rH   r   r,   r   r   r   r   r   r  r   r   r   s   @rA   rZ  rZ  #  sx    +,DE  .2(,263726042604*.-1!%)-,0#'r
##d*r
 r
 ((4/	r

 ))D0r
 ((4/r
 &&-r
 ((4/r
 &&-r
   4'r
 ##d*r
 $;r
  $;r
 #Tkr
 D[r
" 
+	+#r
 r
rC   rZ  a  
    The GPT2 Model transformer with a sequence classification head on top (linear layer).

    [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   8    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	dz  de	dz  de	dz  de	dz  de
ez  fd       Z xZS )GPT2ForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y rK  )
rG   rH   r   r  r   r   r   rM  scorer  rO  s     rA   rH   z&GPT2ForSequenceClassification.__init__  sR      ++$V,YYv}}dooEJ
 	rC   Nr#  rp   r=   r$  r%  r&  rP  r   rt   r'  r(  ru   c                    ||n| j                   j                  }| j                  ||||||||	|
|
      }|d   }| j                  |      }||j                  dd \  }}n|j                  dd \  }}| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d	       |t        j                  ||j                  
      |f   }d}|| j                   j"                  | j$                  dk(  rd| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  dk(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                   j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                   j"                  dk(  rt5               } |||      }|s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                        S )aB  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N	rp   r=   r$  r%  r&  r   rt   r'  r(  r   r+   r    z=Cannot handle batch sizes > 1 if no padding token is defined.r#   )r(   r'   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r+  
regressionsingle_label_classificationmulti_label_classification)r  r	  rp   ro   r  )rI   r2  r   rh  r   pad_token_idrP   r8  r(   r,   int32r7  argmaxr4  r5  r\   r   problem_typer   r'   r   rS  r   r   r   r   r   r   rp   ro   r  )r[   r#  rp   r=   r$  r%  r&  rP  r   rt   r'  r(  r>   rV  ro   r	  r<  sequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  ra  r   s                            rA   r   z%GPT2ForSequenceClassification.forward  s   D &1%<k$++B]B]"..+))%'/!5# / 
 ,A.M* *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
rC   NNNNNNNNNNN)r   r   r   rH   r   r,   r   r   r   r   r   r   r   r   r   s   @rA   rf  rf    s     .2(,37260426*.!%)-,0#'i
##d*i
 i
 ))D0	i

 ((4/i
 &&-i
 ((4/i
   4'i
 $;i
  $;i
 #Tki
 D[i
 
1	1i
 i
rC   rf  c                   8    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	dz  de	dz  de	dz  de	dz  de
ez  fd       Z xZS )GPT2ForTokenClassificationc                    t         |   |       |j                  | _        t        |      | _        t        |d      r|j                  |j                  }n't        |d      r|j                  |j                  }nd}t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropouthidden_dropoutg?)rG   rH   r   r  r   r   r{  r|  r   rV   rx   r   rJ   
classifierr  )r[   rI   r{  r\   s      rA   rH   z#GPT2ForTokenClassification.__init__5  s      ++$V,6/0V5N5N5Z!'!:!:V-.63H3H3T!'!6!6!$zz"45))F$6$68I8IJ 	rC   Nr#  rp   r=   r$  r%  r&  rP  r   rt   r'  r(  ru   c                    ||n| j                   j                  }| j                  ||||||||	|
|
      }|d   }| j                  |      }| j	                  |      }d}|W|j                  |j                        }t               } ||j                  d| j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )aR  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nrj  r   r#   r+   )r  r	  ro   r  )rI   r2  r   rx   r}  r8  r(   r   r   r   r   ro   r  )r[   r#  rp   r=   r$  r%  r&  rP  r   rt   r'  r(  r>   rV  ro   r	  r  ra  r   s                      rA   r   z"GPT2ForTokenClassification.forwardF  s   D &1%<k$++B]B]"..+))%'/!5# / 
 ,A.]3/YYv}}-F')HFKKDOO<fkk"oNDY!4QR!88F)-)9TGf$EvE$-;;*55	
 	
rC   rw  )r   r   r   rH   r   r,   r   r   r   r   r   r   r   r   r   s   @rA   ry  ry  3  s   "  .2(,37260426*.!%)-,0#'C
##d*C
 C
 ))D0	C

 ((4/C
 &&-C
 ((4/C
   4'C
 $;C
  $;C
 #TkC
 D[C
 
&	&C
 C
rC   ry  c                   @    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
z  fd       Z xZS )GPT2ForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  d      | _        | j                          y )Nr+   )
rG   rH   r   r  r   r   r   rJ   
qa_outputsr  rO  s     rA   rH   z!GPT2ForQuestionAnswering.__init__  sN      ++$V,))F$6$6: 	rC   Nr#  r=   r$  r%  r&  start_positionsend_positionsrt   r'  r(  ru   c           
         |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r*|j                  d      j                  |j                        }t        |j                               dkD  r*|j                  d      j                  |j                        }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
r*  N)r=   r$  r%  r&  rt   r'  r(  r   r    r#   r)   )ignore_indexr+   )r  start_logits
end_logitsro   r  )rI   r2  r   r  r   r   r   lenr1   r8  r(   clampr   r   ro   r  )r[   r#  r=   r$  r%  r&  r  r  rt   r'  r(  r>   r   sequence_outputr	  r  r  
total_lossignored_indexra  
start_lossend_lossr   s                          rA   r   z GPT2ForQuestionAnswering.forward  s   : &1%<k$++B]B]""))%'/!5# # 	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""="@"@ATAT"U=%%'(1, - 5 5b 9 < <Z=N=N O(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rC   )
NNNNNNNNNN)r   r   r   rH   r   r,   r   r   r   r   r   r   r   r   s   @rA   r  r    s     .2372604263715)-,0#'K
##d*K
 ))D0K
 ((4/	K

 &&-K
 ((4/K
 ))D0K
 ''$.K
  $;K
 #TkK
 D[K
 
-	-K
 K
rC   r  )rZ  r  rf  ry  rG  r  r   )Br   r   collections.abcr   dataclassesr   r,   r   torch.nnr   r   r    r
   r   activationsr   r   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   pytorch_utilsr   utilsr   r   r   utils.genericr   configuration_gpt2r!   
get_loggerr   r4  rB   ModulerE   r   r   r   r   r  r  rG  rZ  rf  ry  r  __all__r  rC   rA   <module>r     s   "  $ !   A A & 1 C C ) J 9  G # 
 , * 
		H	%%8^)BII ^)Bbii "J* J\`")) `F (v/ (v (vV 
7 7 74 {
# {
 {
| \
)? \
\
~ @
. @
@
F t
$7 t
t
n V
!4 V
 V
r V
2 V
 V
rrC   