
    謜i#                     J   d dl Z d dlZd dlmZmZ d dlmZ d dlZd dlm	Z	 d dl
mZ ddlmZ ddlmZ dd	lmZmZ d
dlmZmZmZmZ  ej0                  e      Z e ej8                               Z ed eD              Ze G d d             Z  G d de      Z! G d de      Z"y)    N)	dataclassfield)Enum)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)check_torch_load_is_safelogging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc              #   4   K   | ]  }|j                     y wN)
model_type).0confs     \/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/data/datasets/squad.py	<genexpr>r   !   s     EDOOEs   c                      e Zd ZU dZ eddddj                  e      z   i      Zee	d<    edddi      Z
ee	d	<    ed
ddi      Zee	d<    ed
ddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    eddd i      Zee	d!<    ed"dd#i      Zee	d$<   y)%SquadDataTrainingArgumentszb
    Arguments pertaining to what data we are going to input our model for training and eval.
    Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads)__name__
__module____qualname____doc__r   joinMODEL_TYPESr   str__annotations__r   r    intr!   r#   r%   r&   boolr'   r(   floatr*   r+   r-        r   r   r   $   s    (KdiiXcNd(deJ  (pqHc   Q
NC  rsJ  "/
c  #J
s  ")\ ]OT  %*)o p%T  (-v'rs(u  f&qrK  C
GS  f6k-lmGSmr:   r   c                       e Zd ZdZdZy)SplittraindevN)r.   r/   r0   r=   r>   r9   r:   r   r<   r<   g   s    E
Cr:   r<   c                       e Zd ZU eed<   ee   ed<   eed<   eed<   dej                  dddfdede
d	edz  deez  ded
edz  defdZd Zdeeej"                  f   fdZy)SquadDatasetargsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                    || _         || _        |j                  r
t               n	t	               | _        t        |t              r
	 t        |   }|| _
        |j                  rdnd}t        j                  j                  ||n|j                  d|j                   d|j                   j"                   d|j$                   d|       }	|	dz   }
t'        |
      5  t        j                  j)                  |	      r|j*                  st-        j,                         }t/                t1        j2                  |	d      | _        | j4                  d	   | _        | j4                  j9                  d
d       | _        | j4                  j9                  dd       | _        t>        jA                  d|	 dt-        j,                         |z
         | j:                  | j<                  dt>        jC                  d|	 d       nI|t        jD                  k(  r+| j
                  jG                  |j                        | _        n*| j
                  jI                  |j                        | _        tK        | j<                  ||j$                  |jL                  |jN                  |t        jP                  k(  |jR                  |      \  | _        | _        t-        j,                         }t1        jT                  | j6                  | j:                  | j<                  d|	       t>        jA                  d|	 dt-        j,                         |z
  dd       d d d        y # t        $ r t        d      w xY w# 1 sw Y   y xY w)Nzmode is not a valid split namev2v1cached__z.lockT)weights_onlyrB   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rQ   rF   r    r!   r#   is_trainingr-   return_dataset)rB   rP   rQ   z!Saving features into cached file z [took z.3fz s])+rA   rD   r'   r   r   	processor
isinstancer4   r<   KeyErrorrC   ospathr2   r   value	__class__r.   r    r   existsr&   timer   torchloadold_featuresrB   getrP   rQ   loggerinfowarningr>   get_dev_examplesget_train_examplesr   r!   r#   r=   r-   save)selfrA   rF   rG   rC   rD   rH   rI   version_tagcached_features_file	lock_pathstarts               r   __init__zSquadDataset.__init__r   s    	%:"/3/K/K)+QaQcdC AT{ 	"::d!ww||".IDMMdjj\9#6#6#?#?"@$BUBUAVVWXcWde 
 )72	i  -	ww~~23D<P<P		(*$)JJ/CRV$W! !% 1 1* =#0044YE $ 1 1 5 5j$ G89M8Nn]_c_h_h_jmr_r <<'4==+@NN/0D/E F& &
 599$$(NN$C$CDMM$RDM$(NN$E$Edmm$TDM.P!]]'#'#6#6#%)%:%: $ 3 LL#1	/+t| 		

!%4<<UYUbUbc(
 78L7MWUYU^U^U`chUhilTmmpqW-	 -	  A?@@A-	 -	s   	M I(M M M)c                 ,    t        | j                        S r   )lenrB   )rg   s    r   __len__zSquadDataset.__len__   s    4==!!r:   returnc                 (   | j                   |   }t        j                  |j                  t        j                        }t        j                  |j
                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }|||d}	| j                  j                  dv r|	d= | j                  j                  dv r|	j                  ||d       | j                  j                  r|	j                  d|i       | j                  rW|	j                  dt        j                   |j"                  t        j$                        | j                  j&                  z  i       | j(                  t*        j,                  k(  rrt        j                  |j.                  t        j                        }
t        j                  |j0                  t        j                        }|	j                  |
|d	       |	S )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertru   )xlnetrv   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)rB   r]   tensorrs   longrt   ru   r{   r|   r8   r}   rA   r   updater'   rD   onesshapeint64r+   rC   r<   r=   start_positionend_position)rg   ifeaturers   rt   ru   r{   r|   r}   inputsr   r   s               r   __getitem__zSquadDataset.__getitem__   s   --"LL!2!2%**E	g&<&<EJJOg&<&<EJJOLL!2!2%**E	gnnEKK@W%:%:%++N #,,
 99#PP'(99#33MM	VDEyy00>?))wIOO5;;)WZ^ZcZcZkZk)kmn99##ll7+A+ATO!LL)=)=UZZPMMMoP]^_r:   )r.   r/   r0   r   r5   listr   r<   r7   r=   r
   r6   r4   rl   ro   dictr]   Tensorr   r9   r:   r   r@   r@   l   s    
$$=!!
K $(!KK&+ $"J(J 'J Dj	J
 EkJ  $J :J JX" S%,,%6 7  r:   r@   )#rW   r\   dataclassesr   r   enumr   r]   filelockr   torch.utils.datar   models.auto.modeling_autor	   tokenization_pythonr
   utilsr   r   processors.squadr   r   r   r   
get_loggerr.   ra   r   keysMODEL_CONFIG_CLASSEStupler3   r   r<   r@   r9   r:   r   <module>r      s    
  (    $ M 6 6 t t 
		H	%E@EEGH E0DEE ?n ?n ?nDD 
u7 ur:   