
    *i$                         d Z ddlZddlmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ  ej                  e      Zdd	iZd
 Z G d de      ZdgZy)z"Tokenization classes for Splinter.    N)	Tokenizerdecodersnormalizerspre_tokenizers
processors)	WordPiece   )TokenizersBackend)logging
vocab_filez	vocab.txtc                     t        j                         }t        | dd      5 }|j                         }d d d        t	              D ]  \  }}|j                  d      }|||<    |S # 1 sw Y   4xY w)Nrzutf-8)encoding
)collectionsOrderedDictopen	readlines	enumeraterstrip)r   vocabreadertokensindextokens         n/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/splinter/tokenization_splinter.py
load_vocabr      sw    ##%E	j#	0 $F!!#$!&) uT"e L$ $s   A''A0c                        e Zd ZdZeZddgZeZ	 	 	 	 	 	 	 	 	 	 dde	e
e	ef   z  dz  dede	de	d	e	d
e	de	de	dededz  f fdZed        Zd Z xZS )SplinterTokenizera  
    Construct a Splinter tokenizer (backed by HuggingFace's tokenizers library). Based on WordPiece.

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to a vocabulary file.
        tokenizer_file (`str`, *optional*):
            Path to a tokenizers JSON file containing the serialization of a tokenizer.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values.
        question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
            The token used for constructing question representations.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase`.
        vocab (`str`, `dict` or `list`, *optional*):
            Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
    	input_idsattention_maskNr   do_lower_case	unk_token	sep_token	pad_token	cls_token
mask_tokenquestion_tokentokenize_chinese_charsstrip_accentsc                    ||nEt        |      dt        |      dt        |      dt        |      dt        |      dt        |      dddi| _        t        t        | j                  t        |      	            | _        t        j                  d
|	|
|      | j                  _        t        j                         | j                  _
        t        j                  d      | j                  _        t        | 8  d||||||||	|
d	| || _        |	| _        |
| _        || _        | j$                  | j&                  vr| j)                  | j$                  gd
       | j+                          y )Nr         r	         .   )r#   T)
clean_texthandle_chinese_charsr*   	lowercasez##)prefix)	r#   r$   r%   r&   r'   r(   r"   r)   r*   )special_tokens )str_vocabr   r   
_tokenizerr   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizerr   decodersuper__init__r"   r)   r*   r(   all_special_tokens
add_tokensupdate_post_processor)selfr   r"   r#   r$   r%   r&   r'   r(   r)   r*   kwargs	__class__s               r   rA   zSplinterTokenizer.__init__Q   sZ        IIIIJN#QQ 	 $IdkkS^$TU%0%?%?!7'#	&
" )7(G(G(I%"*"4"4D"A 	
!)'#9'	
 	
 +&<#*,d&=&==OOT001$OG""$    c                 8    | j                  | j                        S )N)convert_tokens_to_idsr(   )rE   s    r   question_token_idz#SplinterTokenizer.question_token_id   s    ))$*=*=>>rH   c           
         | j                   }| j                  }| j                  }d}| j                  }| j                  }| j
                  }| j                  d      }||y | j                  dk(  r| d| d| d| d| d
}	n| d| d| d| d| d
}	t        j                  | d| d|	||f||f||f||fg      | j                  _        y )	Nr0   rightz:0 $A:0  z:0 $B:1 z:1z:0)singlepairr6   )r&   r$   r(   cls_token_idsep_token_idrK   rJ   padding_sider   TemplateProcessingr:   post_processor)
rE   clssepquestiondotrQ   rR   rK   dot_token_idrP   s
             r   rD   z'SplinterTokenizer.update_post_processor   s   nnnn&&(((( 2211#6;#+'U(8*AcU!C5RHDU(3%xz3%qRHD)3)F)FU(3%r*l#l#,-l#		*
&rH   )
NTz[UNK]z[SEP]z[PAD]z[CLS]z[MASK]z
[QUESTION]TN)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr8   dictintboolrA   propertyrK   rD   __classcell__)rG   s   @r   r   r   )   s    !F *$&67E .2"    "*'+%):%T#s(^#d*:% :% 	:%
 :% :% :% :% :% !%:% d{:%x ? ?
rH   r   )r^   r   
tokenizersr   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr
   utilsr   
get_loggerr[   loggerr_   r   r   __all__r7   rH   r   <module>ro      sZ    )  S S ' >  
		H	%!;/ C
) C
L 
rH   