
    謜iQ5                         d Z ddlZddlmZ 	 ddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZmZmZ  ej&                  e      Zd	d
iZdZ ee       G d de
             Z G d d      Zy# e$ r dZY bw xY w)zT
SentencePiece-based tokenization class for loading from sentencepiece.model files.
    N)copyfile   )import_protobuf)PreTrainedTokenizer)INIT_TOKENIZER_DOCSTRING
AddedTokengenerate_merges)add_end_docstringsloggingrequires_backends
vocab_fileztokenizer.modelu   ▁c                       e Zd ZdZeZ fdZedefd       Z	d Z
ddee   ee   z  dedefdZdd
ee   d	z  fdZd Zd Zd Zdee   defdZddeded	z  dee   fdZ	 	 	 ddeee   z  deded	z  dedef
 fdZ xZS )SentencePieceBackendaJ  
    Base class for SentencePiece-based tokenizers that load from sentencepiece.model files.

    Inherits from [`~tokenization_utils.PreTrainedTokenizer`].

    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading
    pretrained tokenizers as well as adding tokens to the vocabulary.

    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    c                    t        | d       |j                  d      | _        |j                  dd      | _        |j	                  di       | _        d|vrd|d<   t        j                  di | j
                  }|j                  | j                         | j                  syt               }|j                  j                  |j                               }|j                  j                  r0d|j                  _        |j                  |j!                                || _        | j"                  j%                         | _        | j
                  |d<   t)        | T  di | | j-                          y )	Nsentencepiecer   legacyTsp_model_kwargsbackendF )r   getr   r   popr   spmSentencePieceProcessorLoadr   
ModelProto
FromStringserialized_model_protonormalizer_specadd_dummy_prefixLoadFromSerializedProtoSerializeToStringsp_modelget_piece_sizetotal_vocab_sizesuper__init___update_trie)selfkwargs	tokenizer	model_pb2proto	__class__s        i/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/tokenization_utils_sentencepiece.pyr&   zSentencePieceBackend.__init__<   s5   $0 !**\2jj40%zz*;R@ F" /F9 ..F1E1EF	t'{{')I((33I4T4T4VWE$$559>%%611%2I2I2KL! !% < < > %)$8$8 !
 	"6"    returnc                 6    | j                   j                         S )zReturns vocab size)r"   r#   )r(   s    r.   
vocab_sizezSentencePieceBackend.vocab_sizec   s     }}++--r/   c                     t        | j                        D ci c]  }| j                  |      | }}|j                  | j                         |S c c}w )zReturns vocab as a dict)ranger2   convert_ids_to_tokensupdateadded_tokens_encoder)r(   ivocabs      r.   	get_vocabzSentencePieceBackend.get_vocabh   sK    ;@;QRa++A.1RRT../ Ss   A
new_tokensspecial_tokensc           	         |syt        |       }d}|D ]  }t        |t        t        f      st	        d| dt        |       d      t        |      dk(  rDt        |t              r3|| j                  v rc|| j                  v xs |}t        |dd| |      }n |r|j                  d|j                  d	       || j                  j                         v r|j                  s8|j                  r,t        | d
d      r|j                  j                         |_        | j                   j#                  |j                        }|| j                   j%                         k  xr( | j                   j'                  |      |j                  k(  }|r|}	n|}	|dz  }|dz  }|j                  r2t        |      | j                  vr| j(                  j+                  |       || j                  |	<   |	| j                  |j                  <   | j,                  st.        j1                  d| d        | j3                          | j5                          |S )a  
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from length of the current vocabulary. Special tokens are sometimes already in the
        vocab which is why they have to be handled specifically.

        Args:
            new_tokens (`list[str]`or `list[tokenizers.AddedToken]`):
                Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary
                (tested by checking if the tokenizer assign the index of the `unk_token` to them). If a token is part
                of the vocabulary then we simply mark this token as an `AddedToken` which allows to control the
                stripping and normalization of this token. This is NOT possible in `tokenizers`.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            `int`: The number of tokens actually added to the vocabulary.

        Examples:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        model = BertModel.from_pretrained("google-bert/bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```r   zToken z is not a string but a . F)rstriplstrip
normalizedspecialT)rC   rB   do_lower_caser   zAdding z to the vocabulary)len
isinstancestrr   	TypeErrortype_added_tokens_encoderall_special_tokens__setstate__rB   _added_tokens_decodervaluesrC   getattrcontentlowerr"   piece_to_idr#   	IdToPiece_extra_special_tokensappendverboseloggerinfor'   _update_total_vocab_size)
r(   r;   r<   
next_index	num_addedtoken
is_specialtok_idin_base_vocabtoken_indexs
             r.   _add_tokensz SentencePieceBackend._add_tokensn   s   < Y
	 '	AEec:%67&/FtE{mST UVV5zR%%D666"d&=&==O
"5uU_Q_ist ""t5CSCS#TU2299;;==U%5%5'$Y^:_ % 3 3 5 ]]..u}}=F5577lDMM<S<STZ<[_d_l_l<l  $(a
Q	}}U43J3J!J**11%86;D&&{38CD&&u}}5||geW,>?@O'	AR 	%%'r/   Nunique_no_split_tokensc                    | j                   j                         D ]J  }|j                  | j                  j                  vs&| j                  j                  |j                         L | j                  D ]6  }|| j                  j                  vs| j                  j                  |       8 |xs g D ]6  }|| j                  j                  vs| j                  j                  |       8 y N)rM   rN   rP   tokens_trie_tokensaddrK   )r(   rb   r\   s      r.   r'   z!SentencePieceBackend._update_trie   s    //668 	4E}}D$4$4$<$<<  $$U]]3	4 ,, 	,ED,,444  $$U+	, ,1r 	,ED,,444  $$U+	,r/   c                 ~   | j                   s|j                  t        df      s!| j                  j	                  |t
              S | j                  j	                  | j                  |z   t
              }t        | j                  j	                  t        | j                                    }t        |      |k\  r||d S |S )u(  
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         )out_typeN)r   
startswithSPIECE_UNDERLINEr"   encoderG   	unk_tokenrE   )r(   textr)   tokensunk_token_lengths        r.   	_tokenizezSentencePieceBackend._tokenize   s     ;;doo/?.EF==''s';; %%dnnt&;c%Jt}}33C4GHI,/K;K,Kv&'(WQWWr/   c                 8    | j                   j                  |      S )z0Converts a token (str) to an id using the vocab.)r"   rR   )r(   r\   s     r.   _convert_token_to_idz)SentencePieceBackend._convert_token_to_id   s    }}((//r/   c                 <    | j                   j                  |      }|S )z=Converts an index (integer) in a token (str) using the vocab.)r"   rS   )r(   indexr\   s      r.   _convert_id_to_tokenz)SentencePieceBackend._convert_id_to_token   s    ''.r/   rp   c                 l    dj                  |      j                  t        d      j                         }|S )z:Converts a sequence of tokens (string) in a single string.r?   ri   )joinreplacerl   strip)r(   rp   
out_strings      r.   convert_tokens_to_stringz-SentencePieceBackend.convert_tokens_to_string   s,    WWV_,,-=sCIIK
r/   save_directoryfilename_prefixc                    t         j                  j                  |      st        j	                  d| d       yt         j                  j                  ||r|dz   nd| j                  d   z         }t         j                  j                  | j                        t         j                  j                  |      k7  rBt         j                  j                  | j                        rt        | j                  |       |fS t         j                  j                  | j                        sCt        |d      5 }| j                  j                         }|j                  |       ddd       |fS |fS # 1 sw Y   |fS xY w)a  
        Save the sentencepiece vocabulary (copy original file) to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `tuple(str)`: Paths to the files saved.
        zVocabulary path (z) should be a directoryN-r?   r   wb)ospathisdirrW   errorry   vocab_files_namesabspathr   isfiler   openr"   r   write)r(   r~   r   out_vocab_fileficontent_spiece_models         r.   save_vocabularyz$SentencePieceBackend.save_vocabulary   s-    ww}}^,LL,^,<<STUo_s22QUQgQghtQuu
 77??4??+rww~/NNSUSZSZSaSabfbqbqSrT__n5    0nd+ /r'+}}'K'K'M$-./     	/   s   1,E++E6	token_idsskip_special_tokensclean_up_tokenization_spacesspaces_between_special_tokensc                 *    t        |   d|||d|S )z
        Decode token ids to string.

        Uses the generic decode path from PreTrainedTokenizer which works for all vocabularies,
        including custom vocabularies that override _convert_id_to_token.
        )r   r   r   r   )r%   _decode)r(   r   r   r   r   r)   r-   s         r.   r   zSentencePieceBackend._decode
  s.     w 
 3)E
 	
 	
r/   )Frd   )FNF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESr   r&   propertyintr2   r:   listrG   r   boolra   r'   rr   rt   rw   r}   tupler   r   __classcell__)r-   s   @r.   r   r   ,   s   
 *%N .C . .Nd3i$z2B&B NTX Neh N`,49t3C ,X&0
tCy S 
!c !C$J !Z_`cZd !@ %*48.3
c?
 "
 '+Tk	

 (,
 

 
r/   r   c                   V    e Zd ZdZdefdZddeeeef   e	eee
f      e	e   f   fdZy)SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 v    t        | d       ddlm}  |       | _        | j                  j	                  |       y )Nr   r   )r   )r   r   r   spr   )r(   r   r   s      r.   r&   zSentencePieceExtractor.__init__&  s)    $08(*Ur/   Nr0   c                    | j                   }t        |j                               D ci c]  }|j                  |      | }}t        |j                               D ci c]#  }|j                  |      |j	                  |      % }}t        ||      }t        |j                               D cg c]$  }|j                  |      |j	                  |      f& }}|||fS c c}w c c}w c c}w )z
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        )r   r4   GetPieceSizeid_to_piece	get_scorer	   )	r(   vocab_scoresr   rv   	vocab_idsr8   vocab_scores_dictmergesvocab_scores_lists	            r.   extractzSentencePieceExtractor.extract-  s    
 WW?DR__EV?WXeR^^E*E1X	XINrO`IabAR^^A.Q?bb ,=>KPQSQ`Q`QbKcdabnnQ/aAdd+V33 Yb es   C(C$/)C)rd   )r   r   r   r   rG   r&   r   dictr   r   floatr   r   r/   r.   r   r   !  sI    c 4E$sCx.$uSRWZGXBY[_`e[f2f,g 4r/   r   )r   r   shutilr   r   r   ImportErrorconvert_slow_tokenizerr   tokenization_pythonr   tokenization_utils_baser   r   r	   utilsr
   r   r   
get_loggerr   rW   r   rl   r   r   r   r/   r.   <module>r      s    
  4 4 
 B A 
		H	%!#45   ,-q
. q
 .q
h4 4S  
Cs   A+ +A54A5