
"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)Iterable)copyfile)Any)is_offline_mode)
AddedToken
processors)Encoding)	Tokenizer)Decoder)BPEUnigram)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainercached_file   )convert_gguf_tokenizer)load_gguf_checkpoint)INIT_TOKENIZER_DOCSTRINGBatchEncodingPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategygenerate_merges)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
"""

MODEL_TO_TRAINER_MAPPING = {
    "BPE": BpeTrainer,
    "Unigram": UnigramTrainer,
    "WordLevel": WordLevelTrainer,
    "WordPiece": WordPieceTrainer,
}

VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE, "vocab_file": TIKTOKEN_VOCAB_FILE}
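

# Illustrative sketch, not part of this module's API: the `tokenizer_object` argument documented
# above expects a plain `tokenizers.Tokenizer`. A minimal in-memory one can be assembled like this,
# assuming only the `tokenizers` objects imported above.
def _example_tokenizer_object() -> TokenizerFast:
    backend = TokenizerFast(BPE(unk_token="[UNK]"))
    backend.pre_tokenizer = pre_tokenizers_fast.Whitespace()
    backend.add_special_tokens(["[UNK]", "[PAD]"])
    return backend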
    e Zd ZdZeZdZdZedLd       Z	 fdZ
edefd       Zedefd       ZdMd	ed
edz  dee   fdZd Zed        Zed        Zej*                  d        Zej*                  d        Zd Zedefd       Zdeeef   fdZedeeef   fd       Zedeeef   fd       Zedeeef   fd       ZeZeZ deeef   fdZ!defdZ"defdZ#ede$fd       Z%ede&fd       Z'	 	 	 	 	 	 	 dNde(dedz  dedz  d ed!ed"ed#ed$edeeee)f   e*e(   f   fd%Z+d&edefd'Z,d(ededz  fd)Z-dLd*e*eez     defd+Z.dLd,edefd-Z/dLd.ee*e   z  d/edee*e   z  fd0Z0dOd1ed,edz  d2ede*e   fd3Z1d4e2d5e3d6ed7ed8edz  d9edz  fd:Z4dde2jj                  e3jl                  dd;ddddddddddddfd1e7e8z  e*e7   z  e*e8   z  d<e7e8z  e*e7   z  e*e8   z  dz  d2ed4e2d5e3d6edz  d7ed=ed8edz  d9edz  d>edz  dedz  dedz  d ed!ed"ed#ed$ed?edz  de9f(d@Z:dAe*e   defdBZ;	 	 dPdCee*e   z  d/edDedz  defdEZ<	 	 dQd	ee=j|                  z  dFeedGf   dHedz  d
edz  deedGf   f
dIZ?	 	 	 dRdJZ@e	 	 	 	 	 	 	 dSdK       ZA xZBS )TTokenizersBackendaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model = None

    @classmethod
    def convert_to_native_format(cls, **kwargs):
        """
        Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
        models, tekken.json, vocab/merges).
        """
        ...

    def __init__(self, *args, **kwargs):
        ...

    @property
    def is_fast(self) -> bool:
        return True

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """
        `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        """
        if "vocab_file" in self.vocab_files_names and self.vocab_files_names["vocab_file"].endswith(".model"):
            if hasattr(self, "vocab_file") and self.vocab_file:
                return os.path.isfile(self.vocab_file)
            return False
        return True

    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        return (out_vocab_file,)

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            self._add_eos_token = False
            return

        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
        pair = f"{single} {(bos + ':1 ') if self.add_bos_token else ''}$B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"

        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        return getattr(self, "_add_eos_token", False)

    @add_eos_token.setter
    def add_eos_token(self, value):
        object.__setattr__(self, "_add_eos_token", value)
        self.update_post_processor()

    @property
    def add_bos_token(self):
        return getattr(self, "_add_bos_token", False)

    @add_bos_token.setter
    def add_bos_token(self, value):
        object.__setattr__(self, "_add_bos_token", value)
        self.update_post_processor()

    def _post_init(self):
        """
        Post-initialization hook that runs after the tokenizer is fully set up.
        This is called by from_pretrained() after loading the tokenizer, which allows
        us to add any special tokens that may have been passed as AddedToken objects.

        Child classes should call super()._post_init() if they override this method.
        """
        ...

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> dict[str, int]:
        return self.get_vocab()

    @property
    def added_tokens_encoder(self) -> dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[int, AddedToken]`: The added tokens.
        """
        return self._tokenizer.get_added_tokens_decoder()

    def get_added_vocab(self) -> dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    def __bool__(self) -> bool:
        """
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        """
        return True

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder
    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: bool | None = None,
        return_attention_mask: bool | None = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> tuple[dict[str, Any], list[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)
            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> str | None:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: list[str | AddedToken], special_tokens: bool = False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)
        return self._tokenizer.add_tokens(new_tokens)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)

    def convert_ids_to_tokens(self, ids: int | list[int], skip_special_tokens: bool = False) -> str | list[str]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        ids_to_skip = set(self.all_special_ids) if skip_special_tokens else set()
        for index in ids:
            index = int(index)
            if index in ids_to_skip:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: str | None = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
        return self._encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()

    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: int | None,
        padding_side: str | None,
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding

        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}
            if current != target:
                self._tokenizer.enable_truncation(**target)

        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": padding_side if padding_side is not None else self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)

    def _encode_plus(
        self,
        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
        text_pair: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: int | None = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: int | None = None,
        padding_side: str | None = None,
        return_tensors: str | None = None,
        return_token_type_ids: bool | None = None,
        return_attention_mask: bool | None = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool | None = None,
    ) -> BatchEncoding:
        ...

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return (
            self.backend_tokenizer.decoder.decode(tokens)
            if self.backend_tokenizer.decoder is not None
            else " ".join(tokens)
        )

    def _decode(
        self,
        token_ids: int | list[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool | None = None,
        **kwargs,
    ) -> str:
        kwargs.pop("use_source_tokenizer", None)

        if isinstance(token_ids, int):
            token_ids = [token_ids]
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
                text = self.clean_up_tokenization(text)
            else:
                text = (
                    text.replace(" .", ".")
                    .replace(" ?", "?")
                    .replace(" !", "!")
                    .replace(" ,", ",")
                    .replace(" ' ", "'")
                    .replace(" n't", "n't")
                    .replace(" 'm", "'m")
                    .replace(" 's", "'s")
                    .replace(" 've", "'ve")
                    .replace(" 're", "'re")
                )
        return text

    def _save_pretrained(
        self,
        save_directory: str | os.PathLike,
        file_names: tuple[str, ...],
        legacy_format: bool | None = None,
        filename_prefix: str | None = None,
    ) -> tuple[str, ...]:
        save_directory = str(save_directory)
        tokenizer_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
        )
        self.backend_tokenizer.save(tokenizer_file)
        file_names = file_names + (tokenizer_file,)
        return file_names
    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking.
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.
        """
        ...

    @classmethod
    def _patch_mistral_regex(cls, *args, **kwargs):
        """
        Patches mistral related tokenizers with incorrect regex if detected
            1) Local file with an associated config saved next to it
                >> Model type one of the mistral models (on older versions)
            2) Remote models on the hub from official mistral models
                >> Tags including `base_model:.*mistralai`
        """
        ...
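

# Usage sketch, not executed at import time (assumptions: a serialized "tokenizer.json" exists at
# `path` and it defines the padding and special tokens used below).
def _example_usage(path: str = "tokenizer.json") -> None:
    tok = TokenizersBackend(tokenizer_file=path)
    batch = tok(
        ["Hello world", "A slightly longer sentence that forces padding"],
        padding=True,
        truncation=True,
        max_length=16,
    )
    print(batch["input_ids"])
    print(tok.convert_ids_to_tokens(batch["input_ids"][0]))
    print(tok.decode(batch["input_ids"][0], skip_special_tokens=True))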


PreTrainedTokenizerFast = TokenizersBackend
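

# Training sketch for `train_new_from_iterator` (assumption: `base` is any instance of the class
# above, e.g. obtained through `AutoTokenizer.from_pretrained`); the call returns a new tokenizer
# of the same type whose vocabulary is learned from the iterated batches of text.
def _example_retrain(base: PreTrainedTokenizerFast, corpus: list[str]) -> PreTrainedTokenizerFast:
    batches = (corpus[i : i + 64] for i in range(0, len(corpus), 64))
    return base.train_new_from_iterator(batches, vocab_size=8000, length=len(corpus))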