
    笜il&                    N   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZ ddlmZ d	d
lmZmZmZmZ d	dlmZ  ej8                  e      Zg dZeg dz   Z ddZ!de"de#fdZ$ddee#   dz  fdZ% G d d      Z& G d de&      Z'de#de"fdZ( G d d      Z) G d de)      Z* G d de)      Z+ G d  d!e)      Z, G d" d#e)      Z- G d$ d%e)      Z. G d& d'e)      Z/ G d( d)e)      Z0 G d* d+e)      Z1 G d, d-e)      Z2 G d. d/e)      Z3 G d0 d1e)      Z4 G d2 d3e)      Z5 G d4 d5e5      Z6 G d6 d7e5      Z7 G d8 d9e5      Z8 G d: d;e5      Z9 G d< d=e5      Z: G d> d?e5      Z; G d@ dAe5      Z< G dB dCe5      Z= G dD dEe5      Z> G dF dGe5      Z? G dH dIe5      Z@ G dJ dKe5      ZA G dL dMe5      ZB G dN dOe5      ZC G dP dQe5      ZD G dR dSe5      ZE G dT dUe)      ZF G dV dWe5      ZG G dX dYe)      ZH G dZ d[e)      ZI G d\ d]e)      ZJ G d^ d_e5      ZK G d` dae5      ZL G db dce5      ZM G dd dee)      ZN G df dge5      ZO G dh die5      ZP G dj dke5      ZQdl ZR G dm dn      ZS G do dp      ZTi dqe6dre2dse7dte*dueGdveJdwe8dxeHdye/dze*d{e4d|e9d}e*d~e*de*de*de*i de6de,de/de0de*de*de2de>de2de2de*deNde:de;de-de*de2i de<de.deCde1de@deAde2de3de=de*deDdeEdeFde>de?de+deKeMeMeLeMdZUddefdZVy)z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)
Collection)	lru_cache)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece)tqdm   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR)ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CN)af_ZAaz_AZbn_INfa_IRhe_ILhr_HRid_IDka_GEkm_KHmk_MKml_INmn_MNmr_INpl_PLps_AFpt_XXsv_SEsw_KEta_INte_INth_THtl_XXuk_UAur_PKxh_ZAgl_ESsl_SIc                    t               rddlm} |S t               rSdd l}t        j                  |j                  j                        t        j                  d      k  rddl	m} |S ddl	m
} |S t        t        j                  |             )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecerL   r   google.protobufr   parseprotobuf__version__transformers.utilsrM   ImportErrorr   format)error_messagerL   googles      _/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufrY   _   sl    !#9&&==445g8NNB '& b&&/66}EFF    add_prefix_spacereturnc                 4    | rd}t        |dd      sd}|S d}|S )NalwayslegacyTfirstnever)getattr)r[   original_tokenizerprepend_schemes      rX   _get_prepend_schemere   p   s1    !)8T:$N  !rZ   skip_tokensc                     |t        |      n	t               }|d u}|rt        |      n }g }|j                         D ]x  \  }}||v rg }t        dt	        |            D ]3  }|d | ||d  }
}	|	|v s|
|v r|	 v s|
 v s |j                  |	|
|f       5 t        | fd      }|j                  |       z t        |d |      }|D cg c]  }|d   |d   f }}|S c c}w )Nr   c                 $    | d      | d      fS Nr   r    )xvocabs    rX   <lambda>z!generate_merges.<locals>.<lambda>   s    U1Q4[%!+,F rZ   keyc                 B    | d   t        | d         t        | d         fS )N   r   r   )lenvals    rX   rm   z!generate_merges.<locals>.<lambda>   s!    SVSQ[#c!f+,N rZ   ro   reverser   )setdictitemsrangerr   appendsortedextend)rl   vocab_scoresrf   rv   mergesmergepiece_scorelocalindexpiece_lpiece_rrt   s   `           rX   generate_mergesr   z   s   &1&=#k"35K$&G)04%eLF*002 {K1c%j) 	>E$Ve}eEFmWG+%K)?%Gu$4gw<=	> u"FGe F NX_`F*013s1vs1v1F1M 2s   C!c                   B    e Zd ZdZdefdZdeeeef   e	e   f   fdZ
y)SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                    t        | d       t        | d       t               }|j                         }t        |d      5 }|j	                  |j                                d d d        || _        y # 1 sw Y   || _        y xY w)NrN   rQ   rb)r   rY   
ModelProtoopenParseFromStringreadproto)selfr   	model_pb2mfs        rX   __init__zSentencePieceExtractor.__init__   sl    $0$
+ $%	  "% 	(!affh'	(
	(
s    A//A?r\   c           
      L   | j                   j                  j                   |/ddlm}m} | j                   j                  j                  dk(  r|n|}| j                   j                  D cg c]  }|j                  |j                  f }}|j                  dk7  r)| j                   j                  j                  |d<   ||d<   n;ddlm} t        |      D 	
ci c]  \  }\  }	}
|	| }}	}}
 ||      }||d<   ||d<   t        | j                   j                        D cg c]0  \  }}|j                  d	v s||j                  |j                  d
k(  f2 }}}t        |d       D cg c]  \  }}}t!        |d|       c}}}|d<   |S c c}w c c}
}	}w c c}}w c c}}}w )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        r   )r   r   r   r   unk_idrl   )r   r         r   c                     | d   S Nr   rj   rk   s    rX   rm   z0SentencePieceExtractor.extract.<locals>.<lambda>   s
    QqT rZ   rn   F
normalizedspecialadditional_special_tokens)r   trainer_specr   tokenizers.modelsr   r   
model_typepiecespiecescore__name__tokenization_utils_baser   	enumeratetyper|   r   )r   r   kwargsr   r   r   rl   r   iwordr   r   idpspm_added_tokenstokenr   s                    rX   extractzSentencePieceExtractor.extract   s   
 	

&&6$(JJ$;$;$F$F!$KQTJ9=9J9JK%++u{{+KK%'#zz66==F8#F7O@5>u5EFF!1MT5T1WFEF$U+F#F7O%F8 ENdjjN_N_D`u52qdedjdjntdtR!&&A+6uu '--=>&R/
 /
"E7 u@/
*+ - L G v/
s   *FF F7F*FN)r   
__module____qualname____doc__strr   tuplerx   intlistr   rj   rZ   rX   r   r      s5    
c 
 uT#s(^T%[5P/Q  rZ   r   c                   4    e Zd Zddeeeef   ee   f   fdZy)GemmaSentencePieceExtractorNr\   c                     | j                   }t        |j                               D ci c]  }|j                  |      | }}d|vr|j	                  d      |d<   t        ||      }||fS c c}w )r   	<0x09>)sprz   GetPieceSizeid_to_piecegetr   )r   r~   r   r   rl   r   s         rX   r   z#GemmaSentencePieceExtractor.extract   sr    
 WW;@AR;ST%&-TT u))H-E$K 5f} Us   A+N)	r   r   r   r   rx   r   r   r   r   rj   rZ   rX   r   r      s$    E$sCx.$u+2M,N rZ   r   r   c                 ^    t        |       dk  xs | d   dk7  xs | d   j                          S )Nrq   ,)rr   isdigit)r   s    rX   check_number_commar      s3    u:>HU2Y#-HU2Y5F5F5H1HHrZ   c                       e Zd Zd ZdefdZy)	Converterc                     || _         y r   )rc   )r   rc   s     rX   r   zConverter.__init__   s
    "4rZ   r\   c                     t               r   )NotImplementedErrorr   s    rX   	convertedzConverter.converted   s    !##rZ   N)r   r   r   r   r   r   rj   rZ   rX   r   r      s    5$9 $rZ   r   c                       e Zd ZdefdZy)BertConverterr\   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixrc   rl   r   r   r   r   hasattrr   tokenize_chinese_charsr   do_lower_caser
   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr	   decoder
r   rl   	tokenizerr   r   r   clssepr   r   s
             rX   r   zBertConverter.converted      ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	rZ   Nr   r   r   r   r   rj   rZ   rX   r   r          #9 #rZ   r   c                       e Zd ZdefdZy)SplinterConverterr\   c           
         | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }t	        | j                   j&                        }d}	| j                   j(                  }
| j                   j*                  }| j                   j,                  }| j                   j/                  d      }| j                   j0                  dk(  r| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}t3        j4                  | d| d|||
f||f||f|	|fg      |_        t9        j                  d      |_        |S )Nr   Fr   Tr   .rightr    r   r   r   r   r   r   )rc   rl   r   r   r   r   r   r   r   r   r   r
   r   r   r   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r	   r   )r   rl   r   r   r   r   r   r   questiondotr   r   r  dot_token_idr   s                  rX   r   zSplinterConverter.converted  s"   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334t..==>..;;..;; 33EE..DDSI""//7:U(8*AcU!C5RHDU(3%xz3%qRHD#-#@#@U(3%r*l#l#,-l#		$
	  %..d;	rZ   Nr   rj   rZ   rX   r   r   
  s    .9 .rZ   r   c                       e Zd ZdefdZy)FunnelConverterr\   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )Nr   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   s
             rX   r   zFunnelConverter.converted=  r   rZ   Nr   rj   rZ   rX   r  r  <  r   rZ   r  c                       e Zd ZdefdZy)MPNetConverterr\   c                 r   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	| d
||f||	fg      |_        t1        j                  d      |_        |S )Nr   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   s
             rX   r   zMPNetConverter.convertedd  s   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5SXcU"=l#l#$
	  %..d;	rZ   Nr   rj   rZ   rX   r  r  c  r   rZ   r  c                       e Zd ZdefdZy)OpenAIGPTConverterr\   c           
         | j                   j                  }t        | j                   j                  j	                               }| j                   j
                  }t        t        ||d t        |      dd            }|j                  t        |            |j                  t        |      g       t        j                  d      |_        t        j                         |_        t#        j$                  d      |_        |S )N</w>F)rl   r   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)rc   encoderr   	bpe_rankskeysr   r   r   r   token_to_idadd_special_tokensr
   r   r   r   r   r   r	   
BPEDecoderr   r   rl   r   r   r   s        rX   r   zOpenAIGPTConverter.converted  s    ''//d--77<<>?++55	i.#)	
	   Y0<((#i.)9:*99DI	"0"A"A"C	$//v>	rZ   Nr   rj   rZ   rX   r  r    s    9 rZ   r  c                   H    e Zd Zddeeef   dz  deeeef      dz  defdZ	y)GPT2ConverterNrl   r   r\   c           
      N   |s| j                   j                  }|st        | j                   j                        }t	        t        ||d ddd            }t        | j                   dd      }t        j                  |      |_	        t        j                         |_        t        | j                   dd      rT| j                   j                  }| j                   j                  }t        j                  | d| d||fg	      |_        |S t        j                  d
      |_        |S )N Frl   r   r  continuing_subword_prefixr  r  r[   r[   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)rc   r  r   r  r   r   rb   r   	ByteLevelr   r	   r   	bos_tokenbos_token_idr   r   r   )r   rl   r   r   r[   bosr.  s          rX   r   zGPT2Converter.converted  s   ++33E$11;;<F*,#%	
	 #4#:#:<NPUV"0":":L\"]	$..0	4**OUC))33C22??L'1'D'DguL),' (I$  (2';';'OI$rZ   NN
r   r   r   rx   r   r   r   r   r   r   rj   rZ   rX   r#  r#    s@    "tCH~4 "T%PSUXPX/EZ]aEa "mv "rZ   r#  c                       e Zd ZdefdZy)HerbertConverterr\   c           	         d}d}| j                   j                  }t        | j                   j                  j	                               }||d   d   v r|dd  }t        t        ||d | j                   j                  |            }t        j                  dd      |_
        t        j                         |_        t        j                  |      |_        t#        j$                  | j                   j&                  | j                   j(                  f| j                   j*                  | j                   j,                  f	      |_        |S )
Nz	#version:r  r   r   )r  r   r  F)r   r   r  )r   r   )rc   r  r   r  r  r   r   r   r
   r   r   r   r   r   r	   r   r   r   BertProcessingr   r   r   r   r   )r   tokenizer_info_strtoken_suffixrl   r   r   s         rX   r   zHerbertConverter.converted  s   (''//d--77<<>?1-ABZF11;;#/
	  +99EY^_	"0"A"A"C	$//|D	#-#<#<((22D4K4K4X4XY((22D4K4K4X4XY$
	 
 rZ   Nr   rj   rZ   rX   r3  r3        9 rZ   r3  c                   H    e Zd Zddeeef   dz  deeeef      dz  defdZ	y)Qwen2ConverterNrl   r   r\   c                 0   |s| j                   j                  }|s-t        | j                   j                  j	                               }t        t        ||d d dddd            }t        j                         |_	        t        j                  t        j                  t        d      dd      t        j                  t        | j                   dd      d      g      |_        t#        j                         |_        t'        j                  d	      |_        |S )
Nr%  F)rl   r   r  r   r'  r  r  byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr[   r[   	use_regexr*  )rc   r  r   r  r  r   r   r
   NFCr   r   SequenceSplitr   r,  rb   r   r	   r   r   r   )r   rl   r   r   s       rX   r   zQwen2Converter.converted  s   ++33E$11;;@@BCF*,#%#	
	  +0	"0"9"9$$ N (  ((%,T-D-DFXZ_%`##
	  %..0	#-#7#7U#K	 rZ   r0  r1  rj   rZ   rX   r:  r:    s@    (tCH~4 (T%PSUXPX/EZ]aEa (mv (rZ   r:  c                       e Zd ZdefdZy)RobertaConverterr\   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  |j                  |j                   f|j"                  |j$                  f|j                  d      |_        |S )Nr%  Fr&  r(  Tr   r   r[   r+  )rc   r  r   r  r  r   r   r   r,  r[   r   r	   r   r   RobertaProcessingr   r   r   r   r   r   otrl   r   r   s        rX   r   zRobertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#?#?r/r/00	$
	  rZ   Nr   rj   rZ   rX   rG  rG        9 rZ   rG  c                       e Zd ZdefdZy)RoFormerConverterr\   c           	      V   ddl m} | j                  j                  }t	        t        |t        | j                  j                                    }d}d}t        | j                  d      r@| j                  j                  j                  }| j                  j                  j                  }t        j                  dd||      |_        t        j                   j#                   ||            |_        t        | j                  j&                        }t        | j                  j(                        }| j                  j*                  }| j                  j,                  }	t/        j0                  | d| d	| d| d
| d||f||	fg      |_        t5        j
                  d      |_        |S )Nr   )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsrQ  rc   rl   r   r   r   r   r   r   r   r   r
   r   r   r   PreTokenizercustomr   r   r   r   r   r   r   r   r	   r   )
r   rQ  rl   r   r   r   r   r   r   r   s
             rX   r   zRoFormerConverter.converted4  sy   I''--iT=T=T=^=^9_`a	4**,=> 33CCQQM 33CCQQM*99!&'#	 
	 #1"="="D"DEVW\E]"^	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	rZ   Nr   rj   rZ   rX   rO  rO  3  r   rZ   rO  c                       e Zd ZdefdZy)DebertaConverterr\   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  ddd| j                   j                  d      fd| j                   j                  d      fg	      |_        |S )
Nr%  Fr&  r(  [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )rc   r  r   r  r  r   r   r   r,  r[   r   r	   r   r   r   r  r   rK  s        rX   r   zDebertaConverter.converted[  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@)4$11GGPQ$11GGPQ$
	  rZ   Nr   rj   rZ   rX   rV  rV  Z  r8  rZ   rV  c                   r     e Zd ZdZeZi Zedd       Z fdZ	d Z
d Zd Zd Zd Zd	 Zd
 ZdefdZ xZS )SpmConverterFc                     |||d<   |S )z
        Hook used when converting directly from a SentencePiece model without a slow tokenizer instance.
        By default, return kwargs unchanged.
        rl   rj   )r   rl   r   s      rX   convert_from_spmzSpmConverter.convert_from_spm~  s     #F7OrZ   c                    t        | d       t        |   |  t               }|j	                         }t        | j                  j                  d      5 }|j                  |j                                d d d        || _
        | j                  j                  j                  r#| j                  st        j                  d       y y y # 1 sw Y   TxY w)NrQ   r   a  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superr   rY   r   r   rc   
vocab_filer   r   r   r   r<  handle_byte_fallbackwarningswarn)r   argsr   r   r   	__class__s        rX   r   zSpmConverter.__init__  s    $
+$ $%	  "$))44d; 	(qaffh'	(
::""009R9RMMe :S0		( 	(s    CCc                 l    |j                   D cg c]  }|j                  |j                  f c}S c c}w r   r   r   r   r   r   r   s      rX   rl   zSpmConverter.vocab  s'    8=Euekk*EEEs   1c                 .    |j                   j                  S r   )r   r   r   r   s     rX   r   zSpmConverter.unk_id  s    !!(((rZ   c                 ~   |j                   j                  }| j                  |      }|dk(  r1t        t	        || j                  |      | j                              }n|dk(  r| j                  | j                  j                        j                  |      \  }}t        |      D 	ci c]  \  }\  }}	|| }
}}}	t        t        |
||j                   j                  d| j                  d             }nt        d      t        |j                        D cg c]I  \  }}|j                   dv r6||j"                  |j                   dk(  xs |j"                  | j$                  v fK }}}|j'                  t)        |d	 
      D cg c]  \  }}}t+        |d|       c}}}       |S c c}	}}w c c}}w c c}}}w )Nr   r   r<  rq   Tr   r  r<  r  z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmr   r   c                     | d   S r   rj   r   s    rX   rm   z(SpmConverter.tokenizer.<locals>.<lambda>      QRSTQU rZ   rn   Fr   )r   r   rl   r   r   r   rc  SpmExtractorrc   rb  r   r   r   	unk_piece	Exceptionr   r   r   r   
add_tokensr|   r   )r   r   r   r~   r   _r   r   r   r   	bpe_vocabr   r   r   r   r   s                   rX   r   zSpmConverter.tokenizer  s   ''22
zz%(?! ;;u-"&";";I 1_))$*A*A*L*LMUUVbcIAv9B<9PQQ%5QuqQIQ!#00::!"&";"; 	I o  #5<<0
Avv !&&A+GD4G4G)GH
 

 	 +11A~*V &Bw 5UGD	
 C R*
s   )F+AF2F8c                     |j                   j                  }t        j                  dd      t        j                  t        d      d      g}|st        j                  |      S t        j                  t        j                  |      g|z         S )NFT)leftr   {2,}   ▁)normalizer_specprecompiled_charsmapr
   StripReplacer   rD  Precompiledr   r   r}  _normalizerss       rX   r   zSpmConverter.normalizer  s{    $44II55g6
 $''55'')@)@AU)V(WZf(fggrZ   c                 \    t        || j                        }t        j                  ||      S Nreplacementrd   )re   rc   r   	Metaspacer   r  r[   rd   s       rX   r   zSpmConverter.pre_tokenizer  s)    ,-=t?V?VW''KP^__rZ   c                      y r   rj   r   s    rX   r   zSpmConverter.post_processor  s    rZ   c                 \    t        || j                        }t        j                  ||      S r  )re   rc   r	   r  r  s       rX   r   zSpmConverter.decoder  s(    ,-=t?V?VW!!k.YYrZ   r\   c                 z   | j                  | j                        }| j                  | j                        }|||_        d}d}t        | j                  d      r| j                  j
                  }| j                  ||      }|||_        | j                  ||      |_        | j                         }|r||_        |S )Nr{  Tr[   )	r   r   r   r   rc   r[   r   r   r   )r   r   r   r  r[   r   r   s          rX   r   zSpmConverter.converted  s    NN4::.	 __TZZ0
!#-I 4**,>?#66GG**;8HI$&3I# LL6FG	,,.'5I$rZ   r   )r   r   r   rc  r   rr  r   classmethodr_  r   rl   r   r   r   r   r   r   r   r   __classcell__)rg  s   @rX   r]  r]  y  s^     )LN *F)0d	h`Z9 rZ   r]  c                       e Zd Zd Zd Zd Zy)AlbertConverterc                     |j                   D cg c]J  }t        |j                        r|j                  |j                  fn|j                  |j                  dz
  fL c}S c c}w Nd   r   r   r   r   rj  s      rX   rl   zAlbertConverter.vocab  ^     
 +=U[[*IU[[%++&PUP[P[]b]h]hkn]nOoo
 	
 
   AA!c                    t        j                  dd      t        j                  dd      g}| j                  j                  sF|j	                  t        j
                                |j	                  t        j                                | j                  j                  r#|j	                  t        j                                |j                  j                  }|r$|j	                  t        j                  |             |j	                  t        j                  t        d      d             t        j                  |      S Nz``"z''rz  r  r
   r  rc   keep_accentsr{   NFKDStripAccentsr   	Lowercaser|  r}  r  r   rD  r   r   list_normalizersr}  s       rX   r   zAlbertConverter.normalizer      c*c*
 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR 3 3E'NC HI##$455rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S NrX  rY  rZ  r[  r   r   r   rc   r  r   s    rX   r   zAlbertConverter.post_processor  R    ,,)4$11GGPQ$11GGPQ
 	
rZ   Nr   r   r   rl   r   r   rj   rZ   rX   r  r        
6&
rZ   r  c                       e Zd Zd Zd Zy)BarthezConverterc                 
    d}|S Nr   rj   r   r   r   s      rX   r   zBarthezConverter.unk_id*      rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r  r   s    rX   r   zBarthezConverter.post_processor.  R    ,, +//EEeLM00FFvNO
 	
rZ   N)r   r   r   r   r   rj   rZ   rX   r  r  )  s    
rZ   r  c                   0    e Zd Zd Zd Zd Zedd       Zy)CamembertConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|dgz  }|S c c}w )N)z
<s>NOTUSED        <pad>r  z</s>NOTUSEDr  <unk>r  )<unk>NOTUSEDir   <mask>r  ri  r   r   rl   r   s       rX   rl   zCamembertConverter.vocab:  sP    
 	%,,qr:JK5;;,KK/"" L   Ac                      yr  rj   rl  s     rX   r   zCamembertConverter.unk_idG  s    rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S r  r  r   s    rX   r   z!CamembertConverter.post_processorK  r  rZ   Nc                 ,   t        |j                  dd            }t        |j                  dd            }t        |j                  dd            }d|dfd	|dfd
g}||j                  t        |      dd         |j	                  |df       ||d<   |S )N	pad_tokenr  r   r  
mask_tokenr  r  r  r  )r        Yr   rl   r   r   r}   r   r{   )r   rl   r   r  r   r  
vocab_lists          rX   r_  z#CamembertConverter.convert_from_spmU  s    

;89	

;89	L(;<
   $

 d5k!"o.:s+,$wrZ   r   r   r   r   rl   r   r   r  r_  rj   rZ   rX   r  r  9  s%    
  rZ   r  c                       e Zd Zd Zd Zd Zy)DebertaV2Converterc                    g }| j                   j                  r%|j                  t        j                  d             t        || j                         }|j                  t        j                  ||             t        j                  |      S )Nr=  )r?  r  )rc   split_by_punctr{   r   Punctuationre   r  rD  )r   r  r[   list_pretokenizersrd   s        rX   r   z DebertaV2Converter.pre_tokenizerj  sq    ""11%%n&@&@*&UV,-=t?V?VW!!.":":{cq"rs&&'9::rZ   c                    g }| j                   j                  r#|j                  t        j                                |j                  t        j
                                |j                  j                  }|r$|j                  t        j                  |             |j                  t        j                  t        d      d             t        j                  |      S )Nrz  r  )rc   r   r{   r
   r  r~  r|  r}  r  r  r   rD  r  s       rX   r   zDebertaV2Converter.normalizerr  s    ""00##K$9$9$;< 1 1 34$44II##K$;$;<P$QR 3 3E'NC HI##$455rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S r  r  r   s    rX   r   z!DebertaV2Converter.post_processor  r  rZ   N)r   r   r   r   r   r   rj   rZ   rX   r  r  i  s    ;6
rZ   r  c                   0    e Zd Zd Zd Zd Zedd       Zy)MBartConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|dgz  }|S c c}w )Nr  r  r  r  r  r  r   )r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r    r  r!   r  r"   r  r#   r  r$   r  r%   r  r&   r  r'   r  r(   r  r)   r  r*   r  r+   r  r,   r  r-   r  r.   r  r/   r  r  ri  r  s       rX   rl   zMBartConverter.vocab  sa    
 	%,,qr:JK5;;,KK 
 	
6 	/""; L   A
c                      yr  rj   rl  s     rX   r   zMBartConverter.unk_id      rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz$A </s> en_XXz$A $B </s> en_XXr   r  r   r  r   s    rX   r   zMBartConverter.post_processor  R    ,,"#$11GGPQ00FFvNO
 	
rZ   Nc                    t        |j                  dd            }t        |j                  dd            }t        |j                  dd            }t        |j                  dd            }t        |j                  d	d
            }|df|df|df|dfg}||j                  t        |      dd         |j                  d t        D               |j                  |df       ||d<   |S )Nr-  r  r  r  	eos_tokenr  r   r  r  r  r  r   c              3   $   K   | ]  }|d f 
 ywr  Nrj   .0	lang_codes     rX   	<genexpr>z2MBartConverter.convert_from_spm.<locals>.<genexpr>  s     Ly9c*L   rl   )r   r   r}   r   MBART_LANGUAGESr{   	r   rl   r   r-  r  r  r   r  r  s	            rX   r_  zMBartConverter.convert_from_spm  s    

;67	

;89	

;78	

;89	L(;<
 	

 d5k!"o.LOLL:s+,$wrZ   r   r  rj   rZ   rX   r  r    s&    $L
  rZ   r  c                   0    e Zd Zd Zd Zd Zedd       Zy)MBart50Converterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|dgz  }|S c c}w )Nr  r   )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )r0   r  )r1   r  )r2   r  )r3   r  )r4   r  )r5   r  )r6   r  )r7   r  )r8   r  )r9   r  )r:   r  )r;   r  )r<   r  )r=   r  )r>   r  )r?   r  )r@   r  )rA   r  )rB   r  )rC   r  )rD   r  )rE   r  )rF   r  )rG   r  )rH   r  )rI   r  )rJ   r  r  ri  r  s       rX   rl   zMBart50Converter.vocab  sa    
 	%,,qr:JK5;;,KK  R  	R/"" Lr  c                      yr  rj   rl  s     rX   r   zMBart50Converter.unk_id  r  rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nzen_XX $A </s>zen_XX $A $B </s>r   r  r   r  r   s    rX   r   zMBart50Converter.post_processor  r  rZ   Nc                    t        |j                  dd            }t        |j                  dd            }t        |j                  dd            }t        |j                  dd            }t        |j                  d	d
            }|df|df|df|dfg}||j                  t        |      dd         |j                  d t        D               |j                  |df       ||d<   |S )Nr   r  r  r  r  r  r   r  r  r  r  r   c              3   $   K   | ]  }|d f 
 ywr  rj   r  s     rX   r  z4MBart50Converter.convert_from_spm.<locals>.<genexpr>  s     Ny9c*Nr  rl   )r   r   r}   r   MBART50_LANGUAGESr{   )	r   rl   r   r   r  r  r   r  r  s	            rX   r_  z!MBart50Converter.convert_from_spm  s    

;67	

;89	

;78	

;89	L(;<
 	

 d5k!"o.N<MNN:s+,$wrZ   r   r  rj   rZ   rX   r  r    s%    

  rZ   r  c                   0    e Zd Zd Zd Zd Zedd       Zy)NllbConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )Nr  r   ri  r  s       rX   rl   zNllbConverter.vocab  C    
 	%,,qr:JK5;;,KK L   =c                      yr  rj   rl  s     rX   r   zNllbConverter.unk_id  r  rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr  r   r  r   s    rX   r   zNllbConverter.post_processor  sR    ,,%&T44JJ:VW00FFvNO
 	
rZ   Nc                    t        |j                  dd            }t        |j                  dd            }t        |j                  dd            }t        |j                  dd            }|d	|d
|d|di}|Ot        |t              r|j	                         n|D 	cg c]  \  }}	|	 c}	}}
|
D ]  }||v rt        |      ||<    ||d<   |S c c}	}w )Nr-  r  r  r  r  r  r   r  r   r   rq   r   rl   )r   r   
isinstancerx   r  rr   )r   rl   r   r-  r  r  r   reordered_vocabtokrv  tokensr   s               rX   r_  zNllbConverter.convert_from_spm  s    

;67	

;89	

;78	

;89	 qqqq	
 %/t%<UZZ\UZB[63PQ3B[F >O+),_)=&> *w C\s   Cr   r  rj   rZ   rX   r
  r
    s%    
  rZ   r
  c                       e Zd Zd Zd Zd Zy)SeamlessM4TConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )N)r  r  r  r  r   ri  r  s       rX   rl   zSeamlessM4TConverter.vocab4  r  r  c                 .    | j                   j                  S r   )rc   unk_token_idrl  s     rX   r   zSeamlessM4TConverter.unk_id>  s    &&333rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r  r   r  r   s    rX   r   z#SeamlessM4TConverter.post_processorA  sR    ,,$%D33II)TU00FFvNO
 	
rZ   Nr   r   r   rl   r   r   rj   rZ   rX   r  r  3  s    4
rZ   r  c                   0    e Zd Zd Zd Zd Zedd       Zy)XLMRobertaConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|dgz  }|S c c}w )Nr  r   r  ri  r  s       rX   rl   zXLMRobertaConverter.vocabM  sP    
 	%,,qr:JK5;;,KK/"" Lr  c                 
    d}|S r  rj   r  s      rX   r   zXLMRobertaConverter.unk_idX  r  rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S r  r  r   s    rX   r   z"XLMRobertaConverter.post_processor\  r  rZ   Nc                    t        |j                  dd            }t        |j                  dd            }t        |j                  dd            }t        |j                  dd            }t        |j                  d	d
            }|df|df|df|dfg}||j                  t        |      dd         |j	                  |df       ||d<   |S )Nr-  r  r  r  r  r  r   r  r  r  r  r   rl   r  r   s	            rX   r_  z$XLMRobertaConverter.convert_from_spmf  s    

;67	

;89	

;78	

;89	L(;<
 	

 d5k!"o.:s+,$wrZ   r   r  rj   rZ   rX   r  r  L  s%    	
  rZ   r  c                       e Zd Zd Zd Zd Zy)XLNetConverterc                     |j                   D cg c]J  }t        |j                        r|j                  |j                  fn|j                  |j                  dz
  fL c}S c c}w r  r  rj  s      rX   rl   zXLNetConverter.vocab|  r  r  c                    t        j                  dd      t        j                  dd      g}| j                  j                  sF|j	                  t        j
                                |j	                  t        j                                | j                  j                  r#|j	                  t        j                                |j                  j                  }|r$|j	                  t        j                  |             |j	                  t        j                  t        d      d             t        j                  |      S r  r  r  s       rX   r   zXLNetConverter.normalizer  r  rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r  r   s    rX   r   zXLNetConverter.post_processor  r  rZ   Nr  rj   rZ   rX   r%  r%  {  r  rZ   r%  c                       e Zd Zy)ReformerConverterNr   r   r   rj   rZ   rX   r*  r*        rZ   r*  c                       e Zd Zd Zd Zy)RemBertConverterc                 b   t        j                  dd      t        j                  dd      t        j                  t        d      d      g}| j                  j                  sF|j                  t        j                                |j                  t        j                                | j                  j                  r#|j                  t        j                                |j                  j                  }|r$|j                  t        j                  |             t        j                  |      S r  )r
   r  r   rc   r  r{   r  r  r   r  r|  r}  r  rD  r  s       rX   r   zRemBertConverter.normalizer  s    c*c*g4

 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR##$455rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S r  r  r   s    rX   r   zRemBertConverter.post_processor  r  rZ   N)r   r   r   r   r   rj   rZ   rX   r.  r.    s    6&
rZ   r.  c                       e Zd Zy)BertGenerationConverterNr+  rj   rZ   rX   r2  r2    r,  rZ   r2  c                   $    e Zd Zd Zd Zd Zd Zy)PegasusConverterc                 v   | j                   j                  df| j                   j                  dfg}| j                   j                  || j                   j                  dfgz  }| j                   j                  I| j                   j
                  | j                   j                  k  r|| j                   j                  dfgz  }|t        d| j                   j                        D cg c]
  }d| ddf c}z  }||j                  dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w c c}w )Nr  rq   z<unk_>r  )rc   r  r  mask_token_sentr  mask_token_idoffsetrz   r   r   r   )r   r   rl   r   r   s        rX   rl   zPegasusConverter.vocab  s%   $$..4$$..4

 ""22>t..>>DEEE ##..:''558O8O8V8VVt..993?@@E%4;R;R;Y;Y2Z[QU1#Q<([[%,,qr:JK5;;,KK \Ks   %D1D6c                 \    |j                   j                  | j                  j                  z   S r   )r   r   rc   r9  rl  s     rX   r   zPegasusConverter.unk_id  s%    !!((4+B+B+I+IIIrZ   c                     t        || j                        }t        j                  t        j                         t        j
                  ||      g      S r  )re   rc   r   rD  WhitespaceSplitr  r  s       rX   r   zPegasusConverter.pre_tokenizer  sJ    ,-=t?V?VW&&..0(([Q_`
 	
rZ   c                     | j                   j                  }|| j                   j                  fg}t        j                  d|gdd|g|      S )N$A$Br   )rc   r  eos_token_idr   r   )r   eosr   s      rX   r   zPegasusConverter.post_processor  sR    %%//$))667
 ,,T3KtTSVFWhvwwrZ   N)r   r   r   rl   r   r   r   rj   rZ   rX   r4  r4    s    &J
xrZ   r4  c                   *    e Zd Zd Zd Zedd       Zy)T5Converterc                     | j                   j                  }|j                  D cg c]  }|j                  |j                  f }}|t        |dz
  dd      D cg c]
  }d| ddf c}z  }|S c c}w c c}w )Nr   r   
<extra_id_r6  r  )rc   
_extra_idsr   r   r   rz   )r   r   num_extra_idsr   rl   r   s         rX   rl   zT5Converter.vocab  sw    //::9>F%++u{{+FFE-!:KRQS4TUqZs!$c*UU GUs   A/A4c                 r    t        j                  ddgg dd| j                  j                  d      fg      S Nr>  r  )r>  r  r?  r  r   r  r   s    rX   r   zT5Converter.post_processor  =    ,,&>-00FFvNO
 	
rZ   Nc                     |j                  dd      }t        |dz
  dd      D cg c]  }d| d
 }}|t        |      ng }|j                  d |D               |j	                  d|       ||d	<   |S c c}w )
N	extra_idsr  r   r   rE  r6  c              3   $   K   | ]  }|d f 
 ywr  rj   )r  r   s     rX   r  z/T5Converter.convert_from_spm.<locals>.<genexpr>  s     A55#,Ar  r   rl   )r   rz   r   r}   
setdefault)r   rl   r   rL  r   extra_tokensr  s          rX   r_  zT5Converter.convert_from_spm   s    JJ{C0	38QB3OPa*QCq)PP$)$5T%[2
ALAA5|D$w Qs   A4r   )r   r   r   rl   r   r  r_  rj   rZ   rX   rC  rC    s     
  rZ   rC  c                       e Zd Zd Zy)UdopConverterc                 r    t        j                  ddgg dd| j                  j                  d      fg      S rI  r  r   s    rX   r   zUdopConverter.post_processor  rJ  rZ   Nr   r   r   r   rj   rZ   rX   rQ  rQ    s    
rZ   rQ  c                       e Zd ZdefdZy)WhisperConverterr\   c           
         | j                   j                  }t        | j                   j                  j	                               }t        t        ||d ddd            }t        j                  | j                   j                        |_
        t        j                         |_        | j                   j                  }| j                   j                  |      }| j                   j                  }| j                   j                   }dj#                  |D cg c]  }| d	 c}      }	t%        j&                  |	 d| d|	 d| d	||fgt)        ||      
      |_        |S c c}w )Nr%  Fr&  r(  r  r   z $A:0 z $A:0 $B:1 r   r   )rc   r  r   r  r  r   r   r   r,  r[   r   r	   r   prefix_tokensconvert_ids_to_tokensr  r@  joinr   r   zipr   )
r   rl   r   r   prefix_token_idsprefixesrA  r@  r   prefix_templates
             rX   r   zWhisperConverter.converted  sR   ''//d--77<<>?*,#%	
	 #1":":DLcLcLtLt"u	$..0	22@@**@@AQR%%//..;;((h#GUugRL#GH#-#@#@%&fSE4#$KuB7l#X/0$
	   $Hs   ENr   rj   rZ   rX   rU  rU    s     9  rZ   rU  c                       e Zd Zd Zy)BigBirdConverterc           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S r  r  r   s    rX   r   zBigBirdConverter.post_processor<  r  rZ   NrS  rj   rZ   rX   r_  r_  ;  s    
rZ   r_  c                       e Zd ZdefdZy)CLIPConverterr\   c                 p   | j                   j                  }t        | j                   j                  j	                               }| j                   j
                  }t        t        ||d dddt        |                  }t        j                  t        j                         t        j                  t        d      d      t        j                         g      |_        t!        j                  t!        j"                  t        d      dd	
      t!        j$                  d      g      |_        t)        j$                         |_        t-        j.                  | j                   j0                  | j                   j2                  f| j                   j4                  | j                   j6                  fdd      |_        |S )Nr%  r  Frl   r   r  r'  r  r  r   z\s+r  z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr>  r(  rI  )rc   r  r   r  r  r   r   r   r   r
   rD  rC  r  r   r  r   r   rE  r,  r   r	   r   r   rJ  r  r@  r-  r.  r   r!  s        rX   r   zCLIPConverter.convertedH  sk   ''//d--77<<>?++55	*,#)i.

	  +33__ 3 3E&M3 GI^I^I`a 
	 #1"9"9$$Z[&
 ((%@	#
	 %..0	 $.#?#?((22D4K4K4X4XY((22D4K4K4X4XY"	$
	  rZ   Nr   rj   rZ   rX   rb  rb  G  s    '9 'rZ   rb  c                       e Zd ZdefdZy)LayoutLMv2Converterr\   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )Nr   FTr   r   r   r   r   r   r   r   r   r   r   s
             rX   r   zLayoutLMv2Converter.converteds  s   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	rZ   Nr   rj   rZ   rX   rg  rg  r  r   rZ   rg  c                       e Zd ZdefdZy)BlenderbotConverterr\   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  d|j                   d|j                  |j                   fg      |_        |S )Nr%  Fr&  r(  z$A:0 r   )r   r   )rc   r  r   r  r  r   r   r   r,  r[   r   r	   r   r   r   r  r@  r   rK  s        rX   r   zBlenderbotConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@2<<.+r/$
	  rZ   Nr   rj   rZ   rX   rj  rj    rM  rZ   rj  c                       e Zd Zd Zd Zd Zy)XGLMConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|S c c}w )Nr  r   ))z<madeupword0>r  )z<madeupword1>r  )z<madeupword2>r  )z<madeupword3>r  )z<madeupword4>r  )z<madeupword5>r  )z<madeupword6>r  ri  r  s       rX   rl   zXGLMConverter.vocab  sT    
 	%,,qr:JK5;;,KK  z  	z Ls   Ac                 
    d}|S r  rj   r  s      rX   r   zXGLMConverter.unk_id  r  rZ   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz</s> $Az</s> $A </s> </s> $Br  r  r   r  r   s    rX   r   zXGLMConverter.post_processor  sR    ,,'//EEeLM00FFvNO
 	
rZ   Nr  rj   rZ   rX   rm  rm    s    	
rZ   rm  c                   <    e Zd ZdZeZddhZ	 d Zd Zd Z	d Z
d Zy	)
GemmaConverterTz<start_of_turn>z<end_of_turn>c                 .    t        j                  dd      S Nr  r{  )r
   r  rl  s     rX   r   zGemmaConverter.normalizer  s    ""3..rZ   c                 t   | j                   j                  df| j                   j                  df| j                   j                  dfg}||j                  dd  D cg c]  }|j
                  |j                  f c}z  }t        d |D              s#t        d t        |      D        d       }|d||<   |S c c}w )Nr  r   c              3   ,   K   | ]  }|d    dk(    yw)r   r   Nrj   )r  rk   s     rX   r  z'GemmaConverter.vocab.<locals>.<genexpr>  s     /A1Q44</s   c              3   8   K   | ]  \  }}|d    dk(  s|  yw)r   r   Nrj   )r  r   rk   s      rX   r  z'GemmaConverter.vocab.<locals>.<genexpr>  s     "VAQqTXEU1"Vs   )r   r  )
rc   r  r  r-  r   r   r   anynextr   )r   r   rl   r   override_indexs        rX   rl   zGemmaConverter.vocab  s    $$..4$$..4$$..4

 	%,,qr:JK5;;,KK ///!"V51A"VX\]N)(3n% Ls   B5c                 .    t        j                  dd      S )Nr  merged_with_previous)r   rE  r   r  r[   s      rX   r   zGemmaConverter.pre_tokenizer  s    ##C)?@@rZ   c                 
    d}|S r  rj   r  s      rX   r   zGemmaConverter.unk_id  r  rZ   c                     t        j                  t        j                  dd      t        j                         t        j                         g      S )Nr{  r  )r	   rD  r  ByteFallbackFuser}  s      rX   r   zGemmaConverter.decoder  s?        ,%%'
 	
rZ   N)r   r   r   rc  r   rr  r   r   rl   r   r   r   rj   rZ   rX   rr  rr    s6    .L'9N/ A
rZ   rr  c                   4    e Zd ZdZd Zd Zd Zd Zd Zd Z	y)	LlamaConverterTc                 (   | j                   j                  d      df| j                   j                  d      df| j                   j                  d      dfg}||j                  dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )Nr   r  r   rq   r   )rc   rX  r   r   r   r  s       rX   rl   zLlamaConverter.vocab  s    $$::1=sC$$::1=sC$$::1=sC

 	%,,qr:JK5;;,KK Ls   )Bc                 
    d}|S r   rj   r  s      rX   r   zLlamaConverter.unk_id  r  rZ   c                     t        j                  dd      t        j                         t        j                         g}|r|t        j                  dd      gz  }t        j
                  |      S Nr{  r  r   )contentry  r	   r  r  r  r~  rD  r   r  r[   sequences       rX   r   zLlamaConverter.decoder  \    UC(!!#MMO

 !<==H  **rZ   c                     t        | j                  dd      rcg }t        | j                  dd      r|t        j                  d      gz  }|t        j                  dd      gz  }t        j
                  |      S y )Nr_   Tr[   r{  )prependr  )patternr  )rb   rc   r
   Prependr  rD  )r   r   r  s      rX   r   zLlamaConverter.normalizer  sr    4**Hd;Ht..0BDI[00?@@,,S%HIIH''11rZ   c                     t        | j                  dd      s.t        || j                        }t        j                  ||d      S y )Nr_   TFr  rd   split)rb   rc   re   r   r  r  s       rX   r   zLlamaConverter.pre_tokenizer(  sA    t..$?01A4CZCZ[N!++TbjopprZ   c                      y r   rj   r   s    rX   r   zLlamaConverter.post_processor.  s    rZ   N)
r   r   r   rc  rl   r   r   r   r   r   rj   rZ   rX   r  r    s&    +rZ   r  c                       e Zd ZdefdZy)MarkupLMConverterr\   c                    | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd| j                   j                              }t        j                  |j                        |_        t        j                         |_        t        | j                   j                        }t        | j                   j                         }| j                   j"                  }| j                   j$                  }t'        j(                  | d| | d| d| ||f||fg      |_        |S )Nr%  Frd  r(  z $A z $B r   )rc   r  r   r  r  r   r   r   r   r,  r[   r   r	   r   r   r   r   r   r   r   r   r   )	r   rL  rl   r   r   r   r   r   r   s	            rX   r   zMarkupLMConverter.converted4  s,   $$

bll'')**,#%11;;

	 #1":":BL_L_"`	$..0	$))334$))334..;;..;;#-#@#@U$se$5SEcU+l#l#$
	  rZ   Nr   rj   rZ   rX   r  r  3  s    "9 "rZ   r  c                   (    e Zd ZdZd Zd Zd Zd Zy)MoshiConverterTc                    t        | d       t        j                  | |       t               }|j	                         }t        |d      5 }|j                  |j                                d d d        || _        y # 1 sw Y   || _        y xY wNrQ   r   	r   r   r   rY   r   r   r   r   r   r   rb  r   r   r   r   s         rX   r   zMoshiConverter.__init__\  sr    $
+4, $%	  "*d# 	(qaffh'	(
	(
   	 A99B	c                     |j                   j                  }t        j                  dd      g}|st        j                  |      S t        j                  t        j
                  |      g|z         S rt  )r|  r}  r
   r  rD  r  r  s       rX   r   zMoshiConverter.normalizeri  sg    $44IIU+
 $''55'')@)@AU)V(WZf(fggrZ   c                     t        j                  dd      t        j                         t        j                         g}|r|t        j                  dd      gz  }t        j
                  |      S r  r  r  s       rX   r   zMoshiConverter.decoders  r  rZ   c                 6    d}t        j                  ||d      S )Nr`   Fr  )r   r  r  s       rX   r   zMoshiConverter.pre_tokenizer}  s     ''KP^fkllrZ   N)r   r   r   rc  r   r   r   r   rj   rZ   rX   r  r  Y  s    h+mrZ   r  c                   B    e Zd ZdZddZd Zd Zd Zd Zd Z	d	 Z
d
 Zy)HeliumConverterTNc                    t        | d       t        j                  | |       t               }|j	                         }t        |d      5 }|j                  |j                                d d d        || _        y # 1 sw Y   || _        y xY wr  r  r  s         rX   r   zHeliumConverter.__init__  sp    $
+4,#%	  "*d# 	(qaffh'	(
	(
r  c                 V   | j                  |      }t        t        || j                  |      | j                              }t        |j                        D cg c]I  \  }}|j                  dv r6||j                  |j                  dk(  xs |j                  | j                  v fK }}}|j                  t        |d       D cg c]  \  }}}t        |d|d       c}}}       |j                  t        d	dd
      g       |j                  dd       |S c c}}w c c}}}w )Nrn  r   r   c                     | d   S r   rj   r   s    rX   rm   z+HeliumConverter.tokenizer.<locals>.<lambda>  rq  rZ   rn   FT)r   r   single_word
r   r  )r  pad_id)rl   r   r   r   rc  r   r   r   r   r   ru  r|   r   enable_padding)	r   r   r~   r   r   r   r   r   r   s	            rX   r   zHeliumConverter.tokenizer  s    zz%({{5)"77
	 #5<<0
Avv !&&A+GD4G4G)GH
 

 	 +11A~*V &Bw 5UGQUV	
 	j%OPQ  71 =
s   ADD$c                     g }|j                   D ]@  }|j                  dk(  r|d|j                  fgz  }%||j                  |j                  fgz  }B |S )Nz<0x0A>r  ri  r  s       rX   rl   zHeliumConverter.vocab  s]    \\ 	6E{{h&4-..5;;455		6
 rZ   c                 
    d}|S r   rj   r  s      rX   r   zHeliumConverter.unk_id  r  rZ   c                     t        j                  dd      t        j                         t        j                         g}|t        j                  dd      gz  }t        j
                  |      S r  r  r  s       rX   r   zHeliumConverter.decoder  sY    UC(!!#MMO

 	X^^Ca899  **rZ   c                 ~    t        j                  t        j                  d      t        j                  dd      g      S rt  )r
   rD  r  r  rl  s     rX   r   zHeliumConverter.normalizer  s2    ##[%8%8%={?R?RSWY^?_$`aarZ   c                 V    t        j                  t        j                  dd      g      S )Nr  
contiguous)r   rD  rE  r}  s      rX   r   zHeliumConverter.pre_tokenizer  s#    &&(<(<T<(P'QRRrZ   c                 <    t        j                  ddgg ddg      S )Nr  r>  )r  r>  r  r?  )r  r   r   )r   r   r   s    rX   r   zHeliumConverter.post_processor  s/    ,, 
 	
rZ   r   )r   r   r   rc  r   r   rl   r   r   r   r   r   rj   rZ   rX   r  r    s2    
8+bS
rZ   r  c                       e Zd ZdZddZd Zy)ParakeetConverterTNc                 &   || _         t        | d       t        j                  | |       t	               }|j                         }t        |d      5 }|j                  |j                                d d d        || _	        y # 1 sw Y   || _	        y xY wr  )
rb  r   r   r   rY   r   r   r   r   r   )r   rb  rf  r   r   r   s         rX   r   zParakeetConverter.__init__  sw    $$
+4,#%	  "*d# 	(qaffh'	(
	(
s    B  Bc                    | j                  |      }| j                  | j                        j                  |      \  }}t	        |      D ci c]  \  }\  }}|| }}}}t        t        |||j                  j                  d| j                  d             }	t	        |j                        D 
cg c]I  \  }
}|j                  dv r6|
|j                  |j                  dk(  xs |j                  | j                  v fK }}
}|	j                  t        |d       D 
cg c]  \  }
}}t!        |d|       c}}}
       |	S c c}}}w c c}}
w c c}}}
w )	NTro  r   r   c                     | d   S r   rj   r   s    rX   rm   z-ParakeetConverter.tokenizer.<locals>.<lambda>  rq  rZ   rn   Fr   )rl   rr  rb  r   r   r   r   r   rs  rc  r   r   r   r   ru  r|   r   )r   r   r~   rv  r   r   r   r   rw  r   r   r   r   r   r   s                  rX   r   zParakeetConverter.tokenizer  sM   zz%(%%doo6>>|L	65>|5LMM!1MT5T1WM	M,,66"77	
	 #5<<0
Avv !&&A+GD4G4G)GH
 

 	 +11A~*V &Bw 5UGD	
 3 N
s   E2AE Er   )r   r   r   rc  r   r   rj   rZ   rX   r  r    s    rZ   r  c            	         t        t        t        d      t        d      dz               t        t        t        d      t        d      dz               z   t        t        t        d      t        d      dz               z   } | dd }d	}t        d
      D ]1  }|| vs| j                  |       |j                  d
|z          |dz  }3 |D cg c]  }t	        |       }}t        t        | |            S c c}w )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      )r   rz   ordr{   chrrx   rZ  )bscsnbs       rX   bytes_to_unicoder    s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[ B;IIaLIIdQhFA	
 	Q#a&	B	B 
s   C4c                   >    e Zd ZdZ	 	 	 	 d	dZdefdZd ZdefdZ	y)
TikTokenConverterz'
    A general tiktoken converter.
    Nc                     || _         || _        || _        t        |t              r|j                         | _        y || _        y r   )rb  r  r[   r  rx   r  extra_special_tokens)r   rb  r  r[   r  r   s         rX   r   zTikTokenConverter.__init__(  sD     % 0+56JD+Q %%' 	!Wk 	!rZ   tiktoken_urlc                 0   	 ddl m}  ||      t	               fd}g }i }j                         D ]  \  }}|| ||      <   t        |      dk(  r g }t        dt        |            D ]2  }	|d |	 ||	d  }}
|
v s|v s|
|z   v s|j                  |
||f       4 t        |fdd      }|j                  |        t        |d	 d      }|D cg c]  } ||d          ||d         f }}||fS # t        $ r t        d      w xY wc c}w )
Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c           	          dj                  | j                  d      D cg c]  }t        |          c}      S c c}w Nr%  zlatin-1rY  decoder  r  charbyte_encoders     rX   token_bytes_to_stringzPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringB  s2    77@STLT3TUUT   <r   c                 $    | d      | d      fS ri   rj   )rk   r  s    rX   rm   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>P  s    1Q4)AaD/0R rZ   Fru   c                     | d   S Nrq   rj   rs   s    rX   rm   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>R  
    A rZ   )tiktoken.loadr  rt  
ValueErrorr  ry   rr   rz   r{   r|   r}   )r   r  r  r  r   rl   r   rankr   r   r   r   rt   r  r  s                @@rX   extract_vocab_merges_from_modelz1TikTokenConverter.extract_vocab_merges_from_model7  sY   	7 &l3	')	V $??, 
	!KE426E'./5zQEq#e*- ;#(%=%-i'Gy,@gPWFW\eEeLL'7D!9:; 5&R\abEMM% 
	! $6F\bcUX(Q02GA2OPccf}5  	k 	2 ds   C; D;Dc                     | j                  | j                        \  }}t        t        ||d            }t	        |j
                  d      rd|j
                  _        |S NF)r  ignore_mergesTr  rb  r   r   r   r   r  r   r~   r   r   s       rX   r   zTikTokenConverter.tokenizerV  N    #CCDOOTfc,GH	9??O4,0IOO)rZ   r\   c           
         | j                         }t        j                  t        j                  t	        | j
                        dd      t        j                  | j                  d      g      |_        t        j                         |_
        | j                  5|j                  | j                  D cg c]  }t        |dd       c}       t        j                  d      |_        |S c c}w )Nr=  Fr>  rA  Tr   r*  )r   r   rD  rE  r   r  r,  r[   r   r	   r   r  r  r   r   r   )r   r   r   s      rX   r   zTikTokenConverter.converted]  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$0((PTPiPijuEeTBj $.#7#7U#K	  ks   3C,Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)
r   r   r   r   r   r   r  r   r   r   rj   rZ   rX   r  r  #  s:      K!
C >9 rZ   r  c                   :    e Zd Z	 	 	 	 ddZdefdZd ZdefdZy)	MistralConverterNc                     || _         || _        || _        t        |t              r|j                         | _        y || _        y r   )rb  r  r[   r  rx   r  r   )r   rb  r  r[   r   r   s         rX   r   zMistralConverter.__init__r  sJ     % 0 3T: &**, 	& + 	&rZ   r  c                    dd l }dd l}t        | j                  dd      5 }|j	                  |      }d d d        d   d   | _        |d   D cg c]  }t        |d   |d	   
       c}| _        |d   }t               t        fd       }g }	i }
t        | j                        D ]  \  }}||
|j                  <    |D cg c]  }|j                  |d          }}t        |      }t        |      D ci c]  \  }}||
 c}}t        t        |d            D ]  \  }}||
 ||      <   t        |      dk(  r g }t!        dt        |            D ]2  }|d | ||d  }}||v s||v s||z   |v s|j#                  |||f       4 t%        |fdd      }|	j'                  |        t%        |	d d      }	|	D cg c]  } ||d          ||d         f }	}|
|	fS # 1 sw Y   xY wc c}w c c}w c c}}w c c}w )Nr   rzutf-8)encodingconfigr  r   	token_str
is_control)r   rl   c           	          dj                  | j                  d      D cg c]  }t        |          c}      S c c}w r  r  r  s     rX   r  zOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s2    77@STLT3TUUTr  token_bytesz(Converting tekken.json to tokenizer.json)descr   c                 $    | d      | d      fS ri   rj   )rk   token_to_ranks    rX   rm   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s!    qt1DmTUVWTXFY0Z rZ   Fru   c                     | d   S r  rj   rs   s    rX   rm   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>  r  rZ   )base64jsonr   rb  loadr  r   r   r  r   r   r  	b64decoderw   r   rr   rz   r{   r|   r}   )r   r  r  r  r   untypedkr  r  r   rl   idxr   rank_setr  r   r   r   r   rt   r  r  s                       @@rX   r  z0MistralConverter.extract_vocab_merges_from_model  s?   $//39 	#QiilG	#x(3IPQaIb*
DEJq~q?*
& G$	')		V 
	V #D$B$BC 	'JC#&E%-- 	'AJKAV%%a&67K	Ky>8A)8LMuM$T):d%ef 
	!KD%26E'./5zQEq#e*- ;#(%=%-h&7h+>GgDUZbCbLL'7D!9:; 5&ZdijEMM% 
	! $6F\bcUX(Q02GA2OPccf}C	# 	#*
 LM ds#   G'G4G9:G>H'G1c                     | j                  | j                        \  }}t        t        ||d            }t	        |j
                  d      rd|j
                  _        |S r  r  r  s       rX   r   zMistralConverter.tokenizer  r  rZ   r\   c                    | j                         }t        j                  t        j                  t	        | j
                        dd      t        j                  | j                  d      g      |_        t        j                         |_
        |j                  | j                         t        j                  d      |_        |S )Nr=  Fr>  rA  r*  )r   r   rD  rE  r   r  r,  r[   r   r	   r   ru  r   r   r   )r   r   s     rX   r   zMistralConverter.converted  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	T;;<#-#7#7U#K	 rZ   r  )	r   r   r   r   r   r  r   r   r   rj   rZ   rX   r  r  q  s6      K"&
"%C %N9 rZ   r  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerReformerTokenizerRemBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizerSplinterTokenizerXGLMTokenizer)LlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3Tokenizerc                 *   | j                   j                  }|t        v r!|st        |   } ||       j                         S | j                  j                  d      r?| | _        t        j                  d       t        | j                        j                         S 	 t        j                  d       t        | j                  | j                        j                         S # t        $ r* t        dt        t        j                                      w xY w)a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    ztekken.jsonz#Converting from Mistral tekken.jsonzConverting from Tiktoken)rb  r  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )rg  r   SLOW_TO_FAST_CONVERTERSr   rb  endswithrc   loggerinfor  r  r  rt  r  r   r  )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       rX   convert_slow_tokenizerr?    s      1::CC66}12FG45??AA		)	)	2	2=	A3H09: 5 @ @AKKMM	KK23$0;;%:%O%O ik  	>>BCZC_C_Ca>b=ce 	s   AC 3D)r%  r   )F)Wr   rd  collections.abcr   	functoolsr   	packagingr   
tokenizersr   r   r   r	   r
   r   r   r   r   r   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerr   r9  r  r  rY   boolr   re   r   r   r   r   r   r   r   r  r  r  r#  r3  r:  rG  rO  rV  r]  r  r  r  r  r  r  r
  r  r  r%  r*  r.  r2  r4  rC  rQ  rU  r_  rb  rg  rj  rm  rr  r  r  r  r  r  r  r  r  r7  r?  rj   rZ   rX   <module>rH     s    &   f f f 5 5  ` ` 5 
		H	%8 $ '  >G"$ s jo6L 01 1h"8 "Ic Id I$ $$I $N/	 /d$i $N$Y $N 6#I #Ly >)Y )Xy :$	 $Ny >H9 HV"
l "
J
| 
 - -`
 
BG\ GT-| -`,L ,^
< 
2,, ,^"
\ "
J	 	
| 
@	l 	%x| %xP, 8
L 
!y !H	
| 	
(I (V$) $N) :
L 
61
\ 1
h+\ +\#	 #L&m\ &mRV
l V
r- -`0K K\M M`88%8 (8 ]	8
 (8 .8 ,8 ]8 8 8 (8 ,8 =8 -8 "=8  !-!8" #8$ _%8& '8( ])8* (+8, -8. =/80 +182 -384 +586 $788 }98: *;8< n=8> (?8@ nA8B =C8D $E8F ]G8H ,I8J (K8L nM8N *O8P (Q8R (S8T *U8V 0W8X MY8Z ;[8\ ]]8^ (_8` .a8b nc8d *e8f ]g8h %($#o8 v$) $rZ   