
    鬜ij                     d    d dl mZmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ  G d d      ZdefdZy	)
    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)bytes_to_unicode)PreTrainedTokenizerFastc                   >    e Zd ZdZ	 	 	 	 d	dZdefdZd ZdefdZ	y)
MistralConverterz'
    A general tiktoken converter.
    Nc                 <    || _         || _        || _        || _        y )N)vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   kwargss         ]/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/integrations/mistral.py__init__zMistralConverter.__init__   s"     
 0)B&    r   c                 &   |t               fd}g }i }t        j                               D ]  \  }\  }}|| j                  vr|| ||      <   t	        |      dk(  r1g }t        dt	        |            D ]2  }|d | ||d  }
}	|	v s|
v s|	|
z   v s|j                  |	|
|f       4 t        |fdd      }|j                  |       |||<    t        |d d      }|D cg c]  } ||d          ||d         f }}||fS c c}w )Nc           	          dj                  | j                  d      D cg c]  }t        |          c}      S c c}w )N zlatin-1)joindecodeord)bcharbyte_encoders     r   token_bytes_to_stringzOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string   s2    77@STLT3TUUTs   <   c                 $    | d      | d      fS )Nr   r!    )x	bpe_rankss    r   <lambda>zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>-   s    Yqt_iPQRSPTo4V r   F)keyreversec                     | d   S )N   r#   )vals    r   r&   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>1   s
    A r   r   )	r	   	enumerateitemsr   lenrangeappendsortedextend)r   r   r    mergesidxtokenranklocalindexpiece_lpiece_rr+   r%   r   s               @@r   extract_vocab_merges_from_modelz0MistralConverter.extract_vocab_merges_from_model   sK   	')	V "+IOO,="> 	#C%D:::69+E23u:?"1c%j1 ?E',Ve}eEFmWG)+90D'T[J[`iIigw%=>? u*V`efe$"e	# $6F\bcUX(Q02GA2OPccf} ds   +Dc                     | j                  | j                        \  }}t        t        ||d            }t	        |j
                  d      rd|j
                  _        |S )NF)fuse_unkignore_mergesT)r;   r   r   r   hasattrmodelr>   )r   vocab_scoresr3   	tokenizers       r   rB   zMistralConverter.tokenizer5   sN    #CCDJJOfc,GH	9??O4,0IOO)r   returnc                    | j                         }t        j                  t        j                  t	        | j
                        dd      t        j                  | j                  d      g      |_        t        j                         |_
        |j                  | j                         t        j                  d      |_        |S )NisolatedF)behaviorinvert)r   	use_regex)trim_offsets)rB   r   SequenceSplitr   r   	ByteLevelr   pre_tokenizerr   decoderadd_special_tokensr   r   post_processor)r   rB   s     r   	convertedzMistralConverter.converted<   s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$T%C%CD#-#7#7U#K	 r   )Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)
__name__
__module____qualname____doc__r   strr;   rB   r   rQ   r#   r   r   r   r      s;      K"&CS 69 r   r   tokenizer_filec                 H   ddl m} ddlm} |j	                  |       }|j
                  j                  j                  }t        |j
                  j                  j                  d       }|D cg c]  }|d   	 }}t        |      D ci c]  \  }}||
 }	}}|	j                  |       |	}|j
                  j                  j                  j                  }
t        t        |||
      j!                               }|j#                  d	|i       |j$                  j&                  |j(                  j&                  |j*                  j&                  |j,                  j&                  d
}|j/                         D ]  \  }}||v s|j#                  ||i        |S c c}w c c}}w )z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )SpecialTokens)MistralTokenizerc                     | d   S )Nr6   r#   )r$   s    r   r&   z*convert_tekken_tokenizer.<locals>.<lambda>X   s    mnoumv r   )r'   	token_str)r   r   r   )tokenizer_objectr   )	bos_token	eos_token	pad_token	unk_token)%mistral_common.tokens.tokenizers.baserY   (mistral_common.tokens.tokenizers.mistralrZ   	from_fileinstruct_tokenizerrB   _tekken_token2id_nospecialr1   _all_special_tokensr,   update_model_pat_strr
   r   rQ   rO   bosvalueeospadunkr-   )rW   rY   rZ   mistral_tokenizerr   sorted_tokensr5   all_specialr4   specials_tokensr   rB   
MAP_SPECALspecial_keyspecial_tokens                  r   convert_tekken_tokenizerrw   L   s    DI )22>B 00::UUE,??II]]cvwM3@A%5%AKA4=k4JKjc5uczKOK5!E  22<<CCLLG ();

)+I   "={!KL #&&,,"&&,,"&&,,"&&,,	J '1&6&6&8 G"]K'((+})EFG A BKs   .F
FN)
tokenizersr   r   r   r   r   tokenizers.modelsr   #transformers.convert_slow_tokenizerr	   *transformers.tokenization_utils_tokenizersr
   r   rV   rw   r#   r   r   <module>r|      s-    M M ! @ NA AH-S -r   