
    ci*W                        d dl Z d dlmZmZmZmZmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlmZmZmZ d dlmZmZmZ d dlmZ d dlmZ  e j:                  d	       G d
 d             Z e j:                  d	       G d d             Z G d de      Z  e j:                  d	       G d d             Z!de"de#fdZ$de"dejJ                  jL                  jN                  de(fdZ) G d d      Z*dedeejV                     fdZ,dedee"   fdZ-defdZ.dede/fdZ0dedee"   fdZ1 G d d       Z2 G d! d"      Z3 G d# d$e3      Z4d%e"de4fd&Z5d%e"de3fd'Z6y)(    N)IterableListSequenceOptionalUnion)ContentsTypeImageTool	PartsType)_validate_contents_type_as_valid_sequence _content_types_to_gapic_contents_to_content)get_sentencepieceget_tokenizer_nameload_model_proto)contenttoolopenapi)sentencepiece_model_pb2)
struct_pb2T)frozenc                   <    e Zd ZU ee   ed<   ee   ed<   dZeed<   y)
TokensInfo	token_idstokensNrole)	__name__
__module____qualname__r   int__annotations__bytesr   str     H/tmp/pip-target-z3e9_cxr/lib/python/vertexai/tokenization/_tokenizers.pyr   r   1   s     }UOD#r%   r   c                   "    e Zd ZU dZee   ed<   y)ComputeTokensResulta  Represents token string pieces and ids output in compute_tokens function.

    Attributes:
        tokens_info: Lists of tokens_info from the input.
            The input `contents: ContentsType` could have
            multiple string instances and each tokens_info
            item represents each string instance. Each token
            info consists tokens list, token_ids list and
            a role.
    tokens_infoN)r   r   r   __doc__r   r   r!   r$   r%   r&   r(   r(   8   s    	 *%%r%   r(   c                       e Zd Zdee   fdZy)PreviewComputeTokensResultreturnc                 V    dd l }d}|j                  |t        d       | j                  S )Nr   zfPreviewComputeTokensResult.token_info_list is deprecated. Use ComputeTokensResult.tokens_info instead.   )
stacklevel)warningswarnDeprecationWarningr)   )selfr1   messages      r&   token_info_listz*PreviewComputeTokensResult.token_info_listI   s)    zg1a@r%   N)r   r   r   r   r   r6   r$   r%   r&   r,   r,   H   s     *!5  r%   r,   c                       e Zd ZU dZeed<   y)CountTokensResultzRepresents an token numbers output in count_tokens function.

    Attributes:
        total_tokens: number of total tokens.
    total_tokensN)r   r   r   r*   r    r!   r$   r%   r&   r8   r8   Q   s     r%   r8   tokenr-   c                    t        |       dk7  rt        d|        | j                  d      r| j                  d      st        d|        	 t	        | dd d      }|d
k\  rt        d|        |S # t        $ r t        d	|        w xY w)zParses a hex byte string of the form '<0xXX>' and returns the integer value.

    Raises ValueError if the input is malformed or the byte value is invalid.
       zInvalid byte length: z<0x>zInvalid byte format:          zInvalid hex value:    zByte value out of range: )len
ValueError
startswithendswithr    )r:   vals     r&   _parse_hex_byterG   \   s     5zQ0899E"%..*=08998%!*b! cz4UG<==J  8.ug6778s   A2 2B
typec                     |t         j                  j                  j                  j                  k(  rt        |       j                  dd      S | j                  dd      j                  d      S )N   big)length	byteorderu   ▁ zutf-8)	r   
ModelProtoSentencePieceTypeBYTErG   to_bytesreplaceencode)r:   rH   s     r&   _token_str_to_bytesrV   r   s\     &11??DDIIIu%..a5.II}}UC(//88r%   c                   R    e Zd ZdZdefdZdee   defdZdee   dee   de	fdZ
y	)
_SentencePieceAdaptorz<An internal tokenizer that can parse text input into tokens.tokenizer_namec                 D    t        |      | _        t        |      | _        y)z_Initializes the tokenizer.

        Args:
            name: The name of the tokenizer.
        N)r   _model_protor   
_tokenizerr4   rY   s     r&   __init__z_SentencePieceAdaptor.__init__~   s     -^<+N;r%   contentsr-   c                     | j                   j                  t        |            }t        t	        d |D                    S )z)Counts the number of tokens in the input.c              3   2   K   | ]  }t        |        y wN)rB   ).0r   s     r&   	<genexpr>z5_SentencePieceAdaptor.count_tokens.<locals>.<genexpr>   s     CVS[Cs   )r9   )r\   rU   listr8   sum)r4   r_   tokens_lists      r&   count_tokensz"_SentencePieceAdaptor.count_tokens   s5    oo,,T(^< C{CC
 	
r%   rolesc                   t        |      }| j                  j                  |      }t        |      }g }t        ||      D ]  \  }}|j	                  t        |j                  D cg c]  }|j                   c}|j                  D cg c]C  }t        |j                  | j                  j                  |j                     j                        E c}|              t        |      S c c}w c c}w )z7Computes the tokens ids and string pieces in the input.)r   r   r   r)   )re   r\   EncodeAsImmutableProtozipappendr   piecesidrV   piecer[   rH   r(   )	r4   r_   ri   content_listtokens_protostoken_infostokens_protor   rq   s	            r&   compute_tokensz$_SentencePieceAdaptor.compute_tokens   s     H~>>|LU"%mU"; 	L$5A5H5HIEuxxI
 &2%8%8	 " ,!KK):):)A)A%(()K)P)P 		 #{;; Js   $C*AC/N)r   r   r   r*   r#   r^   r   r8   rh   r(   rv   r$   r%   r&   rX   rX   {   sN    G<s <
Xc] 
7H 
<#C=<19#<	<r%   rX   r_   c                 H    t        |        t        |        t        |       }|S )zGConverts a GenerativeModel compatible contents type to a gapic content.)r   _assert_no_image_contents_typer   )r_   gapic_contentss     r&   _to_gapic_contentsrz      s$     .h7"8,5h?Nr%   c              #   n   K   t        |       }|D ]!  }|j                  D ]  }|j                    # y wrb   )rz   partsr   )r_   ry   r   parts       r&   _content_types_to_role_iteratorr~      s<     '1N! MM 	D,,	s   35c                     t        | t              s"t        | t              rt        d | D              rt	        d      yy)zBAsserts that the contents type does not contain any image content.c              3   <   K   | ]  }t        |t                y wrb   )
isinstancer	   rc   r   s     r&   rd   z1_assert_no_image_contents_type.<locals>.<genexpr>   s     Cw
7E*C   z-Tokenizers do not support Image content type.N)r   r	   r   anyrC   r_   s    r&   rx   rx      s;    (E"8X&C(CCHII D 	'r%   c                 n    t        | t              xs$ t        | t              xr t        d | D              S )Nc              3   <   K   | ]  }t        |t                y wrb   r   r#   r   s     r&   rd   z$_is_string_inputs.<locals>.<genexpr>   s     AW
7C(Ar   )r   r#   r   allr   s    r&   _is_string_inputsr      s5    8S! 	Bh) BAAAr%   c              #      K   t        | t              rd y t        | t              r*t        d | D              rdgt	        |       z  E d {    y t        |       E d {    y 7 7 w)Nuserc              3   <   K   | ]  }t        |t                y wrb   r   r   s     r&   rd   z&_to_canonical_roles.<locals>.<genexpr>   s      0%,
7C 0r   )r   r#   r   r   rB   r~   r   s    r&   _to_canonical_rolesr      s`     (C 	Hh	'C 0080 - 8c(m+++28<<< 	,<s$   A
A*A&A* A(!A*(A*c                   J   e Zd ZdZd Zdee   fdZdeee   ef   ddfdZ	de
j                  ddfd	Zd
ej                  ddfdZdeej                     ddfdZdej"                  dej"                  fdZdeej"                     ddfdZdeej(                     ddfdZdej(                  ddfdZdej.                  dej.                  fdZdej4                  dej4                  fdZdej:                  dej:                  fdZdej>                  dej>                  fdZ y)_TextsAccumulatoray  Accumulates texts from contents and tools.

    This class is used to accumulate countable texts from contents and tools.
    When user passes a unsupported fields that are added in the future, the new
    fields might be only counted in remote tokenizer. In this case, the local
    tokenizer should know that an unsupported new field exist in the content or
    tool instances and raise error to avoid returning incorrect result to users.

    The machanism to detect unsupported fields introduced in the future: when
    local tokenizer traversing the input instances, it is allowlist based text
    accumulation. When a field is traversed and evaluated to be countable, the
    value of this
    field is copied to two places: (1) self._texts for inputs to sentencepiece
    token count function. (2) a counted instance object in the recursive
    function's return value. That's to say, after done current recurssion,
    the instance(of same type as the input) only keeps the counted values.
    If user sets unsupported future proto fields, they can be detected by
    comparing the input instances equal to counted instances or not.
    c                     g | _         y rb   _textsr4   s    r&   r^   z_TextsAccumulator.__init__   s	    r%   r-   c                     | j                   S rb   r   r   s    r&   	get_textsz_TextsAccumulator.get_texts   s    {{r%   textsNc                     t        |t              r| j                  j                  |       y | j                  j	                  |       y rb   )r   r#   r   rn   extend)r4   r   s     r&   	add_textsz_TextsAccumulator.add_texts   s0    eS!KKu%KKu%r%   r   c                    t        j                         }|j                  D ]  }t        j                         }d|v sd|v rt	        d      d|v r|j
                  |_        d|v r,| j                  |j                         |j                  |_        d|v r,| j                  |j                         |j                  |_	        d|v r6|j                  |_
        | j                  j                  |j                         |j                  j                  |        |j                  |_        |j                  |j                  k7  rt	        d| d	| d
      y )N	file_datainline_dataz1Tokenizers do not support non-text content types.video_metadatafunction_callfunction_responsetextzHContent contains unsupported types for token counting. Supported fields . Got .)gapic_content_typesContentr|   PartrC   r   add_function_callr   add_function_responser   r   r   rn   r   _pb)r4   r   counted_contentr}   counted_parts        r&   add_contentz_TextsAccumulator.add_content   s=   -557MM 	7D.335Ld"mt&; !TUU4'.2.A.A+$&&&t'9'9:-1-?-?*"d***4+A+AB151G1G.~$(II!""499-!!((6	7   '||;;/---Z[jZkkqryqzz{|  .r%   r   c                 <   | j                   j                  |j                         t        j                  |j                        }| j                  |j                  j                        }||_        |j                  |j                  k7  rt        d| d| d      y)zProcesses a function call and adds relevant text to the accumulator.

        Args:
            function_call: The function call to process.
        )namezWFunction call argument contains unsupported types for token counting. Supported fields r   r   N)	r   rn   r   gapic_tool_typesFunctionCall_struct_traverser   argsrC   )r4   r   counted_function_callcounted_structs       r&   r   z#_TextsAccumulator.add_function_call  s     	=--. 0 = ==CUCU V..}/@/@/E/EF%3" $$(9(99ij  jA  AG  HU  GV  VW  X  :r%   function_callsc                 4    |D ]  }| j                  |        y rb   )r   )r4   r   r   s      r&   add_function_callsz$_TextsAccumulator.add_function_calls  s!     , 	2M""=1	2r%   r   c                     t        j                         }|j                  D ].  }| j                  |      }|j                  j	                  |       0 |j
                  |j
                  k7  rt        d| d| d      y )NzNTool argument contains unsupported types for token counting. Supported fields r   r   )r   r
   function_declarations_function_declaration_traversern   r   rC   )r4   r   counted_toolfunction_declarationcounted_function_declarations        r&   add_toolz_TextsAccumulator.add_tool%  s    ',,.$($>$> 	T +/+N+N$,( ..556RS		T
 txx'`am`nntuytzz{|  (r%   toolsc                 4    |D ]  }| j                  |        y rb   )r   )r4   r   r   s      r&   	add_toolsz_TextsAccumulator.add_tools1  s     	 DMM$	 r%   function_responsesc                 4    |D ]  }| j                  |        y rb   )r   )r4   r   r   s      r&   add_function_responsesz(_TextsAccumulator.add_function_responses5  s#     "4 	:&&'89	:r%   r   c                 F   t        j                         }| j                  j                  |j                         | j                  |j                  j                        }|j                  |_        ||_        |j                  |j                  k7  rt        d| d| d      y )Nz[Function response argument contains unsupported types for token counting. Supported fields r   r   )	r   FunctionResponser   rn   r   r   r   responserC   )r4   r   counted_function_responser   s       r&   r   z'_TextsAccumulator.add_function_response;  s     %5$E$E$G!,112../@/D/D/M/MN):)?)?!&-;!*$((,=,A,AAm  oH  nI  IO  Pa  Ob  bc  d  Br%   r   c                    t        j                         }| j                  j                  |j                         |j                  |_        |j
                  r6| j                  j                  |j
                         |j
                  |_        |j                  r"| j                  |j                        }||_        |j                  r"| j                  |j                        }||_        |S rb   )	r   FunctionDeclarationr   rn   r   description
parameters_schema_traverser   )r4   r   r   counted_parameterscounted_responses        r&   r   z0_TextsAccumulator._function_declaration_traverseH  s     (8'K'K'M$/445,@,E,E$)++KK3??@7K7W7W(4**!%!6!67K7V7V!W6H(3((#445I5R5RS4D(1++r%   schemac                 :   t        j                         }d|v r|j                  |_        d|v r|j                  |_        d|v r|j                  |_        d|v r6| j
                  j                  |j                         |j                  |_        d|v r6| j
                  j                  |j                         |j                  |_        d|v r6| j
                  j                  |j                         |j                  |_
        d|v r6| j
                  j                  |j                         |j                  |_        d|v r|j                  |_        d	|v r"| j                  |j                        }||_        d
|v rpi }|j                  j                         D ]6  \  }}| j
                  j                  |       | j                  |      }|||<   8 |j                  j!                  |       d|v r,| j#                  |j$                  j&                        }||_        |S )zProcesses a schema and adds relevant text to the accumulator.

        Args:
            schema: The schema to process.

        Returns:
            The new schema object with only countable fields.
        type_titledefaultformat_r   enumrequiredproperty_orderingitems
propertiesexample)r   SchemarH   r   r   r   rn   r   r   r   r   r   r   r   r   r   update_value_traverser   r   )	r4   r   counted_schemacounted_schema_itemsdkeyvaluecounted_valuecounted_schema_examples	            r&   r   z"_TextsAccumulator._schema_traverseY  s    !)f"(++Nf#)<<N %+^^N"KKv~~.%+^^N"F"KKv112)/););N&VKKv{{+"(++NKKv/&,ooN#&(/5/G/GN,f#'#8#8#F #7N 6!A$//557 '
U""3' $ 5 5e <&#' %%,,Q/%)%9%9&**:L:L%M"%;N"r%   structc                    t        j                         }| j                  j                  t	        |j
                  j                                      |j
                  j                         D ]T  \  }}| j                  |      }t        |t              r||j
                  |<   7|j
                  |   j                  |       V |S )zProcesses a struct and adds relevant text to the accumulator.

        Args:
            struct: The struct to process.

        Returns:
            The new struct object with only countable fields.
        )r   Structr   r   re   fieldskeysr   r   r   r#   	MergeFrom)r4   r   r   r   rF   counted_struct_fieldss         r&   r   z"_TextsAccumulator._struct_traverse  s     $**,4 2 2 456++- 	LHC$($8$8$=!/5-B%%c*%%c*445JK	L r%   r   c                 :   |j                  d      }t        j                         }|dk(  r8| j                  j	                  |j
                         |j
                  |_        |S |dk(  r8| j                  |j                        }|j                  j                  |       |S |dk(  rvt        j                         }|j                  j                  D ].  }| j                  |      }|j                  j	                  |       0 |j                  j                  |       |S )zProcesses a struct field and adds relevant text to the accumulator.

        Args:
            struct: The struct field to process.

        Returns:
            The new struct field object with only countable fields.
        kindstring_valuestruct_value
list_value)
WhichOneofr   Valuer   rn   r   r   r   r   	ListValuer   valuesr   )r4   r   r   r   r   counted_list_valueitems          r&   r   z!_TextsAccumulator._value_traverse  s    '"((*>!KKu112).););M&  ^#!2253E3EFN&&00@  \!!+!5!5!7((// @ $ 4 4T :"))00?@ $$../ABr%   )!r   r   r   r*   r^   r   r#   r   r   r   r   r   r   r   r   r   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r$   r%   r&   r   r      s   (8C= &uXc]C%78 &T &#6#>#> 4 0/?/L/L QU 2&'7'D'DE2	2
-22 
7G7L7L 
 x(8(=(=>  4  :"*+;+L+L"M:	:!1!B!B	,$4$H$H,		-	-,"+w~~ +'.. +Zz'8'8 Z=N=N &Z%5%5 *:J:J r%   r   c            	       ^    e Zd ZdZdefdZddddedeed      d	ee	   d
e
fdZded
efdZy)	Tokenizerz,A tokenizer that can parse text into tokens.rY   c                 $    t        |      | _        y)zInitializes the tokenizer.

        Do not use this constructor directly. Use get_tokenizer_for_model instead.

        Args:
            name: The name of the tokenizer.

        N)rX   _sentencepiece_adapterr]   s     r&   r^   zTokenizer.__init__  s     '<N&K#r%   N)r   system_instructionr_   r   r
   r   r-   c                   t               }t        |      r|j                  |       n#t        |      }|D ]  }|j	                  |        |r|j                  d |D               |r7t        |      r|j                  |       n|j	                  t        |             | j                  j                  |j                               S )a  Counts the number of tokens in the text-only contents.

        Args:
            contents: The contents to count tokens for.
                Supports either a list of Content objects (passing a multi-turn
                conversation) or a value that can be converted to a single
                Content object (passing a single message).
                Supports
                * str, Part,
                * List[Union[str, Part]],
                * List[Content]
                Throws an error if the contents contain non-text content.
            tools: A list of tools (functions) that the model can try calling.
            system_instruction: The provided system instructions for the model.
                Note: only text should be used in parts and content in each part
                will be in a separate paragraph.

        Returns:
            A CountTokensResult object containing the total number of tokens in
            the contents.
        c              3   4   K   | ]  }|j                     y wrb   )	_raw_tool)rc   r   s     r&   rd   z)Tokenizer.count_tokens.<locals>.<genexpr>  s     'I4'Is   )
r   r   r   rz   r   r   r   r   rh   r   )r4   r_   r   r   text_accumulatorry   r   s          r&   rh   zTokenizer.count_tokens  s    : -.X&&&x0/9N) 6 ,,W56 &&'I5'IJ !34 **+=> ,,[9K-LM**778H8R8R8TUUr%   c                     t               }t        |      r|j                  |       n#t        |      }|D ]  }|j	                  |        | j
                  j                  |j                         t        |            S )a"  Computes the tokens ids and string pieces in the text-only contents.

        Args:
            contents: The contents to count tokens for.
                Supports either a list of Content objects (passing a multi-turn
                conversation) or a value that can be converted to a single
                Content object (passing a single message).
                Supports
                * str, Part,
                * List[Union[str, Part]],
                * List[Content]
                Throws an error if the contents contain non-text content.

        Returns:
            A ComputeTokensResult object containing the tokens ids and string
            pieces in the contents.

        Examples:
            compute_tokens(["hello world", "what's the weather today"])
            outputs:
            ComputeTokensResult(tokens_info=[TokensInfo(token_ids=[17534, 2134], tokens=[b'hello', b' world'], role='user'), TokensInfo(token_ids=[84264, 235341], tokens=[b'Goodbye', b'!'], role='user')], token_info_list=...The same as tokens_info)

        )r_   ri   )	r   r   r   rz   r   r   rv   r   r   )r4   r_   r   ry   r   s        r&   rv   zTokenizer.compute_tokens  s~    0 -.X&&&x0/9N) 6 ,,W56 **99%//1%h/ : 
 	
r%   )r   r   r   r*   r#   r^   r   r   r   r   r8   rh   r(   rv   r$   r%   r&   r   r     sn    6	Ls 	L )-26.V.V V%	.V
 %Y/.V 
.V`#
| #
8K #
r%   r   c                   (     e Zd Zdedef fdZ xZS )PreviewTokenizerr_   r-   c                 J    t        t        | 	  |      j                        S )Nrk   )r,   superrv   r)   )r4   r_   	__class__s     r&   rv   zPreviewTokenizer.compute_tokens  s#    ).x8DD
 	
r%   )r   r   r   r   r,   rv   __classcell__)r  s   @r&   r  r    s    
| 
8R 
 
r%   r  
model_namec                 D    | st        d      t        t        |             S a  Returns a tokenizer for the given tokenizer name.

    Usage:
        ```
        tokenizer = get_tokenizer_for_model("gemini-1.5-pro-001")
        print(tokenizer.count_tokens("Hello world!"))
        ```

    Supported models can be found at
    https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models.

    Args:
        model_name: Specify the tokenizer is from which model.
    zmodel_name must not be empty.)rC   r  r   r  s    r&    _get_tokenizer_for_model_previewr    s$     899.z:;;r%   c                 D    | st        d      t        t        |             S r
  )rC   r   r   r  s    r&   get_tokenizer_for_modelr  3  s$     899'
344r%   )7dataclassestypingr   r   r   r   r   -vertexai.generative_models._generative_modelsr   r	   r
   r   r   r   r   (vertexai.tokenization._tokenizer_loadingr   r   r   %google.cloud.aiplatform_v1beta1.typesr   r   r   r   r   sentencepiecer   google.protobufr   	dataclassr   r(   r,   r8   r#   r    rG   rO   rP   rQ   r"   rV   rX   r   rz   r~   rx   boolr   r   r   r   r  r  r  r$   r%   r&   <module>r     s        
 
 2 & d#  $ d#& & $& !4   d#  $3 3 ,99-88FFKK9
9*< *<Z	

%
%&l x} J\ J  =, =8C= =Y Yxa
 a
H
y 
< <9I <*5 5	 5r%   