
    ciS?                         d Z ddlmZmZmZ ddlmZ ddlmZ  ej                  e
      ZdZdeeeef      fdZ G d d	ej                        Z G d
 de      Z G d de      Zy)zBMetric prompt template classes for model-based metrics evaluation.    )DictListOptional)base)prompt_template
elementsc                 n    | yt         j                  d t        | j                               D              S )z?Serializes dictionary to ordered string value without brackets. c              3   0   K   | ]  \  }}| d |   yw)z: N ).0keyvalues      Y/tmp/pip-target-z3e9_cxr/lib/python/vertexai/evaluation/metrics/metric_prompt_template.py	<genexpr>z*serialize_dict_in_order.<locals>.<genexpr>#   s     VzsEC55'*Vs   )_NEWLINEjoinsorteditems)r	   s    r   serialize_dict_in_orderr      s,    ==VVHNNDT=UVVV    c                       e Zd ZdZddddddeeef   deeef   dee   dee   deeeef      d	ee   d
eee      fdZe	defd       Z
y)_MetricPromptTemplatezBMetric prompt template for generic model-based metrics evaluation.N)instructionevaluation_stepsmetric_definitionfew_shot_examplescriteriarating_rubricinput_variablesr   r   r   r   c                    || _         || _        || _        || _        || _        || _        || _        | j                         | _        y)z%Initializes a metric prompt template.N)	_input_variables_instruction_metric_definition	_criteria_rating_rubric_evaluation_steps_few_shot_examples__str__template)selfr   r    r!   r   r   r   r   s           r   __init__z_MetricPromptTemplate.__init__)   sH     !0'"3!+!1"3r   returnc                     | j                   S )N)r+   r,   s    r   prompt_dataz!_MetricPromptTemplate.prompt_data@   s    }}r   )__name__
__module____qualname____doc__r   strr   r   r-   propertyr1   r   r   r   r   r   &   s    L &*59+/15' sCx.' CH~	'
 c' c]' #4S>2' $C=' $DI.'. S  r   r   c                        e Zd ZdZdddddddeeef   deeef   deee      dee   dee   d	eeeef      d
eee      f fdZdefdZ	deeef   fdZ
d Zd Z xZS )PointwiseMetricPromptTemplatezCPointwise metric prompt template for pointwise model-based metrics.Nr!   r   r   r   r   r   r    r!   r   r   r   r   c          	          |sg }t         j                  d       t        t        |dgz               }|xs | j	                         }|xs | j                         }t        |   |||||||       y)a  Initializes a pointwise metric prompt template.

        Args:
            criteria: The standards and measures used to evaluate the model
              responses. It is a dictionary of criterion names and criterion
              definitions.
            rating_rubric: A dictionary mapping of rating name and rating
              definition, used to assign ratings or scores based on specific
              criteria.
            input_variables: An optional list of input fields to use in the metric
              prompt template for generating model-based evaluation results. Model
              "response" column is included by default. If metric_column_mapping is
              provided, the mapping values of the input fields will be used to
              retrieve data from the evaluation dataset.
            instruction: The general instruction to the model that performs the
              evaluation. If not provided, a default pointwise metric instruction
              will be used.
            metric_definition: The optional metric definition. It is a string
              describing the metric to be evaluated at a high level. If not
              provided, this field will not be included in the prompt template.
            evaluation_steps: The optional gudelines of evaluation steps. A
              dictionary of evaluation step name and evaluation step definition. If
              not provided, a default pointwise metric evaluation steps will be
              used.
            few_shot_examples: The optional list of few-shot examples to be used in
              the prompt, to provide the model with demonstrations of how to perform
              the evaluation, and improve the evaluation accuracy. If not provided,
              this field will not be included in the prompt template.
        zsThe `input_variables` parameter is empty. Only the `response` column is used for computing this model-based metric.responser!   r   r    r   r   r   r   N)_LOGGERinfolistset!get_default_pointwise_instruction&get_default_pointwise_evaluation_stepssuperr-   	r,   r   r    r!   r   r   r   r   	__class__s	           r   r-   z&PointwiseMetricPromptTemplate.__init__H   s    P  OLLI s?j\#ABC!MT%K%K%M M K K M 	 	+'#/-/ 	 	
r   r.   c                      	 y)?Returns the default instruction for the metric prompt template.a   You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with the user prompt and an AI-generated responses.
You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.r   r0   s    r   rB   z?PointwiseMetricPromptTemplate.get_default_pointwise_instruction   s    
	
r   c                     dddS )DReturns the default evaluation steps for the metric prompt template.zhAssess the response in aspects of all criteria provided. Provide assessment according to each criterion.zzScore based on the rating rubric. Give a brief rationale to explain your evaluation considering each individual criterion.)Step 1Step 2r   r0   s    r   rC   zDPointwiseMetricPromptTemplate.get_default_pointwise_evaluation_steps   s    ;R	
 		
r   c                    d| j                    t        dg}| j                  r |j                  d| j                   dg       |j                  dt	        | j
                         ddt	        | j                         dg       | j                  r)|j                  dt	        | j                         dg       | j                  r3|j                  dt        j                  | j                         dg       |j                  d	d
g       | j                  D ]"  }|dk(  r	|j                  d| d| dg       $ |j                  t        ddg       t        j                  |      S )z<Serializes the pointwise metric prompt template to a string.# Instruction# Evaluation## Metric Definitionr   ## Criteria## Rating Rubric## Evaluation Steps## Evaluation Examplesz(
# User Inputs and AI-generated Response## User Inputsr<   ### {}
z
## AI-generated Response
{response}r$   r   r%   extendr   r&   r'   r(   r)   r   r#   r,   metric_prompt_template_strinput_variables      r   r*   z%PointwiseMetricPromptTemplate.__str__   s      !	&
" ""&--*../r2 	#))*4>>:;2>"*4+>+>?@C		
 !!&--).t/E/EFGrJ ""&--,}}T%<%<=>bA 	#))8:JK	
 #33 	N+&-->*+(-	 	#)),	
 }}788r   c                 <    d| j                    d| j                   dS )Nz*PointwiseMetricPromptTemplate(prompt_data=, variables=)r1   	variablesr0   s    r   __repr__z&PointwiseMetricPromptTemplate.__repr__   s,    89I9I8J K..),	
r   )r2   r3   r4   r5   r   r6   r   r   r-   rB   rC   r*   rd   __classcell__rF   s   @r   r9   r9   E   s    M 04%)+/5915>
 sCx.>
 CH~	>

 "$s),>
 c]>
 $C=>
 #4S>2>
 $DI.>
@
3 

S#X 
89t
r   r9   c                        e Zd ZdZdddddddeeef   deeef   deee      dee   dee   d	eeeef      d
eee      f fdZdefdZ	deeef   fdZ
d Zd Z xZS )PairwiseMetricPromptTemplatezAPairwise metric prompt template for pairwise model-based metrics.Nr:   r   r    r!   r   r   r   r   c          	          |sg }t         j                  d       t        t        |ddgz               }|xs | j	                         }|xs | j                         }t        |   |||||||       y)aH  Initializes a pairwise metric prompt template.

        Args:
            criteria: The standards and measures used to evaluate the model
              responses. It is a dictionary of criterion names and criterion
              definitions.
            rating_rubric: A dictionary mapping of rating name and rating
              definition, used to assign ratings or scores based on specific
              criteria.
            input_variables: An optional list of input fields to use in the metric
              prompt template for generating model-based evaluation results.
              Candidate model "response" column and "baseline_model_response" column
              are included by default. If metric_column_mapping is provided, the
              mapping values of the input fields will be used to retrieve data from
              the evaluation dataset.
            instruction: The general instruction to the model that performs the
              evaluation. If not provided, a default pairwise metric instruction
              will be used.
            metric_definition: The optional metric definition. It is a string
              describing the metric to be evaluated at a high level. If not
              provided, this field will not be included in the prompt template.
            evaluation_steps: The optional gudelines of evaluation steps. A
              dictionary of evaluation step name and evaluation step definition. If
              not provided, a default pairwise metric evaluation steps will be used.
            few_shot_examples: The optional list of few-shot examples to be used in
              the prompt, to provide the model with demonstrations of how to perform
              the evaluation, and improve the evaluation accuracy. If not provided,
              this field will not be included in the prompt template.
        zThe `input_variables` parameter is empty. Only the `response` and `baseline_model_response` columns are used for computing this model-based metric.r<   baseline_model_responser=   N)r>   r?   r@   rA    get_default_pairwise_instruction%get_default_pairwise_evaluation_stepsrD   r-   rE   s	           r   r-   z%PairwiseMetricPromptTemplate.__init__   s    P  OLL,
 :/H"IIJ
 "LT%J%J%L L J J L 	 	+'#/-/ 	 	
r   r.   c                      	 y)rH   a  You are an expert evaluator. Your task is to evaluate the quality of the responses generated by two AI models. We will provide you with the user input and a pair of AI-generated responses (Response A and Response B).
You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on based on the Criteria provided in the Evaluation section below.
You will first judge responses individually, following the Rating Rubric and Evaluation Steps. Then you will give step by step explanations for your judgement, compare results to declare the winner based on the Rating Rubric and Evaluation Steps.r   r0   s    r   rk   z=PairwiseMetricPromptTemplate.get_default_pairwise_instruction+  s    	G	
r   c                     ddddddS )rJ   z-Analyze Response A based on all the Criteria.z-Analyze Response B based on all the Criteria.zcCompare the overall performance of Response A and Response B based on your analyses and assessment.zjOutput your preference of "A", "SAME" or "B" to the pairwise_choice field according to the Rating Rubrics.z9Output your assessment reasoning in the explanation field)rK   rL   zStep 3zStep 4zStep 5r   r0   s    r   rl   zBPairwiseMetricPromptTemplate.get_default_pairwise_evaluation_steps;  s&     FE4J R
 	
r   c                    d| j                    t        dg}| j                  r |j                  d| j                   dg       |j                  dt	        | j
                         ddt	        | j                         dg       | j                  r)|j                  dt	        | j                         dg       | j                  r3|j                  dt        j                  | j                         dg       |j                  d	d
g       | j                  D ]!  }|dv r|j                  d| d| dg       # |j                  g d       t        j                  |      S )z;Serializes the pairwise metric prompt template to a string.rN   rO   rP   r   rQ   rR   rS   rT   z)
# User Inputs and AI-generated ResponsesrU   )r<   rj   rV   rW   rX   )z
## AI-generated Responsesz### Response Az{baseline_model_response}
z### Response BrY   rZ   r\   s      r   r*   z$PairwiseMetricPromptTemplate.__str__K  s      !	&
" ""&--*../r2 	#))*4>>:;2>"*4+>+>?@C		
 !!&--).t/E/EFGrJ ""&--,}}T%<%<=>bA 	#))9;KL	
 #33 	N!HH&-->*+(-	 	#))	
 }}788r   c                 <    d| j                    d| j                   dS )Nz)PairwiseMetricPromptTemplate(prompt_data=r`   ra   rb   r0   s    r   rd   z%PairwiseMetricPromptTemplate.__repr__  s,    78H8H7I J..),	
r   )r2   r3   r4   r5   r   r6   r   r   r-   rk   rl   r*   rd   re   rf   s   @r   rh   rh      s    K 04%)+/5915A
 sCx.A
 CH~	A

 "$s),A
 c]A
 $C=A
 #4S>2A
 $DI.A
F
# 
 
tCH~ 
 :9x
r   rh   N)r5   typingr   r   r   google.cloud.aiplatformr   vertexai.evaluationr   Loggerr2   r>   r   r6   r   PromptTemplater   r9   rh   r   r   r   <module>rv      s|   " I ' ' (
 $++h
WhtCH~&> WO:: >]
$9 ]
@f
#8 f
r   