
    ciR              	          d Z ddlZddlZddlmZ ddlZddlZddlZddlZddl	m
Z
mZmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ  ej8                  e      ZdZdee ej6                  jB                        de"fdZ#	 d;deejH                     de%dee%   fdZ&	 d<de%de ejN                     de"dejP                  fdZ) G d dejT                        Z+ G d de+      Z, G d de+      Z- G d d e+      Z. G d! d"e+      Z/ G d# d$e+      Z0 G d% d&e+      Z1d' e1fd( e/fd) e,fd* e-fd+ e0fd, e.fgZ2 ed-e,e-e.e/e1e0      Z3d.d/d0ejh                  dee3e
f   fd1Z5d2ejl                  de7e%e
f   fd3Z8d4e e+   d5e ejr                     de ejP                     fd6Z: G d7 d8ejv                        Z<d9e<dejl                  fd:Z=y)=z*Handlers for computing evaluation metrics.    N)futures)AnyCallableOptionalTypeVarUnion)errors)_common)types)tqdm)override   )_evals_common)_evals_constant)evals   intermediate_eventsreturnc                     | sy| D ]]  }|j                   s|j                   j                  s'|j                   j                  D ]  }t        |d      s|j                  s  y _ y)z?Checks if any event in intermediate_events has a function call.Ffunction_callT)contentpartshasattrr   )r   eventparts      M/tmp/pip-target-z3e9_cxr/lib/python/vertexai/_genai/_evals_metric_handlers.py_has_tool_callr   *   s]    $  ==U]]00++  41d6H6H  
     r   warn_propertyc                    | r| j                   syd}d}g }| j                   D ]  }|j                  ddh      }|j                         D ]  \  }}|	||vs|j                  |         t	        |j
                  t              sct        |d      r't	        |j                  t              r|j                  rd}||j
                  z  } |r|rt        j                  d||       |r|S dS )	z?Extracts and concatenates all text parts from a Content object.N FtextthoughtexcludeTzWarning: content contains non-text parts: %s. Returning concatenated %s result from text parts. Inspect individual parts for full content.)r   
model_dumpitemsappend
isinstancer"   strr   r#   boolloggerwarning)	r   r   text_accumulatorany_text_part_foundnon_text_part_namespart_obj	part_dump
field_namefield_values	            r   _extract_text_from_contentr5   6   s    '--MM .''0C'D	'0'8 	;#J&%88'..z:	;
 hmmS)),x//6$$"&-." 2!  	
  3<<r   metric_nameeval_case_metric_resultscalculate_pass_ratec           	         g }d}d}d}|D ]X  }|j                   E|j                  9	 t        |j                        }|j                  |       |dz  }|r
|dk(  r|dz  }T|dz  }Z d}	d}
d}|dkD  r	 t        j                  |      }	|r||z  }|dkD  r	 t        j                  |      }
t        j                  | t        |      |||	|
|r|      S d      S # t        t
        f$ r* t        j                  d|j                  |        |dz  }Y w xY w# t        j                  $ r!}t        j                  d| |       Y d}~d}~ww xY w# t        j                  $ r!}t        j                  d| |       Y d}~d}~ww xY w)	z<Default aggregation logic using mean and standard deviation.r   Nr   g      ?zdCould not convert score '%s' to float for metric '%s' during default aggregation. Counting as error.z#Could not calculate mean for %s: %sz$Could not calculate stdev for %s: %s)r6   num_cases_totalnum_cases_validnum_cases_error
mean_scorestdev_score	pass_rate)error_messagescorefloatr(   
ValueError	TypeErrorr,   r-   
statisticsmeanStatisticsErrorstdevr   AggregatedMetricResultlen)r6   r7   r8   scores	num_error	num_validnum_passingresultrA   r=   r>   r?   es                r   _default_aggregate_scoresrQ   ]   s    FIIK* 'FLL,Dfll+e$Q	&5C<1$K NI#& JKI1}	R#0J #i/I1}	S$**62K ''45!!2)  9= ; 	* ?LL	 Q	$ )) 	RNN@+qQQ	R )) 	SNNA;PQRR	SsA   7C2D E
 5DDE&EE
E>E99E>c                       e Zd ZdZdddej
                  fdZej                  dej                  de
dej                  fd	       Zej                  d
eej                     dej                  fd       Zy)MetricHandlerz(Abstract base class for metric handlers.moduleevals.Evalsmetricc                      || _         || _        y NrT   rV   )selfrT   rV   s      r   __init__zMetricHandler.__init__   s    r   	eval_caseresponse_indexr   c                     t               )z9Processes a single evaluation case for a specific metric.NotImplementedError)rZ   r\   r]   s      r   get_metric_resultzMetricHandler.get_metric_result       
 "##r   r7   c                     t               )z4Aggregates the metric results for a specific metric.r_   rZ   r7   s     r   	aggregatezMetricHandler.aggregate   rb   r   N)__name__
__module____qualname____doc__r   Metricr[   abcabstractmethodEvalCaseintEvalCaseMetricResultra   listrI   re    r   r   rS   rS      s    2} ell  	$$9<$		#	#$ $ 	$(,U-G-G(H$		%	%$ $r   rS   c                       e Zd ZdZ eh d      Zdddej                  f fdZdej                  de
d	eeef   fd
Zedej                  de
d	ej                   fd       Zedeej                      d	ej&                  fd       Z xZS )ComputationMetricHandlerz'Metric handler for computation metrics.>   bleurouge_1exact_matchrouge_l_sumtool_call_validtool_name_matchtool_parameter_kv_matchtool_parameter_key_matchrT   rU   rV   c                     t         |   ||       | j                  j                  | j                  vr#t        d| j                  j                   d      y )NrY   Metric 'z#' is not supported for computation.)superr[   rV   nameSUPPORTED_COMPUTATION_METRICSrC   rZ   rT   rV   	__class__s      r   r[   z!ComputationMetricHandler.__init__   sX    v6;;4#E#EE4;;++,,OP  Fr   r\   r]   r   c                    i }|t        |j                        k\  r%t        d| dt        |j                         d      |j                  t        d|j                   d      |j                  |   }t        |j                         t        d| d|j                  xs d d      |j                  t        |j                  j                        t        d	|j                  xs d d      t        j                  d
|       | j                  j                  r| j                  j                  j                  d      rZd| j                  j                  dk(  rdndit        |j                        t        |j                  j                        dgd|d<   nSi t        |j                        t        |j                  j                        dgd|| j                  j                   d<   t        j                  d|       |S )5Builds the request parameters for evaluate instances.response_index " out of bounds for eval_case with  responses.)No responses found for eval_case with ID .$Response text missing for candidate  in eval_case 
Unknown ID%Reference text missing for eval_case zeval_case: %srouge
rouge_typerw   	rougeLsumrouge1)
prediction	reference)metric_spec	instancesrouge_input_inputzrequest_payload: %s)rJ   	responses
IndexErrorrC   eval_case_idr5   responser   r,   debugrV   r   
startswith)rZ   r\   r]   request_payloadcurrent_response_candidates        r   _build_request_payloadz/ComputationMetricHandler._build_request_payload   s.    S!4!455!.!1 2	++,-[:  &;I<R<R;SSTU  &/%8%8%H"%&@&I&IJR6~6F G**:l;1>  '))*=*=*F*FGO**:l;1>  	_i0;; 0 0 ; ;G D !'+{{'7'7='Hh  'A6??' &@%//88&		.OM*&  " 'A6??' &@%//88&		<Ot{{//078 	*O<r   c                 @   | j                   j                  }t        j                  d||j	                  d             | j
                  j                  | j                  ||            j	                  d      }t        j                  d|       d}|j                         D ]N  \  }}t        |t              s|s|j                         D ]"  \  }}t        |t              s|s|d   d   } N P t        j                  d	|       t        j                  ||
      S )zEProcesses a single evaluation case for a specific computation metric.z6ComputationMetricHandler: Processing '%s' for case: %sTexclude_nonemetric_configzresponse: %sNr   rA   zMetric result: %s)r6   rA   )rV   r   r,   r   r&   rT   evaluate_instancesr   r'   r)   dictrp   r   ro   )	rZ   r\   r]   r6   r   rA   _result_valuemetric_values	            r   ra   z*ComputationMetricHandler.get_metric_result  s    kk&&D  d 3	

 ;;1155iP 2 

*$*
' 	 	^X.'~~/ 	OA|,-,'3'9'9'; OA|!,5, ,Q 8	 	(%0))#
 	
r   r7   c                     t         j                  d| j                  j                         t	        | j                  j                  |      S )z7Aggregates the metric results for a computation metric.z.Aggregating results for computation metric: %sr,   r   rV   r   rQ   rd   s     r   re   z"ComputationMetricHandler.aggregate%  5    
 	Et{{GWGWX()9)9;STTr   )rf   rg   rh   ri   	frozensetr   r   rj   r[   rm   rn   r   r*   r   r   r   ro   ra   rp   rI   re   __classcell__r   s   @r   rs   rs      s    1$-
	
%!} ell @@9<@	c3h@D 

9<
		#	#
 
8 U(,U-G-G(HU		%	%U Ur   rs   c                       e Zd ZdZ eddh      Zdddej                  f fdZdej                  d	e
d
eeef   fdZedej                  d	e
d
ej                   fd       Zedeej                      d
ej&                  fd       Z xZS )TranslationMetricHandlerz'Metric handler for translation metrics.cometmetricxrT   rU   rV   c                     t         |   ||       | j                  j                  | j                  vr#t        d| j                  j                   d      y )NrY   r}   z#' is not supported for translation.)r~   r[   rV   r   SUPPORTED_TRANSLATION_METRICSrC   r   s      r   r[   z!TranslationMetricHandler.__init__3  sX    v6;;4#E#EE4;;++,,OP  Fr   r\   r]   r   c                    i }| j                   j                   d}d}t        | j                   d      r| j                   j                  }n7| j                   j                  dk(  rd}n| j                   j                  dk(  rd}d}d}t        | j                   d      r| j                   j                  }t        | j                   d	      r| j                   j
                  }|t        |j                        k\  r%t        d
| dt        |j                         d      |j                  t        d|j                   d      |j                  |   }t        |j                         t        d| d|j                  xs d d      |j                  t        |j                  j                        t        d|j                  xs d d      t        |j                        t        d|j                  xs d d      |||dt        |j                        t        |j                  j                        t        |j                        dd||<   |S )r   r   Nversionr   COMET_22_SRC_REFr   METRICX_24_SRC_REFsource_languagetarget_languager   r   r   r   r   r   r   r   r   z;Prompt text (source for translation) missing for eval_case )r   r   r   )r   r   sourcer   instance)rV   r   r   r   r   r   rJ   r   r   rC   r   r5   r   r   prompt)	rZ   r\   r]   r   metric_input_namer   r   r   r   s	            r   r   z/TranslationMetricHandler._build_request_payload;  sp    #{{//074;;	*kk))G[[((G[[**G4;; 12"kk99O4;; 12"kk99OS!4!455!.!1 2	++,-[: 
 &;I<R<R;SSTU  &/%8%8%H"%&@&I&IJR6~6F G**:l;1>  '))*=*=*F*FGO**:l;1>  &i&6&67?**:l;1>  ##2#2 9.77 8	8K8K8T8TU4Y5E5EF.
)* r   c           	      `   | j                   j                  }t        j                  d||       | j                  j                  | j                  ||            }t        j                  d|       d}d}	 |dk(  rQ|r#|j                  r|j                  j                  }nt        j                  d||r|j                  d      nd	       nU|d
k(  rP|r#|j                  r|j                  j                  }n+t        j                  d||r|j                  d      nd	       |-|s+t        j                  d||r|j                  d      nd	       t        j                  |||      S # t        $ r=}t        j                  d|||r|j                  d      nd	d       d| }Y d}~Yd}~ww xY w)zEProcesses a single evaluation case for a specific translation metric.z6TranslationMetricHandler: Processing '%s' for case: %sr   zAPI Response: %sNr   zFComet result missing in API response for metric '%s'. API response: %sTr   Noner   zHMetricX result missing in API response for metric '%s'. API response: %szJScore could not be extracted for translation metric '%s'. API response: %szSError processing/extracting score for translation metric '%s': %s. API response: %sexc_infozError extracting score: )r6   rA   r@   )rV   r   r,   r   rT   r   r   comet_resultrA   r-   model_dump_jsonmetricx_result	Exceptionerrorr   ro   )rZ   r\   r]   r6   api_responserA   r@   rP   s           r   ra   z*TranslationMetricHandler.get_metric_result  s   
 kk&&D	

 {{5555iP 6 
 	'65	;g%L$=$=(55;;ENN,#  , )88d8K!'	 	)L$?$?(77==ENN,#  , )88d8K!'	 }]( ( %44$4G#	2 ))#'
 	
  	;LL$ $ !00d0C   7qc:M	;s   5CE' '	F-03F((F-r7   c                     t         j                  d| j                  j                         t	        | j                  j                  |      S )z7Aggregates the metric results for a translation metric.z.Aggregating results for translation metric: %sr   rd   s     r   re   z"TranslationMetricHandler.aggregate  r   r   )rf   rg   rh   ri   r   r   r   rj   r[   rm   rn   r   r*   r   r   r   ro   ra   rp   rI   re   r   r   s   @r   r   r   .  s    1$-w	.B$C!} ell BB9<B	c3hBH M
M
9<M
		#	#M
 M
^ U(,U-G-G(HU		%	%U Ur   r   c                       e Zd ZdZdddej
                  f fdZdej                  dej                  de
eef   fd	Zdej                  dej                  de
eef   fd
Zde
eef   ddfdZdej                  dede
eef   fdZedej                  dedej&                  fd       Zedeej&                     dej,                  fd       Z xZS )LLMMetricHandlerzMetric handler for LLM metrics.rT   rU   rV   c                 (    t         |   ||       y )NrY   )r~   r[   r   s      r   r[   zLLMMetricHandler.__init__  s    v6r   r\   response_contentr   c                    |j                  dh      }|j                  d      }t        |t              st	        d|j
                   d      |j                  | j                  j                  i       }t        |t              r|j                  dg       }ng }t        |t              s8t        j                  d| j                  j                  |j
                  |       g }|D cg c]!  }t        j                  j                  di |# }}|j                  r|j                  j                  dd	
      gnd|j                  dd	
      g| j                  j                  d|D cg c]  }|j                  dd	
       c}iid}	| j                  j                  | j                  j                  d}
d|
d|	idiS c c}w c c}w )z1Builds the payload for a rubric-based LLM metric.r   r$   rubric_groupsz(Dataset column 'rubric_groups' for case z must be a dictionary.rubricszTRubrics for group '%s' in case %s is not a list: %s. Skipping rubrics for this case.jsonTmoder   N)r   r   r   )metric_prompt_templaterubric_group_keyrubric_based_metric_inputrubric_enhanced_contentsr   rq   )r&   getr)   r   rC   r   rV   rubric_group_namerp   r,   r-   r   r   Rubricr   prompt_template)rZ   r\   r   eval_case_dictrubric_groups_datarubric_group_from_datarubrics_listrparsed_rubricsr   metric_spec_payloads              r   _build_rubric_based_inputz*LLMMetricHandler._build_rubric_based_input  s    #--{m-D+//@,d3:9;Q;Q:R S( ( 
 "4!7!7KK))2"
 ,d3155iDLL,-NN2--&& L;GHa%++,,1q1HH ## !!,,&t,LM)44&t4TU--!/  &tD 0$
 $ '+kk&A&A $ = =
 (279QR*
 	
/ I s   %&F<%Gc                 
   |j                   |d}t        j                  | j                  j                        }|j
                  t        |j                               z
  }|D ]  }t        ||      st        ||      ||<     i }|j                         D ]2  \  }}	g }
t        |	t        j                        r|	g}
nt        |	t        j                        r|	j                  r|	j                  g}
nt        |	t               r&|	r#t        |	d   t        j                        r|	}
nt        |	d   t        j"                  j$                        rg }|	D ]X  }t'        |j(                        }|s|j(                  j*                  xs |j,                  xs d}|j/                  | d|        Z t        j                  t        j0                  dj3                  |            g      g}
nt        j                  t        j0                  t5        j6                  |	            g      g}
nt        |	t8              r@t        j                  t        j0                  t5        j6                  |	            g      g}
n5t        j                  t        j0                  t;        |	            g      g}
t        j<                  |
      ||<   5 t        j>                  t        j@                  |	      
      }d| j                  j                  i}| j                  jB                  d| j                  jB                  i|d<   | j                  jD                  r| j                  jD                  |d<   d||jG                  dd      diS )z7Builds the payload for a standard pointwise LLM metric.)r   r   r"   r   user: 
)r   contents)values)content_map_instancer   return_raw_outputcustom_output_format_configsystem_instructionpointwise_metric_inputr   Tr   r   )$r   r   PromptTemplaterV   r   	variablessetkeysr   getattrr'   r)   genai_typesContentResponseCandidater   rp   r   Messager5   r   roleauthorr(   Partjoinr   dumpsr   r*   ContentMapContentsPointwiseMetricInstance
ContentMapr   judge_model_system_instructionr&   )rZ   r\   r   instance_datatemplate_objrequired_varsvar_namecontent_map_valueskeyvaluecontent_list_to_serializehistory_textsmsg_objmsg_textr   instance_payloadr   s                    r   _build_pointwise_inputz'LLMMetricHandler._build_pointwise_input  s   
  &&(
 ++1L1LM$..]5G5G5I1JJ% 	GHy(+*1)X*Fh'	G  '--/ )	JC(*%%!4!45-2G)E5#:#:;>>160@-E4(UeAh(;(;<05-a%++*=*=>$&M#( H#=goo#N##*??#7#7#S7>>#SVD)00D6H:1FG	H $++#.#3#3=9Q#R"S1- $++#.#3#3E9J#K"L1-
 E4(''*//TZZ5FGH-)  ''{/?/?SZ/P.QR-) ',&>&>2's#O)	V !88!&!1!19K!L
  89T9TU;;((4#T[[%B%BB => ;;55::   45
 %2,77VRV7W'
 	
r   payloadNc                    i }| j                   j                  r| j                   j                  |d<   | j                   j                  r| j                   j                  |d<   | j                   j                  r| j                   j                  |d<   |syd|v r&|d   d   }d|vri |d<   |d   j	                  |       y||d<   y)	z:Adds autorater config to the request payload if specified.autorater_modelgeneration_configsampling_countNr   r   judge_autorater_configautorater_config)rV   judge_modeljudge_model_generation_configjudge_model_sampling_countupdate)rZ   r  r  specs       r   _add_autorater_configz&LLMMetricHandler._add_autorater_configj  s    +-;;""26++2I2I./;;4499 01 ;;11151W1W-.&'167FD't313-.)*112BC*:G&'r   r]   c                 X   |j                   r|t        |j                         k\  rt        d| d      |j                   |   j                  }|st	        d| d      | j
                  j                  r| j                  ||      }n| j                  ||      }| j                  |       |S )=Builds the request parameters for evaluate instances request.r    is out of bounds.'Response content missing for candidate r   )
r   rJ   r   r   rC   rV   r   r   r  r  )rZ   r\   r]   r   r  s        r   r   z'LLMMetricHandler._build_request_payload  s     ""nI<O<O8P&P~.>>PQRR$..~>GG9.9IK  ;;((44Y@PQG11)=MNG""7+r   c                 ~   | j                   j                  }	 | j                  ||      }| j                  j	                  |      }| j                   j
                  rE|j                  }t        j                  ||r|j                  nd|r|j                        S g       S |j                  }t        j                  ||r|j                  nd|r|j                        S d      S # t        $ rM}t        j                  d||j                   d       t        j                  |t#        |            cY d}~S d}~ww xY w)	z=Processes a single evaluation case for a specific LLM metric.r   N)r6   rA   rubric_verdicts)r6   rA   explanationz'Error processing metric %s for case %s.Tr   r6   r@   )rV   r   r   rT   r   r   rubric_based_metric_resultr   ro   rA   r%  pointwise_metric_resultr&  r   r,   r   r   r*   )rZ   r\   r]   r6   r  r   result_datarP   s           r   ra   z"LLMMetricHandler.get_metric_result  s7   
 kk&&	11)^LG{{55G5LH{{,,&AA11 +/:+++CNK$?$?  UW  '>>11 +/:+++;F 7 7  MQ 
  		LL9&&	   --'s1v 		s1   BC& C& !=C& C& &	D</AD71D<7D<r7   c                 j   | j                   j                  rt        | j                   j                        rt        j	                  d| j                   j
                         	 | j                   j                  |      }t        |t              st        d      t        |      }t        |D cg c]  }|j                  | c}      }||z
  }|||d}i ||}t        j                  d
d| j                   j
                  i|S t        j!                  d	| j                   j
                         t        | j                   j
                  |      S c c}w # t        $ rW}	t        j                  d| j                   j
                  |	d       t        | j                   j
                  |      cY d}	~	S d}	~	ww xY w)z/Aggregates the metric results for a LLM metric.z1Using custom aggregate_summary_fn for metric '%s'z.aggregate_summary_fn must return a dictionary.N)r:   r<   r;   r6   zeError executing custom aggregate_summary_fn for metric '%s': %s. Falling back to default aggregation.Tr   z-Using default aggregation for LLM metric '%s'rq   )rV   aggregate_summary_fncallabler,   infor   r)   r   rD   rJ   r@   r   rI   r   r   rQ   r   )
rZ   r7   custom_summary_dictr:   rO   r<   r;   required_fieldsfinal_summary_dictrP   s
             r   re   zLLMMetricHandler.aggregate  s   
 ;;++KK,,1
 KKCT[[EUEU%&*kk&F&F,'# ""5t<#$TUU"%&>"?"% '?"!//; # #2O"C'6'6'6#
 &P%O;N%O"33  $ 0 0(   LL?AQAQ -T[[-=-=?WXXC$  
<KK$$!   1KK$$&> 
s3   !A
E +E AE E 	F2AF-'F2-F2)rf   rg   rh   ri   r   	LLMMetricr[   rm   r   r   r   r*   r   r   r  r  rn   r   r   ro   ra   rp   rI   re   r   r   s   @r   r   r     s5   )7} 7eoo 7;
;
;F;N;N;
	c3h;
zM
M
;F;N;NM
	c3hM
^;T#s(^ ; ;.9<	c3h*   9< 		#	#   D 4Y(,U-G-G(H4Y		%	%4Y 4Yr   r   c                        e Zd ZdZdddej
                  f fdZedej                  de	dej                  fd	       Zed
eej                     dej                  fd       Z xZS )CustomMetricHandlerz"Metric handler for custom metrics.rT   rU   rV   c                 (   t         |   ||       | j                  j                  s#t	        d| j                  j
                   d      t        | j                  j                  t              s#t	        d| j                  j
                   d      y )NrY   zCustomMetricHandler for 'z*' needs  Metric.custom_function to be set.z:' needs  Metric.custom_function to be a callable function.)r~   r[   rV   custom_functionrC   r   r)   r   r   s      r   r[   zCustomMetricHandler.__init__  s    v6{{**+DKK,<,<+= >5 5  $++55x@+DKK,<,<+= >E E  Ar   r\   r]   r   c                    | j                   j                  }t        j                  d||j	                  d             |t        |j                        k\  r@t        j                  | j                   j                  d| d|j                  xs d d      S |j                  st        d	|j                   d
      |j                  |   }|j	                  dhdd      }|j	                  dd      j                  d      |d<   d}d}d}	 | j                   j                  r| j                   j                  |      }	t        |	t        j                        r|	S t        |	t              rd|	v r|	d   }|	j                  dd      }n@t        |	t        t         f      r|	}d}n%d| j                   j                   dt#        |	       }t        j                  | j                   j                  |||      S # t$        $ ro}
| j                   j                  rAt'        | j                   j                  d      r!| j                   j                  j(                  }nd}d| d|
 }d}d}Y d}
~
d}
~
ww xY w)z7Processes a single evaluation case for a custom metric.z1CustomMetricHandler: Processing '%s' for case: %sTr   r   z out of bounds for EvalCase r   r   r'  z	EvalCase z has no responses.r   r   )r%   r   r   r   r   NrA   r&  zCustomFunctionError(z): Returned unexpected type rf   unknown_custom_functionz): r6   rA   r&  r@   )rV   r   r,   r   r&   rJ   r   r   ro   r   rC   r   r6  r)   r   rB   rn   typer   r   rf   )rZ   r\   r]   r6   r   instance_for_custom_fn	error_msgrA   r&  custom_function_resultrP   custom_function_names               r   ra   z%CustomMetricHandler.get_metric_result  s   
 kk&&?  d 3	
 S!4!455-- KK,,%n%5 6!..>,?qB  ""y)?)?(@@RSTT%.%8%8%H"!*!5!5 MT "6 "
 .H-R-Rd .S .

#j/ 	z* 	 	{{**)-)D)D**& 4e6P6PQ115t<#9927;E"8"<"<]D"QK 6E2E"&K /t{{/J/J.K L,,01G,H+IK   ))((##	
 	
  		{{**w++Z0 (,{{'B'B'K'K$'@$./C.DCsKIEK		s!   AG1 A,G1 1	I):A%I$$I)r7   c                     t         j                  d| j                  j                         t	        | j                  j                  |      S )z2Aggregates the metric results for a custom metric.z)Aggregating results for custom metric: %sr   rd   s     r   re   zCustomMetricHandler.aggregateO  s5    
 	@$++BRBRS()9)9;STTr   )rf   rg   rh   ri   r   rj   r[   r   rm   rn   ro   ra   rp   rI   re   r   r   s   @r   r4  r4    s    ,} ell  J
J
9<J
		#	#J
 J
X U(,U-G-G(HU		%	%U Ur   r4  c                       e Zd ZdZdddej
                  f fdZedee	j                     deej                  j                     fd       Zed	ej                  deej                  j                     fd
       Zd	ej                  dedeeef   fdZed	ej                  dedej.                  fd       Zedeej.                     dej4                  fd       Z xZS )PredefinedMetricHandlerz&Metric handler for predefined metrics.rT   rU   rV   c                     t         |   ||       | j                  j                  t        j
                  vr#t        d| j                  j                   d      y )NrY   r}   z'' is not a supported predefined metric.)r~   r[   rV   r   r   SUPPORTED_PREDEFINED_METRICSrC   r   s      r   r[   z PredefinedMetricHandler.__init__[  sX    v6;;?#O#OO4;;++,,ST  Pr   r   r   c                     | syt         j                  j                  t         j                  j                  | g            S )zEConverts a genai_types.Content object to a types.InstanceData object.Nr   )r   r   InstanceDataInstanceDataContents)r   s    r   _content_to_instance_dataz1PredefinedMetricHandler._content_to_instance_datab  s=    
 {{''[[55y5I ( 
 	
r   r\   c                    | j                   s| j                  syd}d}d}g }g }| j                   r| j                   }|j                  r*t        j                  j                  |j                        }|j                  r|j                  }t        j                  j                  |      }|s|r!t        j                  j                  ||      }| j                  r/| j                  D cg c]  }|j                  r|j                   }}t        j                  j                  |      }t        j                  j                  ||      S c c}w )z3Converts an EvalCase object to an AgentData object.Nr   )tool)toolsdeveloper_instruction)r   )agent_configevents)
agent_infor   instructionr   r   rE  tool_declarationsToolsAgentConfigr   Events	AgentData)	r\   rJ  rK  rL  rP  event_contentsrN  r   rM  s	            r   _eval_case_to_agent_dataz0PredefinedMetricHandler._eval_case_to_agent_datam  sC   
 ##I,I,I $"--J%%(-(@(@#// )A )% ++$.$@$@!KK%%+<%=E-${{66*?  7  
 (( '::== N 
 ##.#9{{$$% % 
 	
s   )Er]   c           
         |j                   r|t        |j                         k\  rt        d| d      |j                   |   j                  }|st	        d| d      | j
                  j                  dk(  r5t        |j                        s t        j                  d|j                         d}|j                  r)t        j                  |j                  j                        }d}| j
                  j                  | j
                  j                  j                  d      rg }|j                   r,|j                   D ]  }|j#                  |j$                          |j&                  r|j#                  |j&                         t(        j*                  j-                  t(        j*                  j/                  |	      	      }nt        j                  |j&                        }i }t1        |d
      r|j2                  rt5        |j2                  t6              r.t(        j*                  j-                  |j2                        |d
<   nrt5        |j2                  t8        j:                        r#t        j                  |j2                        |d
<   n+t        j                  dt=        |j2                                t)        j>                  |t        j                  |      ||j@                  |rt)        jB                  |      ndt        jE                  |            }	d|	i}
i }| j
                  jF                  r| j
                  jF                  |d<   | j
                  jH                  r| j
                  jH                  |d<   | j
                  jJ                  r| j
                  jJ                  |d<   |rt9        jL                  di ||
d<   |
S )r!  r   r"  r#  r   tool_use_quality_v1zsMetric 'tool_use_quality_v1' requires tool usage in 'intermediate_events', but no tool usage was found for case %s.N
multi_turnr   contextr   zUnsupported type for context: )map_instance)r   r   r   r   
other_data
agent_datar   r  r  r  r  rq   )'r   rJ   r   r   rC   rV   r   r   r   r,   r-   r   r   rA  rG  r   conversation_historyr(   r   r   r   r   rE  rF  r   rZ  r)   r*   r   r   r:  EvaluationInstancer   MapInstancerV  r  r  r  AutoraterConfig)rZ   r\   r]   r   reference_instance_dataprompt_instance_dataprompt_contentsmessageother_data_mapr  r   r  s               r   r   z.PredefinedMetricHandler._build_request_payload  sW    ""nI<O<O8P&P~.>>PQRR$..~>GG9.9IK  ;;44!)"?"?@V** #'&=&W&W##,,'#  $;;'DKK,<,<,G,G,U O--(== <G#**7??;<&&y'7'78#(;;#;#;99?9S $< $  $;#T#T  $  *,9i(Y->->)++S1,1KK,D,D"** -E -y) I--{/B/BC+EEiFWFWX y) 4T):K:K5L4MN !33',FF  .#11 " !!~>.GG	R
  (+
 ,.;;""26++2I2I./;;4499 01 ;;11151W1W-.2=2M2M 3"3O./ r   c           
         | j                   j                  }	 | j                  ||      }t        t              D ]K  }	 | j
                  j                  | j                   g|j                  d      |j                  d            } n rt#        |d      r|j$                  rz|j$                  d   }d}	|j&                  r%t)        |j&                  d      rd|j&                   }	t        j                  ||j*                  |j,                  |j.                  |	      S t        j'                  d||r|j1                  d      nd       t        j                  |d
      S # t        j                  $ r}|j                  dk(  rpt        j                  d|dz   t        |d|z         |t        dz
  k(  r(t        j                  |dt         d	| 
      cY d}~c S t        j                   d|z         n|Y d}~d}~ww xY w# t2        $ rN}t        j'                  d||j4                  |d       t        j                  |t7        |      
      cY d}~S d}~ww xY w)zDProcesses a single evaluation case for a specific predefined metric.r   r  )metricsr   r    HResource Exhausted error on attempt %d/%d: %s. Retrying in %s seconds...r      z%Judge model resource exhausted after 
 retries: r'  Nmetric_resultsr   codeError in metric result: )r6   rA   r&  r%  r@   zSMetric results missing in API response for predefined metric '%s'. API response: %sTr   r   'Metric results missing in API response.z*Error processing metric %s for case %s: %sr   )rV   r   r   range_MAX_RETRIESrT   _evaluate_instancesr   genai_errorsClientErrorrn  r,   r-   r   ro   timesleepr   rm  r   r   rA   r&  r%  r   r   r   r*   
rZ   r\   r]   r6   r  attemptr   rP   r*  r@   s
             r   ra   z)PredefinedMetricHandler.get_metric_result  sS   
 kk&&G	11)^LG .   #';;#B#B!%!(Z!8)05G)H $C $L
  8 L*:; //*99!< $$$1B1BF)K&>{?P?P>Q$RM11 +%++ + 7 7$/$?$?"/  ( ( %44$4G#	 11 +"K _ $//  vv}*#aK(wJ #lQ&66#(#=#=,70UVbUccmnomp.q$  

1g:. / f  
	LL<&&   --'s1v 
	sc   $G? AEBG? AG? G<0AG7G<G? G71G? 7G<<G? ?	IAIIIr7   c                     t         j                  d| j                  j                         t	        | j                  j                  |d      S )z6Aggregates the metric results for a predefined metric.z-Aggregating results for predefined metric: %sTr8   r   rd   s     r   re   z!PredefinedMetricHandler.aggregateB  s<    
 	DdkkFVFVW(KK6D
 	
r   )rf   rg   rh   ri   r   rj   r[   staticmethodr   r   r   r   rE  rG  rm   rT  rV  rn   r   r*   r   r   r   ro   ra   rp   rI   re   r   r   s   @r   rA  rA  X  s2   0} ell  
+--.
	%++**	+
 
 '
>>'
	%++''	('
 '
RZZ9<Z	c3hZx LL9<L		#	#L L\ 
(,U-G-G(H
		%	%
 
r   rA  c                        e Zd ZdZdddej
                  f fdZdej                  dede	e
ef   fd	Zedej                  dedej                  fd
       Zedeej                     dej"                  fd       Z xZS ) CustomCodeExecutionMetricHandlerz1Metric handler for custom code execution metrics.rT   rU   rV   c                     t         |   ||       | j                  j                  s#t	        d| j                  j
                   d      y )NrY   z&CustomCodeExecutionMetricHandler for 'z1' needs  Metric.remote_custom_function to be set.)r~   r[   rV   remote_custom_functionrC   r   r   s      r   r[   z)CustomCodeExecutionMetricHandler.__init__P  sQ    v6{{1189I9I8J K< <  2r   r\   r]   r   c                    |j                   r|t        |j                         k\  rt        d| d      |j                   |   j                  }|st	        d| d      d}|j
                  r)t        j                  |j
                  j                        }t        j                  |j                        }t        j                  |t        j                  |      |      }d|iS )r!  r   r"  r#  r   N)r   r   r   r   )r   rJ   r   r   rC   r   rA  rG  r   r   r_  )rZ   r\   r]   r   rb  rc  r  s          r   r   z7CustomCodeExecutionMetricHandler._build_request_payloadY  s     ""nI<O<O8P&P~.>>PQRR$..~>GG9.9IK  #'&=&W&W##,,'#  7PP 
 !33',FF  .
 (
 	
r   c           
      z   | j                   j                  }	 | j                  ||      }t        t              D ];  }	 | j
                  j                  | j                   g|j                  d            } n rt#        |d      r{|j$                  ro|j$                  d   }d
}	|j&                  r%t)        |j&                  d      rd|j&                   }	t        j                  ||j*                  |j,                  |	      S t        j'                  d||r|j/                  d      nd       t        j                  |d	      S # t        j                  $ r}|j                  dk(  rpt        j                  d|dz   t        |d|z         |t        dz
  k(  r(t        j                  |dt         d| 	      cY d
}~c S t        j                   d|z         n|Y d
}~d
}~ww xY w# t0        $ rM}t        j'                  d||j2                  d       t        j                  |t5        |      	      cY d
}~S d
}~ww xY w)zOProcesses a single evaluation case for a specific custom code execution metric.r   )rh  r   ri  rj  r   rk  zResource exhausted after rl  r'  Nrm  r   rn  ro  r9  zHMetric results missing in API response for metric '%s'. API response: %sTr   r   rp  z&Error processing metric %s for case %sr   )rV   r   r   rq  rr  rT   rs  r   rt  ru  rn  r,   r-   r   ro   rv  rw  r   rm  r   r   rA   r&  r   r   r   r*   rx  s
             r   ra   z2CustomCodeExecutionMetricHandler.get_metric_result|  s8   
 kk&&D	11)^LG .   #';;#B#B!%!(Z!8 $C $L  6 L*:; //*99!< $$$1B1BF)K&>{?P?P>Q$RM11 +%++ + 7 7"/	  ( ( %44$4G#	 11 +"K ] $//  vv}*#aK(wJ #lQ&66#(#=#=,70I,Wabcad.e$  

1g:. / d  		LL8&&	   --'s1v 		sb   $G$ 7E4BG$  AG$ G!AG3G!4G$ ;GG$ G!!G$ $	H:-AH5/H:5H:r7   c                     t         j                  d| j                  j                         t	        | j                  j                  |d      S )zAAggregates the metric results for a custom code execution metric.z8Aggregating results for custom code execution metric: %sTr{  r   rd   s     r   re   z*CustomCodeExecutionMetricHandler.aggregate  s@    
 	FHXHX	
 )KK6D
 	
r   )rf   rg   rh   ri   r   rj   r[   rm   rn   r   r*   r   r   r   ro   ra   rp   rI   re   r   r   s   @r   r~  r~  M  s    ;} ell !
!
9<!
	c3h!
F II9<I		#	#I IV 	
(,U-G-G(H	
		%	%	
 	
r   r~  c                 6    t        | d      xr | j                  S )Nr  )r   r  ms    r   <lambda>r    s    '!56S1;S;S r   c                 R    | j                   xr t        | j                   t              S rX   )r6  r)   r   r  s    r   r  r    s    !##O
13D3Dh(O r   c                 :    | j                   t        j                  v S rX   )r   rs   r   r  s    r   r  r        !&&4RRR r   c                 :    | j                   t        j                  v S rX   )r   r   r   r  s    r   r  r    r  r   c                 :    | j                   t        j                  v S rX   )r   r   rC  r  s    r   r  r    s    !&&OHHH r   c                 6    t        | t        j                        S rX   )r)   r   r2  r  s    r   r  r    s    z!U__- r   MetricHandlerTyperT   rU   rV   c                 v    t         D ]  \  }} ||      s || |      c S  t        d|j                         )z.Returns a metric handler for the given metric.rY   zUnsupported metric: )_METRIC_HANDLER_MAPPINGrC   r   )rT   rV   	conditionhandler_classs       r   get_handler_for_metricr    sF     %< ? 	=V v>>? +FKK=9
::r   eval_resultc                    | j                   si S t        d | j                   D        d      dk(  ri S t        j                  fd      }| j                   D ]1  }|j                  st        j                  t
              }t        |j                        D ]c  \  }}|j                  r|j                  j                         ni D ]3  \  }}|j                  ||   j                  |j                  |d       5 e |j                         D ]z  \  }}|s	||   dxx   dz  cc<   t        d |D              }	|D 
cg c]  }
|
d	   |	k(  s|
d
    }}
t        |      dk(  r||   d   |d   xx   dz  cc<   k||   dxx   dz  cc<   | 4 i }|j                         D ]6  \  }}|d   dkD  s|d   D cg c]
  }||d   z   c}|d   |d   z  d||<   8 |S c c}
w c c}w )z0Calculates win/tie rates for comparison results.c              3   ^   K   | ]%  }|j                   rt        |j                          ' y wrX   )response_candidate_resultsrJ   ).0cases     r   	<genexpr>z&calculate_win_rates.<locals>.<genexpr>  s,      	
.. //0	
s   +-r   )defaultc                      dg z  dddS )Nr   )winstiesvalid_comparisonsrq   )
max_modelss   r   r  z%calculate_win_rates.<locals>.<lambda>  s    !z)11M r   )rA   cand_idxr  r   c              3   &   K   | ]	  }|d      yw)rA   Nrq   )r  ss     r   r  z&calculate_win_rates.<locals>.<genexpr>!  s     71AgJ7s   rA   r  r  r  )	win_ratestie_rate)eval_case_resultsmaxcollectionsdefaultdictr  rp   	enumeraterm  r'   rA   r(   rJ   )r  statsr  scores_by_metricidx	candidater   resrK   	max_scorer  winnersr  metric_statswr  s                  @r   calculate_win_ratesr    s2   ((		
#55	

 J Q	:E:Q:QM;E -- )..&2248'(G(GH 	YNC4=4L4L	((..0RTY	c 99($T*11CIISV2WX	Y	Y -224 		)LD&$K+,1,777I.4P'
i8Oq}PGP7|q dF#GAJ/14/dF#q(#		))( I#kkm l+,q0 DPPVCW>?A%899 )0<@S3TT	IdO  Qs   =GG6G metric_handlersr  c                 `   g }t         j                  d       | D ]  }|j                  j                  }g }|D ]k  }|j                  s|j                  D ]M  }|j
                  s||j
                  v st        |t              s0|j                  |j
                  |          O m |st         j                  d|       	 |j                  |      }|j                  |        t         j'                  d
|       |S # t        $ r. t         j                  dt        |      j                  |       Y t        $ rt}	t         j                  d|t        |      j                  |	d       |j                  t!        j"                  |t%        |      dt%        |      dd	             Y d}	~	d}	~	ww xY w)zCAggregates results by calling the aggregate method of each handler.z!Aggregating results per metric...z.No results found for metric '%s' to aggregate.zTAggregation not implemented for metric handler: %s (metric: '%s'). Skipping summary.z=Error during aggregation for metric '%s' using handler %s: %sTr   r   N)r6   r:   r;   r<   r=   r>   z#Finished aggregation, returning: %s)r,   r.  rV   r   r  rm  r)   r*   r(   r-   re   r`   r:  rf   r   r   r   rI   rJ   r   )
r  r  aggregated_metric_resultshandlerr6   results_for_this_metriccase_resultresponse_candidate_ressummaryrP   s
             r   _aggregate_metric_resultsr  3  s   
 !#
KK34" /nn))DF, 
	K55.9.T.T *.=='+A+P+PP&{C8/662AA+N
	 'NN@+ 	''(?@G%,,W5-/` LL68QR$$5 # 	NN%W&&	  	LLOW&&   &,,,, +$'(?$@$%$'(?$@# $	 		s   ?"C;;3F-1F-9A)F((F-c                   n    e Zd ZU dZeed<   	 ej                  ed<   	 eej                     ed<   	 e
ed<   y)EvaluationRunConfigz$Configuration for an evaluation run.evals_moduledatasetrh  num_response_candidatesN)rf   rg   rh   ri   r   __annotations__r   EvaluationDatasetrp   rj   rn   rq   r   r   r  r  n  s8    .7$$$8%,,@  Cr   r  evaluation_run_configc                     g }g }t        j                  d       }g }g }t               }| j                  D ]'  }|j	                  t        | j                  |             ) t        | j                  j                        }t        j                  d|       t        j                  d| j                         |t        |      z  | j                  z  }	t        j                  d|	       t        |	d      5  t        j                  t         j"                        5 }
|D ]  }t%        | j                  j                        D ]  \  }t'        | j                  t        |j(                              }t+        |      D ]  }	 |
j-                  |j.                  ||      }|j1                   fd       t        j3                  d	||j4                  j6                         |j	                  ||j4                  j6                  |f          	 ddd       |D ]T  \  }}}	 |jE                         }t        j3                  d|||       ||   |   |<   t        j3                  d||       V 	 ddd       g }tG        |jI                               }|D ]  |   }g }tG        |jI                               }|D ]/  }||   }t=        jJ                  ||      }|j	                  |       1 |r)t=        jL                  |      }|j	                  |       |v stO        fd|D              st        jQ                  d       t=        jL                  g       }|j	                  |        |r5t        jQ                  dt        |             t        jQ                  d|       |r5t        jQ                  dt        |             t        jQ                  d |       tS        ||      }t=        jT                  ||!      }| j                  dkD  r	 tW        |      |_,        |S |S # t8        $ r}t        j;                  d
||j4                  j6                  |d       |j	                  |j4                  j6                  |d| f       t=        j>                  |j4                  j6                  d|       }||   |   |j4                  j6                  <   |jA                          jC                  d       Y d}~d}~ww xY w# 1 sw Y   xY w# t8        $ r|}t        j;                  d|||d       d| d d| d| }|j	                  |||f       |jA                         t=        j>                  ||      }||   |   |<   Y d}~{d}~ww xY w# 1 sw Y   -xY w# t8        $ r#}t        j;                  d"|d       Y d}~|S d}~ww xY w)#zGComputes metrics and aggregates them for a given evaluation run config.c                  4    t        j                  t              S rX   )r  r  r   rq   r   r   r  z/compute_metrics_and_aggregate.<locals>.<lambda>  s    (?(?(E r   z$Total number of evaluation cases: %dz!Number of response candidates: %dz'Total number of metric computations: %dz(Computing Metrics for Evaluation Dataset)totaldesc)max_workersc                 &    j                  d      S )Nr   )r  )r   pbars    r   r  z/compute_metrics_and_aggregate.<locals>.<lambda>  s    t{{1~ r   zESubmitting metric computation for case %d, response %d for metric %s.zNError submitting metric computation for case %d, response %d for metric %s: %sTr   zError: zSubmission Error: r'  r   NzGSuccessfully obtained result for metric '%s', case %d, response %d: %s.z4Stored result for metric '%s', case %d, response %d.z8Error executing metric '%s' for case %s, response %s: %szError executing metric 'z' for case z, response r   )r]   rm  )eval_case_indexr  c              3   2   K   | ]  \  }}}}|k(    y wrX   rq   )r  r   err_case_idxr  s      r   r  z0compute_metrics_and_aggregate.<locals>.<genexpr>+  s(      @
%<A O+@
s   zOEvalCase %d had errors but no metric results were processed into the structure.z!Encountered %d submission errors.zSubmission errors: %sz Encountered %d execution errors.zExecution errors: %s)r  summary_metricszError calculating win rates: %s)-r  r  r   rh  r(   r  r  rJ   r  
eval_casesr,   r.  r  r   r   ThreadPoolExecutorr   MAX_WORKERSr  minr   rq  submitra   add_done_callbackr   rV   r   r   r   r   ro   addr  rO   sortedr   ResponseCandidateResultEvalCaseResultanyr-   r  EvaluationResultr  r  )!r  r  all_futuresresults_by_case_response_metricsubmission_errorsexecution_errorscase_indices_with_errorseval_metriceval_case_counttotal_metric_computationsexecutormetric_handler_instancer\   actual_num_candidates_for_caser]   futurerP   error_resultr6   eval_case_metric_resultr<  final_eval_case_resultssorted_eval_case_indices"per_response_results_for_this_case'current_response_candidate_results_listsorted_response_indices metric_results_for_this_responseresponse_candidate_result_objeval_case_resultr  r  r  r  s!                                  @@r   compute_metrics_and_aggregater  {  sz    OK 	 EF $ "u,44 
"#8#E#E{S	


 /77BBCO
KK6H
KK+55
 	
o
	

7
7	8 
 KK9;TU	'7
 r! 
''%11
 <	++: 9+'2;)11<<3 8+.OY 69-EEI//062 +00N*O 1+0+%-__ 7 I I ) .&F
 #445MN"LL!= / . 7 > > C C (..$*$;$B$B$G$G$3$2	!"1+8+9+<	+| EP 0	!@FK./!*0--/'#"+ , 0@P J#"	0	!Er!h !%&E&J&J&LM3 (=-L.
* 35/"()K)P)P)R"S5 	N/Q0, -2,I,I-?-) 4::-	 3$33 /+R  $**+;< 88C @
):@
 =
 NN1
  %33 /+-  $**+;<Q(=T :C@Q<RS.0AB93?O;PQ-/?@ 90! ((11K 44q8	N$7$DK! ;[  ) +"LL!@ / . 7 > > C C !)- )  .44$;$B$B$G$G$3$2&-aSM	!" ,1+E+E,C,J,J,O,O0B1#.F,L FR <OL .5<<AAC 588I KKNN5+E<	+ <	+d  !N#"!   /{m <'(N3C2aSJ  !''#'&!	 ),,_=$99 +"+  ! 0@P5!mr! r!b  	NLL:ALMM	Ns   4%VA"T<BP 
T	VAT(VV#  	S>)C
S93T9S>>TT	V	VA1VVVVV #	W,W

Wr   )F)>ri   rk   r  
concurrentr   r   loggingrE   rv  typingr   r   r   r   r   google.genair	   rt  r
   r   r   r   typing_extensionsr   r!   r   r   r   	getLoggerrf   r,   rr  rp   Eventr+   r   r   r*   r5   ro   rI   rQ   ABCrS   rs   r   r   r4  rA  r~  r  r  rj   r  r  r   r  r  r  	BaseModelr  r  rq   r   r   <module>r     s   1 
       : : /   -  &     
		8	$	ekk6G6G1H(I 	d 	 BH$=k))*$=;>$=c]$=T !&88"5#=#=>8 8 !!	8v$CGG $,}U} }U@gU} gUTVY} VYrdU- dUNr
m r
jE
} E
T 	T(
 	P
 	S 
 	S 
 	I ./?@+ 0 $ ;;#(<<;
c!";.U%;%; .S#X .b8%-(8%E0018% 
%
&
&'8%v
D'++ 
DR.R
Rr   