
     i                    L   d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZej                  j                  d       ddlmZ ddlZddlZddlZddlZddlZddlZddlmZmZ ddlmZmZ ddlmZmZmZ dd	l m Z m!Z! dd
l"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3  ejh                  ejj                  d        ejl                  d      Z7 e2d      Z8e9Z:e9Z;e9Z< G d de#      Z= G d de#      Z> G d de#      Z? G d de#      Z@e G d d             ZAe G d d             ZBe G d d             ZC G d d       ZD G d! d"      ZE G d# d$      ZF G d% d&      ZG G d' d(      ZH G d) d*      ZId+ ZJeKd,k(  r ej                   eJ              yy)-a  
rlm_01_preference_learning.py - Preference Learning System for AIVA Queen RLHF

This module implements a comprehensive preference learning system for AIVA Queen's
Reinforcement Learning from Human Feedback (RLHF) pipeline. It enables the collection,
storage, modeling, and active querying of human preferences.

Key Components:
    - PreferenceCollector: Gather and validate human preference annotations
    - PreferenceDataset: Persistent storage and retrieval of preference pairs
    - BradleyTerryModel: Statistical model for preference prediction
    - PreferenceLearner: Online and batch learning from preferences
    - ActivePreferenceQuery: Information-theoretic selection of comparisons
    - PreferenceIntegrator: Bridge preferences into the training pipeline

Reference Papers:
    - Bradley-Terry Model (1952): Rank analysis of incomplete block designs
    - Christiano et al. (2017): Deep RL from Human Preferences
    - Rafailov et al. (2023): DPO - Direct Preference Optimization

Author: Genesis-OS AIVA Queen RLM Module
Version: 1.0.0
    )annotationsNz)/mnt/e/genesis-system/data/genesis-memory)PostgresConfig)ABCabstractmethod)defaultdictdeque)	dataclassfieldasdict)datetime	timedelta)Enumauto)Path)AnyCallableDictGenericIteratorListOptionalProtocolSetTupleTypeVarUnionz4%(asctime)s - %(name)s - %(levelname)s - %(message)s)levelformatzAIVA.PreferenceLearningTc                  ,    e Zd ZdZdZdZdZdZdZdZ	dZ
y	)
PreferenceStrengthz+Strength of preference between two options.         r   N)__name__
__module____qualname____doc__STRONGLY_PREFER_APREFER_ASLIGHTLY_PREFER_ATIESLIGHTLY_PREFER_BPREFER_BSTRONGLY_PREFER_B     J/mnt/e/genesis-system/AIVA/queen_outputs/rlm/rlm_01_preference_learning.pyr!   r!   S   s,    5H
CHr4   r!   c                  V    e Zd ZdZ e       Z e       Z e       Z e       Z e       Z	y)FeedbackQualityzQuality assessment of feedback.N)
r(   r)   r*   r+   r   HIGHMEDIUMLOWSPAMATTENTION_CHECK_FAILEDr3   r4   r5   r7   r7   ^   s)    )6DVF
&C6D!Vr4   r7   c                  H    e Zd ZdZ e       Z e       Z e       Z e       Zy)LearningStrategyz)Strategy for preference learning updates.N)	r(   r)   r*   r+   r   ONLINE
MINI_BATCH
FULL_BATCH	STREAMINGr3   r4   r5   r>   r>   g   s!    3VFJJIr4   r>   c                  V    e Zd ZdZ e       Z e       Z e       Z e       Z e       Z	y)QueryStrategyz(Strategy for active preference querying.N)
r(   r)   r*   r+   r   UNCERTAINTYINFORMATION_GAIN	DIVERSITYDISAGREEMENTHYBRIDr3   r4   r5   rD   rD   o   s)    2&KvI6LVFr4   rD   c                      e Zd ZU dZded<   ded<   ded<   dZded	<    eej                  
      Zded<   dZ	ded<    ee

      Zded<   ddZddZddZedd       Zy)Responsez A model response to be compared.
ResponseIdidstrprompttextNOptional[str]model_iddefault_factoryfloat	timestampzOptional[List[float]]	embeddingDict[str, Any]metadatac                ,    t        | j                        S N)hashrM   selfs    r5   __hash__zResponse.__hash__   s    DGG}r4   c                V    t        |t              sy| j                  |j                  k(  S )NF)
isinstancerK   rM   )r^   others     r5   __eq__zResponse.__eq__   s"    %*ww%((""r4   c                    | j                   | j                  | j                  | j                  | j                  | j
                  dS )NrM   rO   rP   rR   rV   rY   re   r]   s    r5   to_dictzResponse.to_dict   s6    ''kkII
 	
r4   c                     | |d   |d   |d   |j                  d      |j                  dt        j                               |j                  di             S )NrM   rO   rP   rR   rV   rY   re   )gettimeclsdatas     r5   	from_dictzResponse.from_dict   sR    Dz>fXXj)hh{DIIK8XXj"-
 	
r4   returnint)rb   objectro   boolro   rX   )rl   rX   ro   z
'Response')r(   r)   r*   r+   __annotations__rR   r
   ri   rV   rW   dictrY   r_   rc   rf   classmethodrm   r3   r4   r5   rK   rK   |   sn    *NK
I"Hm"TYY7Iu7'+I$+$T:Hn:#

 
 
r4   rK   c                  H   e Zd ZU dZded<   ded<   ded<   ded<   d	ed
<   ded<    eej                        Zded<   dZded<   dZ	ded<   e
j                  Zded<    ee      Zded<   dZded<   ed"d       Zed"d       Zed#d       Zed#d       Zd$d Zed%d!       Zy)&PreferencePairz5A pair of responses with human preference annotation.ComparisonIdrM   rN   rO   rK   
response_a
response_br!   
preferenceAnnotatorIdannotator_idrS   rU   rV         ?
confidenceNrQ   	reasoningr7   qualityrX   rY   Frr   is_attention_checkc                    | j                   j                  dkD  r| j                  S | j                   j                  dk  r| j                  S y)z+Get the winning response, or None for ties.r   N)r|   valuerz   r{   r]   s    r5   winnerzPreferencePair.winner   ?     ??  1$??"__""Q&??"r4   c                    | j                   j                  dkD  r| j                  S | j                   j                  dk  r| j                  S y)z*Get the losing response, or None for ties.r   N)r|   r   r{   rz   r]   s    r5   loserzPreferencePair.loser   r   r4   c                4    | j                   j                  dz  S )z,Get normalized preference margin in [-1, 1].      @)r|   r   r]   s    r5   marginzPreferencePair.margin   s     $$s**r4   c                &    | j                   dz   dz  S )z;Get probability label for A being preferred (for training).r          @)r   r]   s    r5   labelzPreferencePair.label   s     c!S((r4   c                p   | j                   | j                  | j                  j                         | j                  j                         | j
                  j                  | j                  | j                  | j                  | j                  | j                  j                  | j                  | j                  dS )NrM   rO   rz   r{   r|   r~   rV   r   r   r   rY   r   )rM   rO   rz   rf   r{   r|   namer~   rV   r   r   r   rY   r   r]   s    r5   rf   zPreferencePair.to_dict   s    ''kk//113//113//.. --//||(("&"9"9
 	
r4   c                    | |d   |d   t         j                  |d         t         j                  |d         t        |d      |d   |j                  dt	        j                               |j                  dd	      |j                  d
      t
        |j                  dd         |j                  di       |j                  dd            S )NrM   rO   rz   r{   r|   r~   rV   r   r   r   r   r9   rY   r   Fr   )rK   rm   r!   rh   ri   r7   rj   s     r5   rm   zPreferencePair.from_dict   s    Dz>))$|*<=))$|*<=)$|*<=n-hh{DIIK8xxc2hh{+#DHHY$ABXXj"-#xx(<eD
 	
r4   )ro   zOptional[Response])ro   rU   rs   )rl   rX   ro   z'PreferencePair')r(   r)   r*   r+   rt   r
   ri   rV   r   r   r7   r9   r   ru   rY   r   propertyr   r   r   r   rf   rv   rm   r3   r4   r5   rx   rx      s    ?K""TYY7Iu7J#I}#.55G_5$T:Hn:$$    + + ) )
  
 
r4   rx   c                      e Zd ZU dZded<   dZded<   dZded	<   d
Zded<   dZded<   dZ	ded<   dZ
ded<    eej                        Zded<   dZded<    ee      Zded<   	 	 	 d	 	 	 	 	 	 	 ddZy)AnnotatorProfilez2Profile tracking annotator quality and statistics.r}   rM   r   rp   total_annotations      ?rU   agreement_rater   attention_check_pass_rateaverage_confidence      >@average_time_secondsquality_scorerS   last_activeFrr   
is_trustedzDict[str, int]preferences_by_typeNc                h   | xj                   dz  c_         t        j                         | _        d}d|z
  | j                  z  ||z  z   | _        d|z
  | j                  z  ||z  z   | _        |2| j                   }|dz
  | j
                  z  t        |      z   |z  | _        d| j                  z  d| j
                  z  z   dt        d| j                        z  z   dd| j                  cxk  rdk  rn ndnd	z  z   | _	        | j                   d
k\  xr  | j                  dk\  xr | j
                  dk\  | _
        y)z%Update profile from a new annotation.r$   皙?N333333?g?r      i,  r   
   ffffff?g?)r   ri   r   r   r   r   rU   r   minr   r   )r^   passed_attentionr   
time_spentalphans         r5   update_from_annotationz'AnnotatorProfile.update_from_annotation  sO    	!#99; #$u90G0G"G%R\J\"\%&Y$2K2K$KeV`N`$`!'&&AQ$8885AQ;RR.D* $%%%$0001#c422334 !d77=#=33GH 	 ""b( 2#%2**c1 	r4   )Nr   r   )r   zOptional[bool]r   rU   r   rU   ro   None)r(   r)   r*   r+   rt   r   r   r   r   r   r   r
   ri   r   r   ru   r   r   r3   r4   r5   r   r      s    <OsNE'*u* ##"&%&M5tyy9K9J*/*EE ,0 	"
("
 "
 	"

 
"
r4   r   c                      e Zd ZdZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 ddZddZddZ	 d	 	 	 	 	 	 	 	 	 ddZ	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 	 	 ddZ	e
j                  d	f	 	 	 	 	 dd
ZddZy)PreferenceCollectora"  
    Collects and validates human preference annotations.

    Features:
    - Multi-source preference collection (direct, comparison, implicit)
    - Quality control with attention checks
    - Annotator reliability tracking
    - Spam and noise filtering
    - Async batch collection
    c                    || _         || _        || _        || _        || _        i | _        i | _        | j                         | _        g | _	        t        j                         | _        t        j                  d       y)a  
        Initialize preference collector.

        Args:
            attention_check_frequency: Frequency of attention check insertions
            min_time_seconds: Minimum acceptable annotation time
            max_time_seconds: Maximum acceptable annotation time
            spam_threshold: Threshold for flagging spam annotators
            require_reasoning: Whether to require reasoning for preferences
        zPreferenceCollector initializedN)attention_check_frequencymin_time_secondsmax_time_secondsspam_thresholdrequire_reasoning_annotator_profiles_pending_comparisons_generate_attention_checks_attention_checks_collected_pairs	threadingLock_lockloggerinfo)r^   r   r   r   r   r   s         r5   __init__zPreferenceCollector.__init__6  sq    $ *C& 0 0,!2HJ HJ!7;7V7V7X68^^%
56r4   c                ~    dddt         j                  ddddt         j                  ddd	d
t         j                  dgS )z@Generate attention check comparisons with known correct answers.zWhat is 2 + 2?zThe answer is 4.zThe answer is banana.)rO   rz   r{   correctzWrite a greeting.zHello! Nice to meet you.zasdfghjkl qwertyzExplain what water is.zI don't know anything.zNWater is H2O, a molecule consisting of two hydrogen atoms and one oxygen atom.)r!   r,   r2   r]   s    r5   r   z.PreferenceCollector._generate_attention_checksV  sW     +05-??	 .80-??	 36n-??	
 	
r4   c                n    || j                   vrt        |      | j                   |<   | j                   |   S )z Get or create annotator profile.)rM   )r   r   )r^   r~   s     r5   get_annotator_profilez)PreferenceCollector.get_annotator_profilem  s6    t7775E5VD$$\2''55r4   Nc           
     8   t        t        j                               }t        j                         | j                  k  }|rt        j
                  | j                        }t        t        t        j                               |d   |d         }t        t        t        j                               |d   |d         }|d   }| j                  5  ||||xs i t        j                         ||rd   ndg d| j                  |<   ddd       |S # 1 sw Y   |S xY w)a  
        Create a new comparison task.

        Args:
            prompt: The prompt
            response_a: First response
            response_b: Second response
            metadata: Additional metadata

        Returns:
            Comparison ID
        rO   rz   rM   rO   rP   r{   r   N)rO   rz   r{   rY   
created_atr   correct_answerr   )rN   uuiduuid4randomr   choicer   rK   r   ri   r   )r^   rO   rz   r{   rY   comparison_idr   checks           r5   create_comparisonz%PreferenceCollector.create_comparisons  s   & DJJL) $]]_t/M/MMMM$"8"89E!tzz|$X<(J
 "tzz|$X<(J
 8_FZZ 
	 (($N"iik&86H%	"2d!	8D%%m4
	 
	 s   6DDc                  K   | j                   5  || j                  vr"t        j                  d|        	 ddd       y| j                  |   }|d   D cg c]  }|d   	 }	}||	v r"t        j                  d|        	 ddd       y	 ddd       | j	                  |||||      }
d}d   rr|d   }|j
                  d	kD  xr |j
                  d	kD  xs  |j
                  d	k  xr |j
                  d	k  }|s)t        j                  }
t        j                  d
| d       | j                  |      }|j                  |||       ||||||
t        j                         d}| j                   5  |d   j                  |       ddd       |
t        j                  t        j                  fv rtt        t        t!        j"                               |d   |d   |d   |||||
|d   |d         }| j                   5  | j$                  j                  |       ddd       d|fS yc c}w # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   d|fS xY ww)a  
        Submit a preference annotation.

        Args:
            comparison_id: ID of the comparison
            annotator_id: ID of the annotator
            preference: The preference choice
            confidence: Annotator's confidence
            reasoning: Optional reasoning
            time_spent_seconds: Time spent on annotation

        Returns:
            Tuple of (accepted, resulting_pair)
        zUnknown comparison: N)FNr   r~   zDuplicate annotation from r   r   r   z
Annotator z failed attention check)r   r   r   )r~   r|   r   r   r   r   rV   rO   rz   r{   rY   )rM   rO   rz   r{   r|   r~   r   r   r   rY   r   T)r   r   r   warning_assess_qualityr   r7   r<   r   r   ri   appendr8   r9   rx   rN   r   r   r   )r^   r   r~   r|   r   r   time_spent_seconds
comparisonaexisting_annotatorsr   r   r   profile
annotationpairs                   r5   submit_preferencez%PreferenceCollector.submit_preference  s    . ZZ 	#D$=$==!5m_EF"	# 	#
 22=AJ ?I>W"X1^#4"X"X22!;L>JK"	# 	# 3	# &&*j)=O

  *+ !12G !!A%;'--!*; =!!A%;'--!*;  $)@@L>9PQR ,,\:&&-!) 	' 	
 )$$",

 ZZ 	9}%,,Z8	9 ++_-C-CDD!tzz|$!(+%l3%l3%)%##J/#-.B#CD  3%%,,T23 :E #Y	# 	#d	9 	9&3 :sq   I(H'	I H'H""H'	IC#I.H4A6I9I I"H''H1,I4H=9I I
Ic                   | j                  |      }d}|| j                  k  r|dz  }|| j                  kD  r|dz  }|dk  r|t        j                  k(  r|dz  }| j
                  r|s|dz  }|j                  | j                  k  r|dz  }|dk\  rt        j                  S |dk\  rt        j                  S |dk\  rt        j                  S t        j                  S )z.Assess the quality of a preference annotation.r   r#   r$   r   r"   )r   r   r   r!   r/   r   r   r   r7   r;   r:   r9   r8   )r^   r~   r|   r   r   r   r   spam_signalss           r5   r   z#PreferenceCollector._assess_quality	  s     ,,\:  ---AL ---AL 
.@.D.D DAL !!)AL   4#6#66AL 1"'''Q"&&&Q")))"'''r4   Tc           
        t         j                  dt         j                  dt         j                  dt         j                  dt         j
                  di}||   }| j                  5  | j                  D cg c]  }||j                     |k\  r| }}|r|D cg c]  }|j                  r| }}|cddd       S c c}w c c}w # 1 sw Y   yxY w)z3Get collected preference pairs filtered by quality.r"   r#   r$   r   N)
r7   r8   r9   r:   r;   r<   r   r   r   r   )r^   min_qualityexclude_attention_checksquality_rankmin_rankppairss          r5   get_collected_pairsz'PreferenceCollector.get_collected_pairs5  s       !""A  !22A
  ,ZZ 		00		*h6 E 
 ($)Fq1E1EFF		 		 G		 		s0   $C3B9CB>(B>,C9
CCc                8   | j                   5  | j                  }t        | j                  j	                               }ddd       t              t              t        D ci c]!  j                  t        fd|D              # c}t        D ci c]!  j                  t        fd|D              # c}|r-t        j                  |D cg c]  }|j                   c}      ndt        d |D              dS # 1 sw Y   xY wc c}w c c}w c c}w )zGet collection statistics.Nc              3  B   K   | ]  }|j                   k(  sd   ywr$   N)r   ).0r   qs     r5   	<genexpr>z5PreferenceCollector.get_statistics.<locals>.<genexpr>Y  s     ?!		QA?   c              3  B   K   | ]  }|j                   k(  sd   ywr   )r|   )r   r   ss     r5   r   z5PreferenceCollector.get_statistics.<locals>.<genexpr>]  s     B!0AABr   r   c              3  :   K   | ]  }|j                   sd   ywr   )r   )r   r   s     r5   r   z5PreferenceCollector.get_statistics.<locals>.<genexpr>a  s     %JAQ\\a%Js   )total_pairstotal_annotatorsquality_distributionpreference_distributionr   trusted_annotators)r   r   listr   valueslenr7   r   sumr!   
statisticsmeanr   )r^   r   profilesr   r   r   s      `` r5   get_statisticsz"PreferenceCollector.get_statisticsO  s    ZZ 	?))ED44;;=>H	?
 u: #H )% ?u???% ,( BuBBB( UZ*//2OA1<<2O"P_`"%%J%J"J
 	
		? 	?%( 3Ps   0D$&D&D
DD
)r   r   g     @r   F)
r   rU   r   rU   r   rU   r   rU   r   rr   )ro   List[Dict[str, Any]])r~   r}   ro   r   r[   )
rO   rN   rz   rK   r{   rK   rY   zOptional[Dict[str, Any]]ro   ry   )r   Nr   )r   ry   r~   r}   r|   r!   r   rU   r   rQ   r   rU   ro   z%Tuple[bool, Optional[PreferencePair]])r~   r}   r|   r!   r   rU   r   rQ   r   rU   ro   r7   )r   r7   r   rr   ro   List[PreferencePair]rs   )r(   r)   r*   r+   r   r   r   r   r   r   r7   r9   r   r   r3   r4   r5   r   r   *  sd   	 ,/"%"' #"'7#(7  7  	7
 7  7@
.6 .211 1 	1
 +1 
1p  #'$(a#a "a '	a
 a !a "a 
/aF*(!*( '*( 	*(
 !*( *( 
*(\ (7'='=)-$ #' 
	4
r4   r   c                      e Zd ZdZ	 	 d	 	 	 ddZd ZddZddZddZddZ	dd	Z
dd
ZddZddZ	 	 d	 	 	 	 	 	 	 ddZddZddZddZed d!d       Zy)"PreferenceDataseta  
    Persistent storage and retrieval of preference pairs.

    Features:
    - PostgreSQL-backed persistence (Elestio)
    - Efficient batch operations
    - Filtering and sampling
    - Cross-annotator agreement computation
    - Export/import capabilities
    Nc                :   || _         i | _        t        t              | _        t        t              | _        t        j                         | _        | j                          | j                          t        j                  dt        | j                         d       y)z
        Initialize preference dataset.

        Args:
            db_path: Deprecated (ignored). Uses Elestio PostgreSQL.
            min_confidence: Minimum confidence threshold for retrieval
        z#PreferenceDataset initialized with z pairsN)min_confidence_pairsr   r   
_by_prompt_by_responser   r   r   _init_database_load_from_databaser   r   r   )r^   db_pathr  s      r5   r   zPreferenceDataset.__init__u  sr     -:<9DT9JBMdBS^^%
  "9#dkk:J9K6RSr4   c                P    t        j                  di t        j                         S )z)Get a PostgreSQL connection from Elestio.r3   )psycopg2connectr   get_connection_paramsr]   s    r5   	_get_connzPreferenceDataset._get_conn  s    I."F"F"HIIr4   c                    | j                         }	 |j                         5 }|j                  d       ddd       |j                          |j	                          y# 1 sw Y   *xY w# |j	                          w xY w)zCEnsure PostgreSQL tables exist (tables should already exist in PG).a   
                    CREATE TABLE IF NOT EXISTS pl_preference_pairs (
                        id TEXT PRIMARY KEY,
                        prompt TEXT NOT NULL,
                        response_a_id TEXT NOT NULL,
                        response_a_text TEXT NOT NULL,
                        response_b_id TEXT NOT NULL,
                        response_b_text TEXT NOT NULL,
                        preference TEXT NOT NULL,
                        annotator_id TEXT NOT NULL,
                        timestamp DOUBLE PRECISION,
                        confidence REAL DEFAULT 1.0,
                        reasoning TEXT,
                        quality TEXT,
                        metadata TEXT,
                        is_attention_check INTEGER DEFAULT 0
                    )
                N)r  cursorexecutecommitclose)r^   conncurs      r5   r  z PreferenceDataset._init_database  sd    ~~	 #  & KKMJJL+ * JJLs!   A) AA) A&"A) )A;c                   | j                         }	 |j                  t        j                  j                        5 }|j                  d       |D ]k  }t        |d   |d   t        |d   |d   |d         t        |d   |d   |d	         t        |d
      |d   |d   |d   |d   |d   rt        |d      nt        j                  |d   rt        j                  |d         ni t        |d               }|| j                  |j                  <   | j                   |j"                     j%                  |j                         | j&                  |j(                  j                     j%                  |j                         | j&                  |j*                  j                     j%                  |j                         n 	 ddd       |j-                          y# 1 sw Y   xY w# |j-                          w xY w)z"Load existing pairs from database.)cursor_factoryz!SELECT * FROM pl_preference_pairsrM   rO   response_a_idresponse_a_textr   response_b_idresponse_b_textr|   r~   rV   r   r   r   rY   r   r   N)r  r  r  extrasRealDictCursorr  rx   rK   r!   r7   r9   jsonloadsrr   r  rM   r  rO   r   r  rz   r{   r  )r^   r  r  rowr   s        r5   r  z%PreferenceDataset._load_from_database  s   ~~ 	HOO,J,JK Js?@ JC)t9"8}#+"?3#&x=!$%6!7$
 $,"?3#&x=!$%6!7$
 $6c,6G#H%(%8"%k"2#&|#4"%k"2CFy>I ?WfWmWm@CJC
O!<UW+/4H0I+J)D, ,0DKK(OODKK077@%%doo&8&89@@I%%doo&8&89@@I5JJ> JJL?J J> JJLs#   *G% FG G% G"G% %G7c                   | j                   5  || j                  |j                  <   | j                  |j                     j                  |j                         | j                  |j                  j                     j                  |j                         | j                  |j                  j                     j                  |j                         ddd       | j                  |       |j                  S # 1 sw Y   &xY w)z%Add a preference pair to the dataset.N)
r   r  rM   r  rO   r   r  rz   r{   
_save_pairr^   r   s     r5   addzPreferenceDataset.add  s    ZZ 	B#'DKK OODKK(//8doo00188Adoo00188A		B 	ww	B 	Bs   CC66C?c                X    g }|D ]"  }|j                  | j                  |             $ |S )zAdd multiple preference pairs.)r   r$  )r^   r   idsr   s       r5   	add_batchzPreferenceDataset.add_batch  s/     	'DJJtxx~&	'
r4   c                   | j                         }	 |j                         5 }|j                  d|j                  |j                  |j
                  j                  |j
                  j                  |j                  j                  |j                  j                  |j                  j                  |j                  |j                  |j                  |j                  |j                  j                  t        j                   |j"                        t%        |j&                        f       ddd       |j)                          |j+                          y# 1 sw Y   *xY w# |j+                          w xY w)zSave pair to database.a  
                    INSERT INTO pl_preference_pairs
                    (id, prompt, response_a_id, response_a_text, response_b_id,
                     response_b_text, preference, annotator_id, timestamp,
                     confidence, reasoning, quality, metadata, is_attention_check)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (id) DO UPDATE SET
                        prompt = EXCLUDED.prompt,
                        response_a_id = EXCLUDED.response_a_id,
                        response_a_text = EXCLUDED.response_a_text,
                        response_b_id = EXCLUDED.response_b_id,
                        response_b_text = EXCLUDED.response_b_text,
                        preference = EXCLUDED.preference,
                        annotator_id = EXCLUDED.annotator_id,
                        timestamp = EXCLUDED.timestamp,
                        confidence = EXCLUDED.confidence,
                        reasoning = EXCLUDED.reasoning,
                        quality = EXCLUDED.quality,
                        metadata = EXCLUDED.metadata,
                        is_attention_check = EXCLUDED.is_attention_check
                N)r  r  r  rM   rO   rz   rP   r{   r|   r   r~   rV   r   r   r   r  dumpsrY   rp   r   r  r  )r^   r   r  r  s       r5   r"  zPreferenceDataset._save_pair  s   ~~(	 $# * GGKKOO&&OO((OO&&OO((OO((%%NNOONNLL%%JJt}}-//0)#$J KKMJJLO$ $N JJLs#   E DE'E EE E.c                8    | j                   j                  |      S )zGet a preference pair by ID.)r  rh   )r^   pair_ids     r5   rh   zPreferenceDataset.get  s    {{w''r4   c                    | j                   5  | j                  j                  |g       }|D cg c]   }|| j                  v s| j                  |   " c}cddd       S c c}w # 1 sw Y   yxY w)z&Get all preference pairs for a prompt.N)r   r  rh   r  )r^   rO   pair_idspids       r5   get_by_promptzPreferenceDataset.get_by_prompt  sc    ZZ 	P**626H08OC4;;<NDKK$O	P 	PO	P 	P"   !A$AAA$A$$A-c                    | j                   5  | j                  j                  |g       }|D cg c]   }|| j                  v s| j                  |   " c}cddd       S c c}w # 1 sw Y   yxY w)z.Get all preference pairs involving a response.N)r   r  rh   r  )r^   response_idr-  r.  s       r5   get_by_responsez!PreferenceDataset.get_by_response  se    ZZ 	P((,,["=H08OC4;;<NDKK$O	P 	PO	P 	Pr0  c                   |xs | j                   }| j                  5  | j                  j                         D cg c]  }|j                  |k\  r|j
                  s|! }}ddd       sg S |r&t        t              }|D ]   }||j                     j                  |       " g }t        d|t        |      z        }	|j                         D ]=  \  }
}|j                  t        j                  |t!        |	t        |                         ? t        |      |k  rut        |      t        |      k  r^|D cg c]	  }||vs| }}|j                  t        j"                  |             t        |      |k  rt        |      t        |      k  r^|d| S t        j                  |t!        |t        |                  S c c}w # 1 sw Y   exY wc c}w )z
        Sample preference pairs from the dataset.

        Args:
            n: Number of pairs to sample
            stratified: Whether to stratify by preference type
            min_confidence: Minimum confidence threshold
        Nr$   )r  r   r  r   r   r   r   r   r|   r   maxr   itemsextendr   sampler   r   )r^   r   
stratifiedr  min_confr   eligibleby_prefsampled	per_groupprefr   	remainings                r5   r8  zPreferenceDataset.sample  s    "8T%8%8ZZ 	;;--/<<8+A4H4H H 	 IFQRVFWG 0%,,Q/0 GAqCL01I&}} Qev}}UC	3u:4NOPQ g,"s7|c(m'C(0E1AW4DQE	Ev}}Y78 g,"s7|c(m'C 2A;==3q#h-+@AA7	 	. Fs(   G$F>G6	G G>GGc                ,    t        | j                        S r[   )r   r  r]   s    r5   __len__zPreferenceDataset.__len__M  s    4;;r4   c                H    t        | j                  j                               S r[   )iterr  r   r]   s    r5   __iter__zPreferenceDataset.__iter__P  s    DKK&&())r4   c                   | j                   5  | j                  j                         D cg c]  }|j                          }}ddd       t	        |d      5 }t        j                  t        j                         j                         d|d       ddd       t        j                  dt               d|        yc c}w # 1 sw Y   xY w# 1 sw Y   ?xY w)zExport dataset to JSON file.Nw)r   exported_atr#   )indentz	Exported z
 pairs to )r   r  r   rf   openr  dumpr   now	isoformatr   r   r   )r^   filepathr   r   fs        r5   export_jsonzPreferenceDataset.export_jsonS  s    ZZ 	@*.++*<*<*>?QQYY[?E?	@ (C  	`AIIhlln6N6N6PQST]^_	` 	iE
|:hZ@A @	@ 	@	` 	`s(   CC C<C CCCc                    t        |d      5 }t        j                  |      }ddd        | |      }j                  dg       D ]&  }|j	                  t
        j                  |             ( |S # 1 sw Y   OxY w)zImport dataset from JSON file.rNr	  r   )rJ  r  loadrh   r$  rx   rm   )rk   rN  r	  rO  rl   dataset	pair_datas          r5   import_jsonzPreferenceDataset.import_json]  sv     (C  	 A99Q<D	  g&'2. 	=IKK00;<	= 	  	 s   A11A:)Nr   )r	  rQ   r  rU   )ro   r   )r   rx   ro   rN   )r   r   ro   z	List[str])r   rx   ro   r   )r+  ry   ro   zOptional[PreferencePair])rO   rN   ro   r   )r2  rL   ro   r   )TN)r   rp   r9  rr   r  zOptional[float]ro   r   rn   )ro   zIterator[PreferencePair]rN  rN   ro   r   r[   )rN  rN   r	  rQ   ro   z'PreferenceDataset')r(   r)   r*   r+   r   r  r  r  r$  r'  r"  rh   r/  r3  r8  rB  rE  rP  rv   rW  r3   r4   r5   r  r  i  s    	 "& #TT T.J6#J	+Z(PP  *.	,B,B ,B (	,B
 
,B\ *B 	 	r4   r  c                      e Zd ZdZ	 	 	 	 d	 	 	 	 	 	 	 ddZddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZddZ	 	 d	 	 	 	 	 	 	 ddZ	ddZ
dd	Zdd
Zedd       Zy)BradleyTerryModela  
    Bradley-Terry model for preference prediction.

    The Bradley-Terry model estimates the probability that response A is
    preferred over response B as:
        P(A > B) = sigma(r_A - r_B)

    Where r_A and r_B are learned "strength" parameters for each response,
    and sigma is the sigmoid function.

    Features:
    - Online and batch parameter estimation
    - Regularization (L2 penalty)
    - Uncertainty estimation via Laplace approximation
    - Efficient incremental updates
    c                    || _         || _        | _        || _        t	        fd      | _        t	        t              | _        d| _        t	        t              | _
        y)a@  
        Initialize Bradley-Terry model.

        Args:
            learning_rate: Learning rate for parameter updates
            regularization: L2 regularization strength
            prior_strength: Prior strength for all responses
            use_confidence_weighting: Weight updates by annotator confidence
        c                      S r[   r3   prior_strengths   r5   <lambda>z,BradleyTerryModel.__init__.<locals>.<lambda>  s    ~ r4   r   N)learning_rateregularizationr^  use_confidence_weightingr   
_strengthsrp   _counts_pair_countrU   _hessian_diag)r^   r`  ra  r^  rb  s      ` r5   r   zBradleyTerryModel.__init__  sV      +,,(@%3>?U3V.9#.> 7B%6Hr4   c                    |dk\  rt        j                  |       }dd|z   z  S t        j                  |      }|d|z   z  S )zNumerically stable sigmoid.r   r$   )mathexp)r^   xzs      r5   _sigmoidzBradleyTerryModel._sigmoid  sA    6!AA;AA;r4   c                f    | j                   |   }| j                   |   }| j                  ||z
        S )z
        Predict probability that A is preferred over B.

        Args:
            response_a: ID of response A
            response_b: ID of response B

        Returns:
            P(A > B) in [0, 1]
        )rc  rl  r^   rz   r{   
strength_a
strength_bs        r5   predict_preferencez$BradleyTerryModel.predict_preference  s4     __Z0
__Z0
}}Z*455r4   c                   dt        | j                  |   | j                  z   d      z  }dt        | j                  |   | j                  z   d      z  }||z   }| j                  ||      }| t	        j
                  |dz         z  d|z
  t	        j
                  d|z
  dz         z  z
  }|t	        j                  |      z  S )a+  
        Get uncertainty in preference prediction.

        Uses Laplace approximation to estimate prediction variance.

        Args:
            response_a: ID of response A
            response_b: ID of response B

        Returns:
            Uncertainty score (higher = more uncertain)
        r   gư>绽|=r$   )r5  rf  ra  rq  rh  logsqrt)r^   rz   r{   var_avar_bvar_diffprobentropys           r5   get_uncertaintyz!BradleyTerryModel.get_uncertainty  s    $ c$,,Z84;N;NNPTUUc$,,Z84;N;NNPTUU 5= &&z:>%$((4%<00AHTTYIY@Z3ZZ8,,,r4   c                   |j                   j                  }|j                  j                  }|j                  }| j                  r|j
                  nd}| j                  ||      }||z
  }||z  }| j                  | j                  |   z  }	| j                  | j                  |   z  }
| j                  |xx   | j                  ||	z   z  z  cc<   | j                  |xx   | j                  ||
z
  z  z  cc<   | j                  |xx   dz  cc<   | j                  |xx   dz  cc<   | xj                  dz  c_        ||z  d|z
  z  }| j                  |xx   |z  cc<   | j                  |xx   |z  cc<   | t        j                  |dz         z  d|z
  t        j                  d|z
  dz         z  z
  }||t        |      | j                  |   | j                  |   dS )z
        Update model parameters from a single preference pair.

        Args:
            pair: Preference pair to learn from

        Returns:
            Dictionary with update metrics
        r   r$   rs  )loss
predictionerrorro  rp  )rz   rM   r{   r   rb  r   rq  ra  rc  r`  rd  re  rf  rh  rt  abs)r^   r   rz   r{   r   weightpredr  grad
reg_grad_a
reg_grad_bhessr}  s                r5   updatezBradleyTerryModel.update  s    __''
__''


$($A$As &&z:> u~ ((4??:+FF
((4??:+FF
 	
#t'9'9TJ=N'OO#
#t'9'9TJ=N'OO# 	Z A% Z A% A }D):&$.&:&$.& v..!e)txxDSXHX?Y1YY Z//*5//*5
 	
r4   c           
        g g d}t        |      D ]  }|j                         }|rt        j                  |       d}d}|D ]=  }	| j	                  |	      }
||
d   z  }|
d   dkD  }|	j
                  dkD  }||k(  s9|dz  }? |r|t        |      z  nd}|r|t        |      z  nd}|d   j                  |       |d   j                  |       t        j                  d	|dz    d
|dd|d        |S )a(  
        Fit model on a batch of preference pairs.

        Args:
            pairs: List of preference pairs
            epochs: Number of training epochs
            shuffle: Whether to shuffle pairs each epoch

        Returns:
            Dictionary with training metrics over epochs
        r}  accuracy        r   r}  r~  r   r$   r  zEpoch z: loss=z.4fz, accuracy=)
rangecopyr   shuffler  r   r   r   r   debug)r^   r   epochsr  metricsepochepoch_pairs
epoch_lossr   r   resultpred_a_winsactual_a_winsavg_lossr  s                  r5   fitzBradleyTerryModel.fit	  s   " 2.6] 	]E**,K{+JG# !T*fVn,
 %\2S8 $

S 0-/qLG! 38zCJ.QH/4wU+!HFO""8,J&&x0LL6%!)GHS>XVYN[\1	]4 r4   c                f    |D cg c]  }|| j                   |   f }}t        |d d      S c c}w )z
        Get ranked list of responses by strength.

        Args:
            response_ids: List of response IDs to rank

        Returns:
            List of (response_id, strength) tuples sorted by strength
        c                    | d   S )Nr$   r3   rj  s    r5   r_  z/BradleyTerryModel.get_ranking.<locals>.<lambda>C  s
    AaD r4   Tkeyreverse)rc  sorted)r^   response_idsridrankeds       r5   get_rankingzBradleyTerryModel.get_ranking8  s;     :FF#3,-FFf.$?? Gs   .c                     | j                   |   S )z*Get the strength parameter for a response.)rc  )r^   r2  s     r5   get_strengthzBradleyTerryModel.get_strengthE  s    {++r4   c                |   t        | j                        t        | j                        t        | j                        | j                  | j
                  | j                  | j                  dd}t        |d      5 }t        j                  ||       ddd       t        j                  d|        y# 1 sw Y   "xY w)zSave model to file.)r`  ra  r^  )	strengthscountshessian_diag
pair_countconfigwbNzModel saved to )ru   rc  rd  rf  re  r`  ra  r^  rJ  picklerK  r   r   )r^   rN  staterO  s       r5   savezBradleyTerryModel.saveI  s     doo.4<<( !3!34**!%!3!3"&"5"5"&"5"5

 (D! 	"QKKq!	"ohZ01	" 	"s   :B22B;c                \   t        |d      5 }t        j                  |      }ddd        | d
i d   t        fd|d         _        t        t
        |d         _        t        t        |d         _        |d   _	        t        j                  d	|        S # 1 sw Y   xY w)zLoad model from file.rbNr  c                      j                   S r[   r]  )models   r5   r_  z(BradleyTerryModel.load.<locals>.<lambda>a  s    u/C/C r4   r  r  r  r  zModel loaded from r3   )rJ  r  rT  r   rc  rp   rd  rU   rf  re  r   r   )rk   rN  rO  r  r  s       @r5   rT  zBradleyTerryModel.loadZ  s     (D! 	#QKKNE	# &eHo&&'CU;EWX#Cx9)%~1FG!,/(
34	# 	#s   B""B+N)r   {Gz?r  T)r`  rU   ra  rU   r^  rU   rb  rr   )rj  rU   ro   rU   )rz   rL   r{   rL   ro   rU   )r   rx   ro   Dict[str, float])r   T)r   r   r  rp   r  rr   ro   zDict[str, List[float]])r  zList[ResponseId]ro   zList[Tuple[ResponseId, float]])r2  rL   ro   rU   rX  )rN  rN   ro   z'BradleyTerryModel')r(   r)   r*   r+   r   rl  rq  r{  r  r  r  r  r  rv   rT  r3   r4   r5   rZ  rZ  n  s    &  # $ #)-II I 	I
 #'I866 6 
	6&-- - 
	-<1
l 	-#- - 	-
 
 -^@,2"  r4   rZ  c                      e Zd ZdZdej
                  dddf	 	 	 	 	 	 	 	 	 ddZddZ	 d	 	 	 	 	 ddZdd	Z	dd
Z
ddZy)PreferenceLearnera  
    Learns from human preferences using various strategies.

    Features:
    - Online, mini-batch, and full batch learning
    - Multiple model backends (Bradley-Terry, neural)
    - Curriculum learning support
    - Learning rate scheduling
    - Validation and early stopping
    N    r   r   c                    |xs
 t               | _        || _        || _        || _        || _        g | _        g | _        t        d      | _	        d| _
        y)aD  
        Initialize preference learner.

        Args:
            model: Preference model (creates default if None)
            strategy: Learning strategy
            batch_size: Batch size for mini-batch learning
            validation_split: Fraction for validation
            patience: Early stopping patience
        infr   N)rZ  r  strategy
batch_sizevalidation_splitpatience_buffer_training_historyrU   _best_val_loss_patience_counter)r^   r  r  r  r  r  s         r5   r   zPreferenceLearner.__init__z  sT    $ 1/1
 $ 0 -/9;#El!"r4   c                ^   | j                   t        j                  k(  r| j                  j	                  |      S | j
                  j                  |       | j                   t        j                  k(  r=t        | j
                        | j                  k\  r| j                  | j
                        S y)z
        Learn from a single preference pair.

        Args:
            pair: Preference pair to learn from

        Returns:
            Metrics if update occurred, None otherwise
        N)r  r>   r?   r  r  r  r   r@   r   r  _train_batchr#  s     r5   learnzPreferenceLearner.learn  s~     ==,333::$$T**D!==,7774<< DOO3((66r4   c                   t        j                  |       t        t        |      d| j                  z
  z        }|d| }||d }g g dd}t        |      D ]  }| j                  |      }|d   j                  |       |r| j                  |      }	|d   j                  |	       |	d   | j                  k  r|	d   | _	        d| _
        ||d<   nI| xj                  dz  c_
        | j                  | j                  k\  rt        j                  d	|         |S | j                  j                  ||d   |d
   |r	d   ndd        |S )z
        Learn from a batch of preference pairs.

        Args:
            pairs: List of preference pairs
            epochs: Number of training epochs

        Returns:
            Training results
        r$   Nr   )train_metricsval_metrics
best_epochr  r  r}  r  zEarly stopping at epoch r  )r  
train_losstrain_accuracyval_loss)r   r  rp   r   r  r  r  r   	_evaluater  r  r  r   r   r  )
r^   r   r  	split_idxtrain_pairs	val_pairsresultsr  train_result
val_results
             r5   learn_batchzPreferenceLearner.learn_batch  sl     	uE
a$*?*?&?@A	JY')*%	  
 6] 	E,,[9LO$++L9 !^^I6
&--j9 f%(;(;;*4V*<D'-.D*,1GL)**a/*-->&>ug$FG  ""))*62".z":2;Jv.	+ +	8 r4   c                0   d}d}t        j                  |       |D ]C  }| j                  j                  |      }||d   z  }|d   dkD  |j                  dkD  k(  s?|dz  }E g | _        |r|t        |      z  nd|r|t        |      z  ndt        |      dS )zTrain on a batch of pairs.r  r   r}  r~  r   r$   )r}  r  n_pairs)r   r  r  r  r   r  r   )r^   r   
total_lossr   r   r  s         r5   r  zPreferenceLearner._train_batch  s    
u 	DZZ&&t,F&.(J|$s*

S0@A1		  05JU+!05#e*,15z
 	
r4   c                   d}d}|D ]  }| j                   j                  |j                  j                  |j                  j                        }|j
                  }| t        j                  |dz         z  d|z
  t        j                  d|z
  dz         z  z
  }||z  }|dkD  |dkD  k(  s|dz  } |r|t        |      z  nd|r|t        |      z  dS ddS )z,Evaluate on a set of pairs without updating.r  r   rs  r$   r   r  )	r  rq  rz   rM   r{   r   rh  rt  r   )r^   r   r  r   r   r  r   r}  s           r5   r  zPreferenceLearner._evaluate  s    
 	D::001C1CT__EWEWXDJJE6DHHTE\22a%i488APTHW\L\C]5]]D$Js
,1	 05JU+!05#e*,
 	
;<
 	
r4   c                    | j                   S )zGet training history.)r  r]   s    r5   get_training_historyz&PreferenceLearner.get_training_history  s    %%%r4   )
r  zOptional[BradleyTerryModel]r  r>   r  rp   r  rU   r  rp   )r   rx   ro   zOptional[Dict[str, float]])r   )r   r   r  rp   ro   rX   )r   r   ro   r  )ro   zList[Dict[str, float]])r(   r)   r*   r+   r>   r@   r   r  r  r  r  r  r3   r4   r5   r  r  n  s    	 .2%5%@%@"%#*# ## 	#
  # #:0 7#7 7 
	7r
(
(&r4   r  c                      e Zd ZdZej
                  df	 	 	 	 	 d	dZ	 d
	 	 	 	 	 ddZddZ	 	 	 	 	 	 	 	 	 	 ddZ		 d	 	 	 	 	 ddZ
y)ActivePreferenceQueryad  
    Selects the most informative comparisons for human annotation.

    Uses information-theoretic criteria to select comparisons that
    will most reduce model uncertainty.

    Features:
    - Multiple query strategies (uncertainty, information gain, diversity)
    - Batch query generation
    - Diversity constraints
    - Budget-aware selection
    r   c                L    || _         || _        || _        t               | _        y)z
        Initialize active query selector.

        Args:
            model: Preference model for uncertainty estimation
            strategy: Query selection strategy
            diversity_weight: Weight for diversity in hybrid strategy
        N)r  r  diversity_weightset_queried_pairs)r^   r  r  r  s       r5   r   zActivePreferenceQuery.__init__$  s$     
  0BE%r4   c                ~   |sg S |D cg c]Q  \  }}|j                   |j                   f| j                  vr(|j                   |j                   f| j                  vr||fS }}}|sg S g }|D ]+  \  }}| j                  ||      }|j                  |||f       - g }t	        |      |k  r|r
|j                  d d       |j                  d      }|j                  |d   |d   f       | j                  j                  |d   j                   |d   j                   f       | j                  dkD  rtt        |      D ]f  \  }	\  }}}| j                  |d   j                   |d   j                   |j                   |j                         }
|||d| j                  |
z  z
  z  f||	<   h t	        |      |k  r|r
|S c c}}w )z
        Select the most informative comparisons.

        Args:
            candidates: List of candidate (response_a, response_b) pairs
            n: Number of queries to select

        Returns:
            Selected comparison pairs
        c                    | d   S )Nr#   r3   r  s    r5   r_  z4ActivePreferenceQuery.select_query.<locals>.<lambda>^  s
    ad r4   Tr  r   r$   )rM   r  _compute_query_scorer   r   sortpopr$  r  	enumerate_compute_similarity)r^   
candidatesr   r   bscoredscoreselectedbesti
similaritys              r5   select_queryz"ActivePreferenceQuery.select_query8  s    I  *
q!add|4#6#66qttD$7$77 F

 
 I  	)DAq--a3EMM1a-(	)
 (maFKKNDK9 ::a=DOOT!Wd1g./##T!WZZa$<= $$q((1&(9 Y$A}1e!%!9!9$q'**d1gjjRSRVRVXYX\X\!]J!"AuD4I4IJ4V0V'W XF1IY (maF A
s   AF9c                     j                   t        j                  k(  r0 j                  j	                  j
                  j
                        S  j                   t        j                  k(  r j                  j	                  j
                  j
                        } j                  j                  j
                  j
                        }| t        j                  |dz         z  d|z
  t        j                  d|z
  dz         z  z
  }||z  S  j                   t        j                  k(  r\ j                  syd} j                  D ]9  \  }} j                  ||j
                  j
                        }	t        ||	      }; d|z
  S  j                   t        j                  k(  r j                  j	                  j
                  j
                        }
d} j                  r%t         fd j                  D              }d|z
  }d j                  z
  |
z   j                  |z  z   S t!        j                          S )z$Compute query informativeness score.rs  r$   r   c              3  t   K   | ]/  \  }}j                  ||j                  j                         1 y wr[   )r  rM   )r   qaqbr   r  r^   s      r5   r   z=ActivePreferenceQuery._compute_query_score.<locals>.<genexpr>  s6      %B ,,RQTT144@%s   58)r  rD   rE   r  r{  rM   rF   rq  rh  rt  rG   r  r  r   rI   r  r   )r^   r   r  uncertaintyry  rz  min_similarityr  r  simuncertainty_scorediversity_scores   ```         r5   r  z*ActivePreferenceQuery._compute_query_scorem  s   ==M555::--addADD99]]m<<<**44QTT144@K::00qtt<D edhhte|44DDHHQQUXX]M]D^7^^G((]]m555&& N-- :B..r2qttQTTB!$^S!9: ~%%]]m222 $

 : :144 F!O""!$ %"&"5"5% " #$n"4---1BBTEZEZ]lElll}}r4   c                8    ||h}||h}t        ||z        }|dz  S )z0Compute similarity between two comparison pairs.r   )r   )r^   a1b1a2b2set1set2overlaps           r5   r  z)ActivePreferenceQuery._compute_similarity  s.     BxBxdTk"}r4   c                    g }t        |      D ]9  \  }}||dz   d D ])  }t        |      |k\  r|c c S |j                  ||f       + ; |S )z
        Generate candidate comparison pairs from responses.

        Args:
            responses: List of responses
            max_pairs: Maximum pairs to generate

        Returns:
            List of candidate pairs
        r$   N)r  r   r   )r^   	responses	max_pairsr  r  r   r  s          r5   generate_candidate_pairsz.ActivePreferenceQuery.generate_candidate_pairs  sg     
i( 	*DAqq1uv& *z?i/%%!!1a&)*	* r4   N)r  rZ  r  rD   r  rU   )r$   )r  List[Tuple[Response, Response]]r   rp   ro   r  )r   rK   r  rK   ro   rU   )
r  rL   r  rL   r  rL   r  rL   ro   rU   )i  )r  List[Response]r  rp   ro   r  )r(   r)   r*   r+   rD   rE   r   r  r  r  r  r3   r4   r5   r  r    s      #0";";"%	H H  H  	H. 333 3 
)	3j'R

&
 
 '
 
	
 !  
)	r4   r  c                  t    e Zd ZdZ	 d		 	 	 	 	 d
dZddZ	 	 	 	 	 	 ddZ	 d		 	 	 	 	 ddZ	 	 	 	 	 	 ddZddZ	y)PreferenceIntegratoran  
    Integrates learned preferences into the training pipeline.

    Bridges the preference learning system with the RLHF training loop,
    providing preference-based rewards and training signals.

    Features:
    - Preference-to-reward conversion
    - Batch reward computation
    - Training signal generation for DPO/PPO
    - Model comparison and ranking
    c                .    || _         || _        || _        y)z
        Initialize preference integrator.

        Args:
            model: Trained preference model
            dataset: Preference dataset
            beta: Temperature parameter for reward scaling
        Nr  rU  beta)r^   r  rU  r  s       r5   r   zPreferenceIntegrator.__init__  s     
	r4   c                f    | j                   | j                  j                  |j                        z  S )z
        Compute reward for a response based on learned preferences.

        Args:
            response: Response to score

        Returns:
            Reward value
        )r  r  r  rM   )r^   responses     r5   compute_rewardz#PreferenceIntegrator.compute_reward  s&     yy4::228;;???r4   c                    | j                   j                  |j                        }| j                   j                  |j                        }| j                  |z  | j                  |z  fS )z
        Compute pairwise rewards for DPO training.

        Args:
            response_a: First response
            response_b: Second response

        Returns:
            Tuple of (reward_a, reward_b)
        )r  r  rM   r  rn  s        r5   compute_pairwise_rewardz,PreferenceIntegrator.compute_pairwise_reward  sQ     ZZ,,Z]];
ZZ,,Z]];
yy:%tyy:'===r4   c                F   | j                   j                  |dz        }g }|D ]  }| j                  j                  |j                  j
                  |j                  j
                        }t        |dz
        |dz  k\  s\|dkD  r|j                  |j                  }}n|j                  |j                  }}|j                  |j                  |j                  |j                  |j
                  |j
                  t        |dz
        dz  d       t        |      |k\  s |S  |S )z
        Get training pairs with clear preference margins.

        Args:
            n: Number of pairs to retrieve
            min_margin: Minimum preference probability margin

        Returns:
            List of training dictionaries
        r#   r   )rO   chosenrejected	chosen_idrejected_idr   )rU  r8  r  rq  rz   rM   r{   r  r   rO   rP   r   )	r^   r   
min_marginr   training_pairsr   prob_ar  r  s	            r5   get_training_pairsz'PreferenceIntegrator.get_training_pairs  s    ##AE* 	DZZ22""""F 6C< JN2C<'+HF'+HF%%"kk$kk (!'#+;;!&3,/!3'  ~&!+3	2 r4   c                    |D cg c]  }|j                    }}| j                  j                  |      }|D ci c]  }|j                   | }}|D cg c]  \  }}||   |f c}}S c c}w c c}w c c}}w )z
        Rank responses for a prompt using preference model.

        Args:
            prompt: The prompt
            responses: Responses to rank

        Returns:
            Ranked list of (response, score) tuples
        )rM   r  r  )	r^   rO   r  rR  r  
ranked_idsid_to_responser  r  s	            r5   rank_responsesz#PreferenceIntegrator.rank_responses,  sv     '0000ZZ++L9
+45a!$$'55?IJe$e,JJ	 1 6Js   A)A.A3c                   t        | j                        }d}|D ]a  }| j                  j                  |j                  j
                  |j                  j
                        }|dkD  |j                  dkD  k(  s]|dz  }c t        |      |r|t        |      z  ndt        t        d |D                    | j                  dS )zGet integration statistics.r   r   r$   c              3     K   | ]7  }|j                   j                  |j                  j                  fD ]  }|  9 y wr[   )rz   rM   r{   )r   r   r  s      r5   r   z6PreferenceIntegrator.get_statistics.<locals>.<genexpr>O  sD      ( OO..0B0BC( ((s   =?)r   model_agreementunique_responsesr  )r   rU  r  rq  rz   rM   r{   r   r   r  r  )r^   r   r   r   r  s        r5   r   z#PreferenceIntegrator.get_statisticsA  s    T\\"  	D::001C1CT__EWEWXDs


S 011	 u:7<wU3! #C ( %( % ! II
 	
r4   N)r   )r  rZ  rU  r  r  rU   )r  rK   ro   rU   )rz   rK   r{   rK   ro   zTuple[float, float])r   rp   r  rU   ro   r   )rO   rN   r  r  ro   zList[Tuple[Response, float]]rs   )
r(   r)   r*   r+   r   r  r  r  r  r   r3   r4   r5   r
  r
    s    " 	  # 	$
@>> > 
	>.  ++ + 
	+ZKK "K 
&	K*
r4   r
  c                 D	  K   t        d       t        d       t        d       t        d       t        dd      } t        d       t        d	      }t        d
       t        dd      }t        d       t	        |t
        j                        }t        d       t        |t        j                        }t        d       t        d       dg dfdg dfg}g }|D ]F  \  }}|D ]<  }	t        t        t        j                               ||	      }
|j                  |
       > H t        dt        |       d       t        d       g d}t!        dt        |      dz
  d      D ]  }|dd D ]  }| j#                  ||   j$                  ||   ||dz            }t        ||   j&                        d kD  rt(        j*                  nt(        j,                  }| j/                  |||t1        j2                  d!d"      t1        j2                  d#d$      %       d{   \  }}|s|s|j5                  |         t        d&t        |       d'       t        d(| j7                                 t        d)       t9        |      }|j;                  |d*+      }t        d,|d-   d.   d/   d0       t        d1       |j=                  |      }|j?                  |d23      }t        d4t        |       d5       |D ][  \  }}|jA                  |jB                  |jB                        }t        d6|j&                  dd   d7|j&                  dd   d8|d9d:       ] t        d;       tE        ||d<      }|jG                  d*3      }t        dt        |       d=       |dd D ]  }t        d>|d?   dd@  dA|dB   dCd:         |j7                         }t        dD|dE   d0       t        dF       t!        tI        d2t        |      dz
              D ]d  }||   ||dz      }}|jK                  |jB                  |jB                        }t        dG|j&                  ddH  dI|j&                  ddH  dJ|d0       f t        dK       t        dL       t        d       | ||||dMS 7 w)Nz+Demonstrate the preference learning system.zF======================================================================z0AIVA Queen - Preference Learning System for RLHFz
1. Initializing components...r   r   )r   r   z$   - PreferenceCollector initializedz:memory:rS  z"   - PreferenceDataset initializedr  )r`  ra  z"   - BradleyTerryModel initialized)r  r  z"   - PreferenceLearner initializedz&   - ActivePreferenceQuery initializedz"
2. Generating sample responses...zWhat is machine learning?)zKMachine learning is a subset of AI that enables systems to learn from data.z!ML is when computers learn stuff.zEMachine learning involves algorithms that improve through experience.zExplain quantum computing.)z>Quantum computing uses qubits that can exist in superposition.z"Quantum computers are really fast.z>Quantum computing leverages quantum mechanics for computation.r   z   Generated z
 responsesz
3. Collecting preferences...)annotator_1annotator_2annotator_3r   r$   r#   N)rO   rz   r{      r   r   r   <   )r   r~   r|   r   r   z   Collected z preference pairsz   Statistics: z 
4. Training preference model...r   )r  z   Final train accuracy: r  r%   r  z.2%z$
5. Selecting informative queries...r"   )r   z   Selected z queries for annotationz   - 'z	...' vs 'z...' (uncertainty: z.3f)z)
6. Integrating with training pipeline...r  z training pairsz   - Chosen: 'r  (   z...' (margin: r   z.2fz!   Model agreement with dataset: r"  z%
7. Testing preference predictions...z   P('   z...' > 'z...'): zG
======================================================================z(Preference Learning System Demo Complete)	collectorrU  r  learner
integrator)&printr   r  rZ  r  r>   r@   r  rD   rE   rK   rN   r   r   r   r   r  r   rO   rP   r!   r-   r1   r   r   uniformr$  r   r   r  r  r  r{  rM   r
  r  r   rq  )r-  rU  r  r.  query_selectorprompts_and_responsesall_responsesrO   textsrP   resp
annotatorsr  	annotatorcomp_idr?  acceptedr   r   r  r  r  r   r  r  r/  r  tpstatsry  s                                 r5   mainr=  [  s    	(O	
<=	(O 

+,#"%I 

01
3G	
./CEE	
./e6F6Q6QRG	
./*AZAZ[N	
23 

/0	$ '
 	
 
& (
 	 M. ' 	'Dtzz|$D
   &	'' 
M#m,-Z
89 

*+>J1c-(1,a0 "#BQ 	"I11$Q'..(+(Q/ 2 G 36mA6F6K6K2Lr2Q%..WiWrWrD#,#>#>%&!>>#s3#)>>"b#9 $? $ NHd DD!'	"", 
M#g,'8
9:	OI4467
89 

-.ME!!%!2G	%go&>r&B:&Ns%S
TU 

1288GJ**:*;H	LX'>
?@ a1++ADD!$$7qvvcr{m9QVVCR[M9L[Y\L]]^_`a
 

67%E7MJ22Q27N	M#n-.o
>?Ra  Ur(|CR018S?QQRSTU %%'E	-e4E.Fs-K
LM 

233q#m,q012 LQq1u!51''add3qvvcr{m8AFF3BK=SzJKL
 
/	
45	(O   ks   HR R	R "R %I9R __main__)Mr+   
__future__r   asynciohashlibr  loggingrh  osr  r   syspathr   elestio_configr   r  psycopg2.extrasr   r   ri   r   abcr   r   collectionsr   r   dataclassesr	   r
   r   r   r   enumr   r   pathlibr   typingr   r   r   r   r   r   r   r   r   r   r   r   basicConfigINFO	getLoggerr   r   rN   rL   r}   ry   r!   r7   r>   rD   rK   rx   r   r   r  rZ  r  r  r
  r=  r(   runr3   r4   r5   <module>rR     s  0 #      	   
 ; < )       # * 0 0 (         
,,A 
		4	5 CL
 $d $t D  %
 %
 %
P J
 J
 J
Z /
 /
 /
lx
 x
~	~ ~Jy y@a& a&Pb bRU
 U
xD zGKK r4   