
    i2                     8   d Z ddlZddlmc mZ ddlZddlZddl	Z	ddl
Z
ddlZddlmZ e
j                  j                  dd       d ZddZddZd Zd	 Zd
 Zd Zd Zd Zd Zd Zedk(  r~eeeeeeeegZdZdZeD ]  Z	  e        edz  Z  e"dd         e"de d e#e       d       er e"de d        e
jH                  d       y e"d        e
jH                  d       yy# e $ r'Z! e"dej6                   de!        edz  ZY dZ![!dZ![!ww xY w)u9  
Tests for Story 3.05 (Track B): MemGPTEscalation — Opus Escalation on 3-Strike

BB1: Escalation builds prompt with all failure details
BB2: Escalation uses ESCALATION_MODEL = "claude-opus-4-6"
BB3: Escalation event written to events.jsonl
BB4: No dispatch function -> returns stub escalated response

WB1: Failure history includes details from all attempts
WB2: On Opus failure (dispatch raises) -> error dict with MVFL_ESCALATION_FAILED
WB3: _build_history_string formats each attempt numbered
WB4: ESCALATION_SYSTEM_PROMPT contains "{failure_history}" placeholder
    N)Pathz/mnt/e/genesis-systemc                 H    t        j                         j                  |       S )z%Run an async coroutine synchronously.)asyncioget_event_looprun_until_complete)coros    6/mnt/e/genesis-system/tests/track_b/test_story_3_05.pyrunr
      s    !!#66t<<    c                     | ddS )NzDo the thing.)task_idprompt )r   s    r	   	make_taskr   "   s    /::r   c                     | d|dS )Nerrorr   statusreasonr   )r   r   s     r	   make_failed_outputr   &   s    'VDDr   c                  0   ddl m}  i fd} | |      }t        d      }ddddddd	dddd
dg}t        |j	                  ||d   |            }d}|v }|st        j                  d|fd|f      t        j                  |      dt        j                         v st        j                        rt        j                        nddz  }t        j                  d      dz   d|iz  }	t        t        j                  |	            dx}}d   }
d}||
v }|st        j                  d|fd||
f      t        j                  |      dt        j                         v st        j                  |
      rt        j                  |
      nddz  }t        j                  d|
       dz   d|iz  }	t        t        j                  |	            dx}}d	}||
v }|st        j                  d|fd||
f      t        j                  |      dt        j                         v st        j                  |
      rt        j                  |
      nddz  }t        j                  d|
       dz   d|iz  }	t        t        j                  |	            dx}}d
}||
v }|st        j                  d|fd||
f      t        j                  |      dt        j                         v st        j                  |
      rt        j                  |
      nddz  }t        j                  d|
       dz   d|iz  }	t        t        j                  |	            dx}}d}||
v }|st        j                  d|fd||
f      t        j                  |      dt        j                         v st        j                  |
      rt        j                  |
      nddz  }t        j                  d|
       dz   d|iz  }	t        t        j                  |	            dx}}t        d       y)zEBB1: The system prompt sent to Opus must contain all failure details.r   MemGPTEscalationc                 T   K   | d<   |d<   |d<   |j                  d      dddS w)Nmodelsystem_prompttask_payloadr   	completedfixedr   r   outputget)r   r   r   captureds      r	   capture_dispatchzBtest_bb1_prompt_contains_failure_details.<locals>.capture_dispatch4   s>     !$1!#/ '++I6+Y`aas   %(dispatch_fnzt-bb1r   zsyntax failurer   zsemantic mismatchzexternal rejection)failure_historyr   inz%(py1)s in %(py3)sr$   py1py3zdispatch_fn was not called
>assert %(py5)spy5Nspz%Expected 'syntax failure' in prompt:
z(Expected 'semantic mismatch' in prompt:
z)Expected 'external rejection' in prompt:
3z&Expected attempt count '3' in prompt:
u2   BB1 PASSED — prompt contains all failure details)core.mvfl.memgpt_escalationr   r   r
   escalate
@pytest_ar_call_reprcompare	_saferepr@py_builtinslocals_should_repr_global_name_format_assertmsgAssertionError_format_explanationprint)r   r%   esctaskhistoryresult@py_assert0@py_assert2@py_format4@py_format6r2   r$   s              @r	   (test_bb1_prompt_contains_failure_detailsrH   .   s   <Hb '7
8CWDw:JKw:MNw:NOG
 dGBKIJFD?h&DDD?hDDD?DDDDDDhDDDhDDDD(DDDDDDD	/	"B Pr!PPPrPPPPPPPPPrPPPrPPPP%KB4#PPPPPPPV"$VVV"VVVVVVVVV"VVV"VVVV(QRTQU&VVVVVVVX2%XXX2XXXXXXXXX2XXX2XXXX)STVSW'XXXXXXX D3"9DDD3"DDD3DDDDDD"DDD"DDDD?tDDDDDDD	
>?r   c                  Z   ddl m} m} d}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      dz  }t        j                  d| d	      d
z   d|iz  }t        t        j                  |            dx}}i fd} | |      }t        |j                  t               t                            d   }d}	||	k(  }|st        j                  d|fd||	f      t        j                  |      t        j                  |	      dz  }
t        j                  dd          dz   d|
iz  }t        t        j                  |            dx}x}}	t!        d       y)z5BB2: The model dispatched to must be claude-opus-4-6.r   r   ESCALATION_MODELclaude-opus-4-6==)z%(py0)s == %(py3)srK   )py0r/   z1ESCALATION_MODEL must be 'claude-opus-4-6', got ''r0   r1   Nc                 @   K   | d<   |j                  d      dddS w)Nr   r   r   okr    r"   )r   r   r   dispatched_tos      r	   r%   z;test_bb2_escalation_model_is_opus.<locals>.capture_dispatch`   s*     !&g'++I6+Y]^^s   r&   r   z%(py1)s == %(py4)sr.   py4zDispatched to wrong model: 
>assert %(py6)spy6u'   BB2 PASSED — model is claude-opus-4-6)r4   r   rK   r6   r7   r9   r:   r;   r8   r<   r=   r>   r
   r5   r   r   r?   )r   rK   rE   @py_assert1rF   rG   r%   r@   rD   @py_assert3@py_format5@py_format7rS   s               @r	   !test_bb2_escalation_model_is_opusr]   V   s   N0 00  0               1    <<L;MQO     M_ '7
8CY["4"678! %6 !%66  !%6    "    &7    &mG&<%=>     

34r   c            	      D   t        j                         5 } t        |       dz  }ddlmc m} |j                  }||_        	 ddlm}  |d      }t        |j                  t        d      t        d                   |j                  } |       }|st        j                  d      dz   d	t        j                          v st        j"                  |      rt        j$                  |      nd	t        j$                  |      t        j$                  |      d
z  }t'        t        j(                  |            dx}}|j+                         j-                         j/                         }	t1        |	      }
d}|
|k\  }|st        j2                  d|fd|
|f      dt        j                          v st        j"                  t0              rt        j$                  t0              nddt        j                          v st        j"                  |	      rt        j$                  |	      ndt        j$                  |
      t        j$                  |      dz  }t        j                  d      dz   d|iz  }t'        t        j(                  |            dx}
x}}t5        j6                  |	d         }|d   }d}||k(  }
|
st        j2                  d|
fd||f      t        j$                  |      t        j$                  |      dz  }t        j                  d|d          dz   d|iz  }t'        t        j(                  |            dx}x}
}|d   }d}||k(  }
|
st        j2                  d|
fd||f      t        j$                  |      t        j$                  |      dz  }t        j                  d|d          dz   d|iz  }t'        t        j(                  |            dx}x}
}|d   }d }||k(  }
|
st        j2                  d|
fd||f      t        j$                  |      t        j$                  |      dz  }t        j                  d!|d          dz   d|iz  }t'        t        j(                  |            dx}x}
}d"}||v }
|
st        j2                  d#|
fd$||f      t        j$                  |      dt        j                          v st        j"                  |      rt        j$                  |      ndd%z  }t        j                  d&      d'z   d(|iz  }t'        t        j(                  |            dx}}
d)}||v }
|
st        j2                  d#|
fd$||f      t        j$                  |      dt        j                          v st        j"                  |      rt        j$                  |      ndd%z  }t        j                  d*      d'z   d(|iz  }t'        t        j(                  |            dx}}
||_        	 ddd       t9        d+       y# ||_        w xY w# 1 sw Y   !xY w),zABB3: An mvfl_escalation event must be appended to the events log.zevents.jsonlr   Nr   r&   zt-bb3zevents.jsonl was not createdzC
>assert %(py4)s
{%(py4)s = %(py2)s
{%(py2)s = %(py0)s.exists
}()
}log_path)rO   py2rV      )>=)z0%(py3)s
{%(py3)s = %(py0)s(%(py1)s)
} >= %(py6)slenlines)rO   r.   r/   rX   zNo events written to logz
>assert %(py8)spy8r(   eventmvfl_escalationrM   rT   rU   zWrong event type: rW   rX   r   Wrong task_id: r   rL   zWrong model in event: 	timestampr*   r,   r-   zMissing timestamp in eventr0   r1   attempt_countzMissing attempt_count in eventu7   BB3 PASSED — escalation event written to events.jsonl)tempfileTemporaryDirectoryr   r4   mvflmemgpt_escalationEVENTS_LOG_PATHr   r
   r5   r   r   existsr6   r<   r9   r:   r;   r8   r=   r>   	read_textstrip
splitlinesrc   r7   jsonloadsr?   )tmpdirr_   modoriginal_pathr   r@   rY   rZ   r[   rd   rE   @py_assert5@py_assert4r\   @py_format9rf   rD   rF   rG   s                      r	   !test_bb3_escalation_event_writtenr|   q   s   		$	$	& 0&<.0 	21++&	0D"t4CYw/1CG1LMN??D?$D$DD&DDDDDDD8DDD8DDD?DDD$DDDDDD&&(..0;;=Eu:>>:?>>>:>>>>>>3>>>3>>>>>>u>>>u>>>:>>>>>>$>>>>>>>>JJuRy)E>]%6]>%66]]]>%6]]]>]]]%6]]]:LUSZ^L\8]]]]]]]]#TwT#w.TTT#wTTT#TTTwTTT/%	BRAS0TTTTTTTT>a%6a>%66aaa>%6aaa>aaa%6aaa:PQVW^Q_P`8aaaaaaaaE;%'EEE;%EEE;EEEEEE%EEE%EEEE)EEEEEEE"M?e+MMM?eMMM?MMMMMMeMMMeMMMM-MMMMMMM"/C104 

CD #0C10 0s$   +XV-X
.X
	XXXc                  |   ddl m} m}  | d      }t        |j	                  t        d      t        d                  }|d   }d}||k(  }|st        j                  d|fd	||f      t        j                  |      t        j                  |      d
z  }t        j                  d|d          dz   d|iz  }t        t        j                  |            dx}x}}|d   }d}||k(  }|st        j                  d|fd	||f      t        j                  |      t        j                  |      d
z  }t        j                  d|d          dz   d|iz  }t        t        j                  |            dx}x}}|d   }||k(  }|st        j                  d|fd||f      t        j                  |      dt        j                         v st        j                  |      rt        j                  |      nddz  }	t        j                  d|d          dz   d|	iz  }
t        t        j                  |
            dx}}d}||v }|st        j                  d|fd||f      t        j                  |      dt        j                         v st        j                  |      rt        j                  |      nddz  }	t        j                  d      dz   d|	iz  }
t        t        j                  |
            dx}}d}||v }|st        j                  d|fd||f      t        j                  |      dt        j                         v st        j                  |      rt        j                  |      nddz  }	t        j                  d      dz   d|	iz  }
t        t        j                  |
            dx}}t!        d       y)zEBB4: Without a dispatch_fn, escalate() returns a stub escalated dict.r   rJ   Nr&   zt-bb4r   	escalatedrM   rT   rU   z"Expected status='escalated', got: rW   rX   r   rh   r   )z%(py1)s == %(py3)srK   r-   zWrong model in stub: r0   r1   r!   r*   r,   rC   z%Missing 'output' key in stub responserj   z,Missing 'attempt_count' key in stub responseu?   BB4 PASSED — stub response returned when no dispatch function)r4   r   rK   r
   r5   r   r   r6   r7   r8   r<   r=   r>   r9   r:   r;   r?   )r   rK   r@   rC   rD   rZ   rE   r[   r\   rF   rG   s              r	   !test_bb4_no_dispatch_returns_stubr      sR   N
t
,Ci02DW2MNOF(c{c{*ccc{cccccc{ccc.PQWX`QaPb,cccccccc)NN'NNNNNNNNNNNN?6);L:M)NNNNNNNN'?Y?..YYY?.YYY?YYYYYY.YYY.YYYY2GwGX0YYYYYYYF8vFFF8vFFF8FFFFFFvFFFvFFFFFFFFFFFT?f$TTT?fTTT?TTTTTTfTTTfTTTT&TTTTTTT	
KLr   c                  .	   ddl m}   |        }ddddddg}|j                  |      }d}||v }|st        j                  d	|fd
||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      nddz  }t        j                  d      dz   d|iz  }t        t        j                  |            dx}}d}||v }|st        j                  d	|fd
||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      nddz  }t        j                  d      dz   d|iz  }t        t        j                  |            dx}}d}||v }|st        j                  d	|fd
||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      nddz  }t        j                  d      dz   d|iz  }t        t        j                  |            dx}}d}||v }|st        j                  d	|fd
||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      nddz  }t        j                  d      dz   d|iz  }t        t        j                  |            dx}}d}||v }|st        j                  d	|fd
||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      nddz  }t        j                  d      dz   d|iz  }t        t        j                  |            dx}}d}||v }|st        j                  d	|fd
||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      nddz  }t        j                  d      dz   d|iz  }t        t        j                  |            dx}}t        d       y)zKWB1: _build_history_string must number each attempt and include all fields.r   r   alpha_errori  )r   code
beta_errori  --- Attempt 1 ---r*   r,   rC   r-   z"Missing '--- Attempt 1 ---' headerr0   r1   Nz--- Attempt 2 ---z"Missing '--- Attempt 2 ---' headerzMissing first attempt reasonzMissing second attempt reason500zMissing first attempt code422zMissing second attempt codeu6   WB1 PASSED — all attempts included in history string)r4   r   _build_history_stringr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   )r   r@   rB   rC   rD   rE   rF   rG   s           r	   'test_wb1_history_all_attempts_in_promptr      s   <

C #.#.G &&w/FN&(NNN&NNNNNNNNN&NNN&NNNN*NNNNNNNN&(NNN&NNNNNNNNN&NNN&NNNN*NNNNNNNB=F"BBB=FBBB=BBBBBBFBBBFBBBB$BBBBBBBC<6!CCC<6CCC<CCCCCC6CCC6CCCC$CCCCCCC85F?8885F8885888888F888F8888888888895F?9995F9995999999F999F99999999999	
BCr   c                     ddl m}  d } | |      }t        |j                  t	        d      t        d                  }|d   }d}||k(  }|st        j                  d|fd	||f      t        j                  |      t        j                  |      d
z  }t        j                  d|d          dz   d|iz  }t        t        j                  |            dx}x}}|d   }d}||k(  }|st        j                  d|fd	||f      t        j                  |      t        j                  |      d
z  }t        j                  d|d          dz   d|iz  }t        t        j                  |            dx}x}}d}||v }|st        j                  d|fd||f      t        j                  |      dt        j                         v st        j                  |      rt        j                  |      nddz  }	t        j                  d      dz   d|	iz  }
t        t        j                  |
            dx}}d}|d   }||v }|st        j                  d|fd||f      t        j                  |      t        j                  |      d
z  }t        j                  d|d          dz   d|iz  }t        t        j                  |            dx}x}}|d   }d}||k(  }|st        j                  d|fd	||f      t        j                  |      t        j                  |      d
z  }t        j                  d      dz   d|iz  }t        t        j                  |            dx}x}}t        d       y) zVWB2: If dispatch_fn raises, escalate() returns error dict with MVFL_ESCALATION_FAILED.r   r   c                     K   t        d      w)NNetwork timeout)RuntimeError)r   r   r   s      r	   failing_dispatchzHtest_wb2_dispatch_exception_returns_error_dict.<locals>.failing_dispatch   s     ,--s   r&   zt-wb2r   r   rM   rT   rU   zExpected status='error', got: rW   rX   NMVFL_ESCALATION_FAILEDz.Expected error='MVFL_ESCALATION_FAILED', got: error_detailr*   r,   rC   r-   z(Missing 'error_detail' in error responser0   r1   r   )z%(py1)s in %(py4)sz'Exception message not in error_detail: r   rL   z!Model not preserved in error dictuL   WB2 PASSED — dispatch exception produces MVFL_ESCALATION_FAILED error dict)r4   r   r
   r5   r   r   r6   r7   r8   r<   r=   r>   r9   r:   r;   r?   )r   r   r@   rC   rD   rZ   rE   r[   r\   rF   rG   s              r	   .test_wb2_dispatch_exception_returns_error_dictr      s   <. '7
8Ci02DW2MNOF([w[w&[[[w[[[[[[w[[[*HPXIYHZ([[[[[[[['? 6 ?66  ?6        7    98IJ     O>V#OOO>VOOO>OOOOOOVOOOVOOOO%OOOOOOO ~ 6  66   6        !7    2&2H1IJ     '?T/T?//TTT?/TTT?TTT/TTT1TTTTTTTT	
XYr   c                      ddl m}   |        }|j                  ddig      }|j                  }d} ||      }|st	        j
                  d|       dz   dt        j                         v st	        j                  |      rt	        j                  |      ndt	        j                  |      t	        j                  |      t	        j                  |      d	z  }t        t	        j                  |            d
x}x}}|j                  dg      }d}||v }	|	st	        j                  d|	fd||f      t	        j                  |      dt        j                         v st	        j                  |      rt	        j                  |      nddz  }
t	        j
                  d      dz   d|
iz  }t        t	        j                  |            d
x}}	d}||v }	|	st	        j                  d|	fd||f      t	        j                  |      dt        j                         v st	        j                  |      rt	        j                  |      nddz  }
t	        j
                  d      dz   d|
iz  }t        t	        j                  |            d
x}}	ddiddiddig}|j                  |      }t        dd      D ]  }d| d}||v }	|	st	        j                  d|	fd||f      t	        j                  |      dt        j                         v st	        j                  |      rt	        j                  |      nddz  }
t	        j
                  d| d|       dz   d|
iz  }t        t	        j                  |            d
x}}	 t        d       y
)zAWB3: Each attempt block is numbered sequentially starting from 1.r   r   kvr   z8First attempt must start with '--- Attempt 1 ---', got:
zN
>assert %(py6)s
{%(py6)s = %(py2)s
{%(py2)s = %(py0)s.startswith
}(%(py4)s)
}result_single)rO   r`   rV   rX   Nraw_error_messager*   r,   
result_rawr-   z'Missing numbered header for raw attemptr0   r1   z!Raw string not included in outputxra            z--- Attempt z ---result_threezMissing '--- Attempt z
 ---' in:
uL   WB3 PASSED — _build_history_string formats each attempt numbered correctly)r4   r   r   
startswithr6   r<   r9   r:   r;   r8   r=   r>   r7   ranger?   )r   r@   r   rY   rZ   ry   r\   r   rD   rE   rF   rG   three_historyr   is                  r	   &test_wb3_build_history_string_numberedr      s   <

C --Szl;M## $7 #$78 8   DM?S             $    %8    9     
 **,?+@AJW*,WWW*WWWWWWWWW*WWW*WWWW.WWWWWWWQ*,QQQ*QQQQQQQQQ*QQQ*QQQQ.QQQQQQQ 1XQx#q2M,,];L1a[ 
aS% 	
%5 	
 	
% 	
 	
 		 & 	
 	
	6	
 	
  *6 	
 	
 		 *6 	
 	
  $A3k,@	
 	
 	
 	
 	


 

XYr   c                  $   ddl m}  d}|| v }|st        j                  d|fd|| f      t        j                  |      dt        j                         v st        j                  |       rt        j                  |       nddz  }t        j                  d      d	z   d
|iz  }t        t        j                  |            dx}}d}|| v }|st        j                  d|fd|| f      t        j                  |      dt        j                         v st        j                  |       rt        j                  |       nddz  }t        j                  d      d	z   d
|iz  }t        t        j                  |            dx}}| j                  dd      }d}||v }|st        j                  d|fd||f      t        j                  |      dt        j                         v st        j                  |      rt        j                  |      nddz  }t        j                  d      d	z   d
|iz  }t        t        j                  |            dx}}d}||v }|st        j                  d|fd||f      t        j                  |      dt        j                         v st        j                  |      rt        j                  |      nddz  }t        j                  d      d	z   d
|iz  }t        t        j                  |            dx}}t        d       y)zQWB4: ESCALATION_SYSTEM_PROMPT must contain {failure_history} and {attempt_count}.r   )ESCALATION_SYSTEM_PROMPTz{failure_history}r*   r,   r   r-   z@ESCALATION_SYSTEM_PROMPT missing '{failure_history}' placeholderr0   r1   Nz{attempt_count}z>ESCALATION_SYSTEM_PROMPT missing '{attempt_count}' placeholderr   ztest history)rj   r)   renderedz0Rendered prompt does not include failure_historyr3   z.Rendered prompt does not include attempt_countuT   WB4 PASSED — ESCALATION_SYSTEM_PROMPT has correct placeholders and formats cleanly)r4   r   r6   r7   r8   r9   r:   r;   r<   r=   r>   formatr?   )r   rD   rE   rF   rG   r   s         r	   &test_wb4_system_prompt_has_placeholderr      s+   D "::  ":          #;    #;    	K       88   8          !9    !9    	I    
 (..QP^._HY>X%YYY>XYYY>YYYYYYXYYYXYYYY'YYYYYYYL3(?LLL3(LLL3LLLLLL(LLL(LLLLLLLLLLL	
`ar   __main__ra   zFAILED: u    — 
z2==================================================zStory 3.05 Test Results: /z passedz test(s)zALL TESTS PASSED)task-001)r   z
bad output)%__doc__builtinsr9   _pytest.assertion.rewrite	assertionrewriter6   r   rt   ossysrk   pathlibr   pathinsertr
   r   r   rH   r]   r|   r   r   r   r   r   __name__testspassedfailedt	Exceptionexcr?   rc   exitr   r   r	   <module>r      sj      	 
   * +=
;E!@P56EFM(D2Z6Z@b. z0)))/6..	E FF 	CaKF 
Bvh-	%fXQs5zl'
BC)* !; $  	HQZZLcU34aKF	s   C--D2DD