
    CiE                     .   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ  ej                  e      ZdZdZd	gez  Zd
ed<   dZdefdZdad Zdedee   fdZd0dedededefdZ	 	 d1dededee   dee   fdZ	 	 	 d2dee   dedee   dededefdZ 	 	 	 d3dedee   dededef
dZ!ed k(  rddl"Z"ddl#Z# e"jH                  d!"      Z%e%jM                  dd#$       e%jM                  d%edd&'       e%jM                  d(edd)'       e%jM                  d*dd+,       e%jO                         Z( e#jR                   e!e(jT                  e(jV                  e(jX                  e(jZ                  -            Z. e/ ej`                  e.d.e/             yy)4u	  
Quality Gate — KB Ingestion Pipeline Module 12
===============================================
Auto-generated quiz from KB chunks + RAG accuracy evaluation.

Stories:
  12.01 — generate_quiz:      Create Q&A pairs from random KB chunks via Gemini
  12.02 — evaluate_accuracy:  Run quiz against RAG pipeline and measure hit rate
  12.03 — run_quality_gate:   Full pipeline (generate → evaluate → report)

Usage:
    python3 -m core.kb.quality_gate hubspot
    python3 -m core.kb.quality_gate hubspot --questions 10 --threshold 0.9

Dependencies:
    from core.kb.qdrant_store import search_platform
    from core.kb.embedder import embed_text
    from core.rag_query import rag_query
    from core.kb.pg_store import get_connection
    import google.genai as genai
    N)Optional)search_platform)
embed_text)	rag_queryzgemini-2.0-flashi           gư>a<  You are a QA engineer generating a quiz to test a retrieval system.

Given the following knowledge-base chunk, generate ONE question whose answer is clearly contained within the chunk text.

Requirements:
- The question must be specific and answerable from the chunk alone.
- The answer must be a short phrase or sentence (not a list) extracted or directly inferable from the chunk.
- Output ONLY valid JSON in this exact format (no markdown, no extra text):
{{"question": "<your question here>", "answer": "<short answer here>"}}

Chunk title: {title}
Chunk text:
{text}
returnc                  v   t        j                  dd      } | sd}t         j                  j                  |      rrt	        |      5 }|D ]W  }|j                         }|j                  d      s%|j                  dd      d   j                         j                  d      }  n ddd       | S | S # 1 sw Y   | S xY w)	z9Load GEMINI_API_KEY from environment or secrets.env file.GEMINI_API_KEY z(/mnt/e/genesis-system/config/secrets.envzGEMINI_API_KEY==   z'"N)osgetenvpathexistsopenstrip
startswithsplit)keysecrets_pathfhlines       -/mnt/e/genesis-system/core/kb/quality_gate.py_load_api_keyr   D   s    
))$b
)CA77>>,'l# r D::<D'89"jja0399;AA%H	 J3J Js   'B.-5B..B8c                  ^    t         "ddlm}  t               }| j	                  |      a t         S )z'Return a singleton google.genai Client.Nr   )api_key)_genai_clientgoogle.genaigenair   Client)r    r   s     r   _get_genai_clientr"   V   s*     $/W5    chunkc                 D   t               }t        j                  | j                  dd      | j                  dd      dd       }	 |j                  j                  t        |g      }|j                  j                         }|j                  d      r*|j                  d      d	   }|j                  d
      r|dd }t        j                  |      }d|vsd|vrt        j                  d|dd        y|S # t        $ r }t        j                  d|       Y d}~yd}~ww xY w)z
    Call Gemini to produce a {question, answer} pair for a single chunk.

    Returns None on any error so the caller can skip gracefully.
    titler   textNi  )r&   r'   )modelcontentsz```r   json   questionanswerz0Gemini response missing question/answer keys: %s   z&generate_question_for_chunk failed: %s)r"   _QUIZ_PROMPT_TEMPLATEformatgetmodelsgenerate_contentGEMINI_MODELr'   r   r   r   r*   loadsloggerwarning	Exception)r$   clientpromptresponserawqaexcs          r   _generate_question_for_chunkr?   `   s     F"))ii$YYvr"5D) * F==11X 2 
 mm!!#>>% ))E"1%C~~f%!"gZZ_R82#5NNMsSWTWyY	 ?Es   B-C6 4C6 6	D?DDab	min_wordsc                 v   | j                         j                         }|j                         }|syt        t        |      |z
  dz         D ]  }dj	                  ||||z          }||v s y t        d |D              }|syt        |j                               }t        ||z        t        |      z  }	|	dk\  S )z
    Heuristic: return True if at least min_words consecutive words from `a`
    appear as a sub-sequence in `b` (case-insensitive).

    Used to decide whether a RAG result text contains the expected answer.
    Fr    Tc              3   >   K   | ]  }t        |      d kD  s|  yw)   N)len).0ws     r   	<genexpr>z _text_overlap.<locals>.<genexpr>   s     4Q!14s         ?)lowerr   rangerG   joinset)
r@   rA   rB   a_wordsb_loweriphrasea_uniqueb_wordsoverlap_ratios
             r   _text_overlaprW      s     ggiooGggiG3w<)+a/0 '!a)m45W
 4g44H'--/"G7*+c(m;MCr#      platformnum_questionscustomer_idc                 `   	 t        |  d      }t        || |dd      }|st        j                  d|        g S t        |t        |            }t        j                  ||      }g }|D ]n  }	t        |	      }
|
|j                  |
d   |
d	   |	j                  d
d      |	j                  dd      |	j                  dd      |	j                  dd      d       p t        j                  dt        |      ||        |S # t        $ r'}t        j                  d|       t        }Y d}~d}~ww xY w)aI  
    Generate quiz questions from ingested KB to test RAG accuracy.

    Steps:
    1. Get a broad set of chunks from Qdrant for the platform (up to 200).
    2. Randomly sample num_questions chunks (or all if fewer available).
    3. For each chunk, call Gemini to generate a question answerable from that chunk.
    4. Return list of dicts:
       {question, expected_answer, source_chunk_id, source_text, source_url}

    Chunks that fail Gemini generation are silently skipped.

    Args:
        platform:      The platform to build the quiz for (e.g., "hubspot").
        num_questions: Desired number of quiz items (actual may be lower if
                       fewer chunks exist or Gemini fails for some).
        customer_id:   Optional customer scope for multi-tenant isolation.

    Returns:
        List of quiz item dicts.
    z documentation knowledge basezEembed_text failed during quiz generation (%s), using discovery vectorNr.   r   )query_vectorrY   r[   top_kscore_thresholdz.generate_quiz: no chunks found for platform=%sr,   r-   idr   r'   
source_urlr&   )r,   expected_answersource_chunk_idsource_textra   source_titlez8generate_quiz: generated %d/%d questions for platform=%s)r   r8   r6   r7   _DISCOVERY_VECTORr   infominrG   randomsampler?   appendr1   )rY   rZ   r[   r]   r>   chunkssample_sizesampledquizr$   r=   s              r   generate_quizrp      s7   8)!XJ.K"LM !F DhO	 mS[1KmmFK0GD )%0::!(|$yyr2 99VR0))L"5!IIgr2
 		 KKBD	; KO  )^`cd()s   C= =	D-D((D-皙?ro   pass_thresholdr^   c                 4   | s|dddd|g dgdS d}g }| D ]  }|d   }|d   }	|j                  dd	      }
	 t        ||
      }d}d}|D ]]  }|dk(  r|j                  dd      }|j                  dd	      }|
r|r	|
|k(  rd} n'|j                  dd	      }|sNt        |	|      s[d} n |r|dz  }|j                  ||	|
|t        |d      d        t        |       }|dkD  r||z  nd}||k\  }g }|s`||z
  }|j                  d|dd|dd|dd       |dk  r|j                  d       |dk\  r|j                  d       |j                  d       |||t        |d      ||||dS # t        $ r'}t        j	                  d|dd |       g }Y d}~Pd}~ww xY w)u  
    Run quiz against the RAG pipeline and measure retrieval accuracy.

    For each quiz item:
    1. Call rag_query(question, platform, top_k=top_k).
    2. Mark as "correct" if any result's source_url matches the quiz source_url
       OR if any result's text has significant overlap with the expected_answer.

    Args:
        quiz:            Output from generate_quiz().
        platform:        Platform to evaluate.
        customer_id:     Optional customer scope (reserved for future use).
        pass_threshold:  Minimum accuracy to consider the gate passed (0.0–1.0).
        top_k:           Number of RAG results to check per question.

    Returns:
        {
            "platform":        str,
            "total_questions": int,
            "correct":         int,
            "accuracy":        float,
            "passed":          bool,
            "threshold":       float,
            "details":         list[dict],
            "recommendations": list[str],
        }
    r   r   Fu5   No quiz items provided — run generate_quiz() first.)rY   total_questionscorrectaccuracypassed	thresholddetailsrecommendationsr,   rb   ra   r   )r^   z$rag_query failed for question=%r: %sNP   scoreTr'   r   r+   )r,   rb   ra   found_in_top_ktop_result_scorez	Accuracy z.1%z is z below the z threshold.rK   zLow accuracy (<50%) suggests the platform KB may have very few indexed chunks. Re-run the ingestion pipeline to populate Qdrant.zUConsider increasing chunk overlap or reducing chunk size to improve retrieval recall.zQReview failed items in 'details' to identify poorly chunked or ambiguous content.)	r1   r   r8   r6   r7   rW   rk   roundrG   )ro   rY   r[   rr   r^   ru   ry   itemr,   rb   ra   resultsr>   foundr~   result
result_urlresult_texttotalrv   rw   rz   gaps                          r   evaluate_accuracyr      sD   D   ' WX	
 		
 GG +
#01XXlB/

	6G "% 	F3&#)::gs#;   L"5JjZ:-E !**VR0K}_kJ	  qLG .$# %&6 :
 	K+Z IE"'!)wH'F "$Ox'~T#c+nS=QQ\]	
 c>""D s?""g 	_	

  (A&#*	 	w  	NNA8CR=RUVG	s   E''	F0FFc                   K   t         j                  d| ||       t        | ||      }|s t         j                  d|        | dd|  ddS t	        || ||      }||d<   |d   rd	nd
|d<   t         j                  d| |d   dz  |d          |S w)u  
    Full quality gate pipeline: generate quiz → evaluate → return combined report.

    This is an async function so it can be awaited from async contexts
    (e.g., an orchestrator or FastAPI endpoint), but internally all work
    is synchronous — no I/O concurrency is introduced here.

    Args:
        platform:       Platform KB to evaluate.
        customer_id:    Optional customer scope.
        num_questions:  Number of quiz questions to generate.
        pass_threshold: Accuracy fraction required to pass (default 0.80).

    Returns:
        Combined report dict.  If no chunks found, returns a minimal
        NO_DATA report.
    zHrun_quality_gate: starting for platform=%s, questions=%d, threshold=%.2fuG   run_quality_gate: no chunks found for platform=%s — returning NO_DATANO_DATAzNo chunks found for platform 'z<'. Ensure KB ingestion has been run before the quality gate.)rY   statusmessagero   rw   PASSEDFAILEDr   z7run_quality_gate: platform=%s accuracy=%.1f%% status=%srv   d   )r6   rg   rp   r7   r   )rY   r[   rZ   rr   ro   r   s         r   run_quality_gater   o  s     . KKR-
 =+>D`bjk 7z BS S
 	
 tX{NKFF6N#)(#3xF8
KKA&$s*F8,< Ms   B
B__main__u7   KB Quality Gate — auto quiz + RAG accuracy evaluation)descriptionz$Platform to evaluate (e.g., hubspot))helpz--questionsz2Number of quiz questions to generate (default: 20))typedefaultr   z--thresholdu1   Accuracy pass threshold 0.0–1.0 (default: 0.80)z--customer-idz5Optional customer_id scope for multi-tenant isolation)r   r   )rY   r[   rZ   rr      )indentr   )r+   )rX   N)Nrq   rF   )NrX   rq   )1__doc__r*   loggingr   ri   typingr   core.kb.qdrant_storer   core.kb.embedderr   core.rag_queryr   	getLogger__name__r6   r4   _DISCOVERY_VECTOR_DIMrf   r/   strr   r   r"   dictr?   intboolrW   listrp   floatr   r   argparseasyncioArgumentParserparseradd_argument
parse_argsargsrunrY   r[   	questionsrx   reportprintdumps r#   r   <module>r      s  ,   	   0 ' $			8	$ "  E11  !  &s   $ @ S  S  S    > !%EEE #E 
$Z	E^ "& 
t* # 	
  
P "& 	//#/ / 	/
 
/l z$X$$MF 
)OP
CA   E4@   D   DW[[]]((..>>		
F 
*$**VAs
34= r#   