
    şi )              
       J   d Z ddlZddlZddlZddlZddlZddlZddlmZm	Z	 ddl
mZmZ  ej                  e      ZdZdZdZg dZd	efd
Zdad Zded	ee   fdZded	efdZdee   d	eee      fdZ	 	 ddee   deded	ee   fdZdZdZ	 	 ddede	e   ded	ee   fdZ y)a  
MODULE 4: Gemini Embedder
==========================
Converts Chunk objects into EmbeddedChunk objects with 3072-dim vectors
using the gemini-embedding-001 model.

Stories implemented:
  4.01 - Single Text Embedder
  4.02 - Batch Embedder with Rate Limiting
  4.03 - Embedding Cache (Redis)
  4.04 - Build Embedding Text Optimizer

Usage:
    from core.kb.embedder import embed_text, embed_batch, embed_with_cache, build_embedding_text
    N)ListOptional)ChunkEmbeddedChunkzgemini-embedding-001i   i@  )         returnc                  v   t        j                  dd      } | sd}t         j                  j                  |      rrt	        |      5 }|D ]W  }|j                         }|j                  d      s%|j                  dd      d   j                         j                  d      }  n ddd       | S | S # 1 sw Y   | S xY w)	z9Load GEMINI_API_KEY from environment or secrets.env file.GEMINI_API_KEY z(/mnt/e/genesis-system/config/secrets.envzGEMINI_API_KEY==r   z'"N)osgetenvpathexistsopenstrip
startswithsplit)keysecrets_pathfhlines       )/mnt/e/genesis-system/core/kb/embedder.py_load_api_keyr   1   s    
))$b
)CA77>>,'l# r D::<D'89"jja0399;AA%H	 J3J Js   'B.-5B..B8c                  ^    t         "ddlm}  t               }| j	                  |      a t         S )z'Return a singleton google.genai Client.r   )genai)api_key)_genai_clientgoogler   r   Client)google_genair   s     r   _get_genai_clientr$   D   s-     0/$++G+<    textc                 @   | r| j                         st        d      t               }d}t        dgt        z         D ]  \  }}|r:t
        j                  d|t        t              |       t        j                  |       	 |j                  j                  t        |       }t        |j                  d   j                        c S  t'        dt        t              d	z    d
|       # t         $ r,}t#        |      t%        fddD              r|}Y d}~ق d}~ww xY w)z
    Embed a single text string using Gemini gemini-embedding-001 model.

    Returns a 3072-dimensional float vector.

    Raises:
        ValueError: If text is empty.
    z+Cannot embed empty or whitespace-only text.Nr   z!embed_text: retry %d/%d after %dsmodelcontentsc              3   &   K   | ]  }|v  
 y wN .0codeerr_strs     r   	<genexpr>zembed_text.<locals>.<genexpr>o        Et47?E   429500503zembed_text failed after r   z attempts: )r   
ValueErrorr$   	enumerate_RETRY_DELAYSloggerwarninglentimesleepmodelsembed_contentEMBED_MODELlist
embeddingsvalues	ExceptionstranyRuntimeError)r&   clientlast_excattemptdelayresultexcr1   s          @r   
embed_textrQ   R   s    tzz|FGG F %)H#QC-$78 NN>]I[]bcJJu	]]00! 1 F ))!,3344& 
"3}#5#9":+hZP   	#hGE/DEE	s   AC((	D1!DDDchunkc                    | j                   rd| j                    dnd}| j                  r*| j                  j                         rd| j                   dnd}||fD cg c]  }|s|	 }}|rdj                  |      dz   }nd}| | j                   d| j
                   }t        j                  dd|      }t        j                  dd|      }t        |      t        kD  r	|d	t         }|S c c}w )
a-  
    Build optimised text for embedding.

    Format: [{platform}] [{heading_context}] {title}\n{text}

    - Heading context is included when present.
    - Total output is capped at MAX_EMBEDDING_CHARS (8000 chars).
    - Excessive whitespace (multiple spaces/newlines) is normalised to single.
    []r    
z[ \t]+z\n{3,}z

N)
platformheading_contextr   jointitler&   resubr>   MAX_EMBEDDING_CHARS)rR   platform_tagheading_tagpprefix_partsprefixraws          r   build_embedding_textre   ~   s     -2NNQu~~&a(L272G2GELaLaLgLgLiAe++,A.oqK ,k:@!aA@L@,'#-HU[[MEJJ<
0C &&C
%C
&&FC
(C 3x%%&&'J! As   C,%C,textsc                    t               }d}t        dgt        z         D ]  \  }}|r,t        j	                  d||       t        j                  |       	 |j                  j                  t        |       }|j                  D cg c]  }t        |j                         c}c S  t!        d|       c c}w # t        $ r,}t        |      t        fddD              r|}Y d}~Ȃ d}~ww xY w)zEmbed multiple texts in a single API call using Gemini's batch endpoint.

    Returns list of vectors (same order as input texts).
    Retries on transient errors (429, 500, 503).
    Nr   z&_embed_texts_batch: retry %d after %dsr(   c              3   &   K   | ]  }|v  
 y wr,   r-   r.   s     r   r2   z%_embed_texts_batch.<locals>.<genexpr>   r3   r4   r5   z)_embed_texts_batch failed after retries: )r$   r:   r;   r<   r=   r?   r@   rA   rB   rC   rE   rD   rF   rG   rH   rI   rJ   )	rf   rK   rL   rM   rN   rO   erP   r1   s	           @r   _embed_texts_batchrj      s      F$(H#QC-$78 NNCWeTJJu	]]00! 1 F -3,=,=>qDN>>  B8*M
NN ? 	#hGE/DEE	s0   /B:B5!B:5B::	C/!C*)C**C/chunks
batch_sizemax_rpmc           	         g }t        |       }g }t        d||      D ](  }| |||z    }t        j                         }|D 	cg c]  }	||	z
  dk  s|	 }}	t        |      |k\  r>d||d   z
  z
  dz   }
|
dkD  r+t        j                  d|
       t        j                  |
       |D cg c]  }t        |       }}	 t        |      }|j                  t        j                                t        ||      D ])  \  }}|	|j                  t        ||t                     + t!        ||z   |      }t        j                  d	||       + |S c c}	w c c}w # t        $ r}t        j                  d|       g }|D ]Z  }	 |j                  t        |             # t        $ r1}t        j                  d|       |j                  d       Y d}~Td}~ww xY w Y d}~d}~ww xY w)
ah  
    Embed multiple chunks with true API batching, rate limiting, and progress.

    - Sends up to batch_size texts per API call (massive speedup vs 1-at-a-time).
    - Rate-limits to max_rpm requests per minute.
    - Retries on transient errors (429, 500, 503) with exponential backoff.
    - Returns list of EmbeddedChunk (chunk + vector + model name).
    r   g      N@g?zRate limit: sleeping %.1fsz4Batch embed failed (%s), falling back to 1-at-a-timezSingle embed failed: %sN)rR   vectorembedding_modelzEmbedded %d/%d chunks...)r>   ranger?   	monotonicr<   infor@   re   rj   rG   r=   appendrQ   zipr   rC   min)rk   rl   rm   embeddedtotalrequest_timestampsbatch_startbatch_chunksnowt	sleep_forcrf   vectorsrP   r&   
single_excrR   ro   dones                       r   embed_batchr      s    %'HKE&(Qz2 %=kK*,DE nn);NAsQw~aNN!"g-&8&; ;<sBI1}8)D

9% 3??Q%a(??	)(/G 	!!$.."23 w7 	ME6!!fkZ	 ;+U3.e<K%=N OE O @  		)NNQSVWG ))NN:d#34  )NN#<jINN4(())			)sT   EE%E:E	G G<FG	G	 'G	GG	GG i:	 zgenesis:kb:embed:	cache_key	redis_urlc                     ddl }|s2t        j                  | j                  d            j	                         }t
         | }d}	 |j                  |dd      }|j                          |C	 |j                  |      }|r.t        j                  d|dd        t        j                  |      S 	 t        |       }|K	 |j!                  |t"        t        j$                  |             t        j                  d
|dd t"               |S |S # t        $ r"}t        j                  d|       d}Y d}~d}~ww xY w# t        $ r }t        j                  d	|       Y d}~d}~ww xY w# t        $ r!}t        j                  d|       Y d}~|S d}~ww xY w)aG  
    Embed text, caching result in Redis to avoid re-embedding unchanged content.

    - Cache key = SHA-256 of text content (or provided cache_key).
    - Redis stores the embedding as a JSON list of floats.
    - 7-day TTL on each cache entry.
    - If Redis is unavailable, falls back to direct embed_text() (no crash).
    r   Nzutf-8r   )socket_connect_timeoutsocket_timeoutu7   Redis unavailable (%s) — falling back to direct embedzCache HIT for key %s   u(   Redis GET failed (%s) — skipping cachezCache SET for key %s (TTL=%ds)u2   Redis SET failed (%s) — continuing without cache)redishashlibsha256encode	hexdigest_CACHE_PREFIXfrom_urlpingrG   r<   r=   getdebugjsonloadsrQ   setex
_CACHE_TTLdumps)	r&   r   r   	redis_libfull_keyredis_clientrP   cachedro   s	            r   embed_with_cacher      s     NN4;;w#78BBD	,HL )))A^_)` 	L!%%h/F3Ys^Dzz&))  F 	VxTZZ5GHLL99Sb>:V M6M3  PRUV  	LNNEsKK	L  	VNNOQTUUM	VsJ   $D	 -A D7 =AE# 		D4D//D47	E  EE #	F,FF)2   ix  )NzOredis://default:e2ZyYYr4oWRdASI2CaLc-@redis-genesis-u50607.vm.elestio.app:26379)!__doc__r   r   loggingr   r\   r?   typingr   r   core.kb.contractsr   r   	getLogger__name__r<   rC   
VECTOR_DIMr^   r;   rH   r   r    r$   floatrQ   re   rj   intr   r   r   r   r-   r%   r   <module>r      s6       	 	  ! 2 
		8	$
 %
 s   %S %T%[ %X # HOd3i ODe,= O: 8K88 8 
-	8~ 
#
  $f1
1}1 1 
%[	1r%   