
    ϟip                    .   d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	m
Z
mZmZmZmZ ddlmZmZmZmZmZ ddlmZmZ ddlmZmZmZmZmZ ddlmZ dd	l m!Z!m"Z" dd
l#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0  ejb                  e2      Z3	 	 d	 	 	 	 	 	 	 	 	 	 	 ddZ4	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 ddZ5	 d	 	 	 	 	 	 	 ddZ6ddZ7ddZ8d dZ9d!dZ:d"d#dZ;d!dZ<e2dk(  r0 ejz                  ej|                  ej~                          e<        yy)$u  
KB Ingestion Orchestrator — MODULE 8
======================================
Wires M1-M7 together into a complete pipeline:
  fetch → extract → chunk → embed → store

Stories implemented:
  8.01 — ingest_platform() — Full 10-step pipeline
  8.02 — ingest_url()      — Single URL ingestion
  8.03 — CLI Interface (argparse, subcommands: ingest / ingest-url / status / list)
  8.04 — Progress Reporting (_report_progress)
  8.05 — Error Recovery (page-level and chunk-level failures don't crash the pipeline)

Usage (CLI):
    python3 -m core.kb.orchestrator ingest telnyx
    python3 -m core.kb.orchestrator ingest hubspot --customer-id cust_001 --max-pages 50
    python3 -m core.kb.orchestrator ingest-url https://docs.example.com/page --platform hubspot
    python3 -m core.kb.orchestrator status telnyx
    python3 -m core.kb.orchestrator list

Progress is written to stderr; JSON stats are written to stdout.
    )annotationsN)AnyCallableDictListOptional)ChunkEmbeddedChunkExtractedContentFetchedPagePlatformConfig)get_platformlist_platforms)compute_content_hashfetch_pagesfetch_sitemapfilter_unchangedfilter_urls)extract_batch)chunk_batch
tag_chunks)embed_batchbuild_embedding_text)upsert_vectors)ensure_schemaget_connectionget_content_hashesget_ingestion_historylog_ingestion_completelog_ingestion_startupsert_pageupsert_pages_batchc           	     "   |dkD  r!t        d|z  |z        }d|  d| d| d| d	}n	d|  d| d}|r| d	| }t        |t        j                  d
       |	  || |||       yy# t        $ r }t
        j                  d|       Y d}~yd}~ww xY w)a  Report pipeline progress to stderr and optionally to a callback.

    Safe against zero-total (no division error).

    Parameters
    ----------
    step:     Short label for the current pipeline step (e.g. "fetch").
    current:  Items completed so far.
    total:    Total items (0 is safe).
    message:  Optional free-form detail appended to the output line.
    callback: Optional callable(step, current, total, message) for consumers.
    r   d   [z] /z (z%)z/0u    — TfileflushNzprogress_callback raised: %s)intprintsysstderr	Exceptionloggerwarning)stepcurrenttotalmessagecallbackpctlineexcs           -/mnt/e/genesis-system/core/kb/orchestrator.py_report_progressr:   H   s    & qy#-%'(4&7)1UG2cU"54&7)2&uWI&	$SZZt,	@T7E73   	@NN93??	@s   A% %	B.B		Bc           
       K   t        j                          }t        |       }|t        d|  dt                      t	        dddd|j
                   d|       |j
                  |dddddg d	d
	}d}d}		 t               }t        |       t        ||j
                  |      }	g }
|j                  r8t	        ddd|j                  |       	 t        |j                         d{   }
n|j                   g}
t	        dddt#        |
       d|       |j$                  s|j&                  r!t)        |
|j$                  |j&                        }
||n|j*                  }t#        |
      |kD  r|
d| }
t	        dt#        |
      t#        |
      t#        |
       d|       i }|s	 t-        ||j
                  |      }|r|st#        |
      }t/        |j1                               }|
D cg c]	  }||vs| }
}|t#        |
      z
  }||d<   |rBt        j3                  d|       ddl}t7        d| dt#        |
       d|j8                  d       t	        ddt#        |
      d|       g }	 t;        |
d       d{   }g }|D ]_  }|j>                  d!k(  r|j                  |       $|dxx   dz  cc<   |d   j                  |j@                  dd"|j>                   d       a t#        |      |d#<   t	        dt#        |      t#        |
      t#        |       d$|       |r|s	 tC        ||       d{   }n|}t#        |      t#        |      z
  }|jE                  dd      |z   |d<   t	        d&t#        |      t#        |      | d'|       |s1tG        ||       |	tI        ||	|       ||	 |jK                          S S t	        d(dt#        |      d)|       tM        |      }|D cg c]  }||	 }}tO        ||      D ]7  \  }}|	|dxx   dz  cc<   |d   j                  |j@                  d(d*d       9 t	        d(t#        |      t#        |      t#        |       d+|       t	        d,dt#        |      d-|       g }	 tQ        |||.      }t	        d,t#        |      t#        |      t#        |       d0|       tS        ||j
                  |      }t	        d1dt#        |      d2|       	 tU        |d34      }t	        d1t#        |      t#        |      t#        |       d6|       t#        |      |d7<   t	        d8ddd9|       d}|r	 tW        |      }||d<<   i } i }!|D ]I  }"|"jX                  jZ                  }#| jE                  |#d      dz   | |#<   |"jX                  j\                  |!|#<   K 	 |jK                          t               }g }$|D ]{  }| jE                  |j@                  d      }%|!jE                  |j@                  d=      }&t_        |j`                        }'|$j                  |j
                  |j@                  |&|'|%|df       } 	 tc        ||$       t	        d8dd| dA|       tG        ||       |	tI        ||	|       t	        dBdddC|dD   dEdF|       ||	 |jK                          S S 7 # t        $ rd}t        j                  d|j                  |       |dxx   dz  cc<   |d   j                  |j                  dt        |      d       Y d}~!d}~ww xY w# t        $ r!}t        j                  d|       Y d}~d}~ww xY wc c}w 7 # t        $ rO}t        j=                  d|       |dxx   dz  cc<   |d   j                  d dt        |      d       Y d}~.d}~ww xY w7 # t        $ r#}t        j                  d%|       |}Y d}~d}~ww xY w# t        $ r Y S w xY wc c}w # t        $ rO}t        j=                  d/|       |dxx   dz  cc<   |d   j                  d d,t        |      d       Y d}~d}~ww xY w# t        $ rQ}t        j=                  d5|       g }|dxx   dz  cc<   |d   j                  d d1t        |      d       Y d}~d}~ww xY w# t        $ rO}t        j=                  d:|       |dxx   dz  cc<   |d   j                  d d;t        |      d       Y d}~d}~ww xY w# t        $ r Y [w xY w# t        $ r}t        j                  d>|       	 |jK                          n# t        $ r Y nw xY w	 t               }tc        ||$       nZ# t        $ rN}(t        j                  d?|(       |dxx   dz  cc<   |d   j                  d d@t        |(      d       Y d}(~(nd}(~(ww xY wY d}~~d}~ww xY w# t        $ r Y S w xY w# t        $ rV}tG        ||       |dxx   dz  cc<   |d   j                  dGdHt        |      d       |	|dI|dJ<   tI        ||	|        d}~ww xY w# |!	 |jK                          w # t        $ r Y w w xY ww xY ww)KuY  
    Full ingestion pipeline for a registered platform.

    Steps
    -----
    1.  Load platform config from registry (raises ValueError for unknown).
    2.  Fetch sitemap → list of candidate URLs.
    3.  Apply include/exclude URL filters.
    4.  Apply max_pages cap if specified.
    5.  Load content hashes from PG (skip if force_refresh=True).
    6.  Fetch pages concurrently (aiohttp).
    7.  Filter unchanged pages (compare content hashes).
    8.  Extract content (BeautifulSoup).
    9.  Chunk with heading context.
    10. Tag chunks with platform + customer_id.
    11. Embed with Gemini (batch).
    12. Upsert to Qdrant + PG metadata.
    13. Log ingestion run to PG.

    Returns
    -------
    dict with keys:
        pages_fetched, pages_skipped, chunks_created,
        vectors_upserted, errors, error_details, duration_seconds
    NUnknown platform: ''. Available: init   z
Platform 'z' loadedr           )	platformcustomer_idpages_fetchedpages_skippedchunks_createdvectors_upsertederrorserror_detailsduration_secondssitemapzSitemap fetch failed for %s: %srG   rH   urlr1   errorz URLs foundfilterz after filter/capz!Could not load content hashes: %srD   z,Pre-filter: skipped %d already-ingested URLsz[prefetch-skip] u!    URLs already in PG — fetching z newTr'   fetchzstarting fetch   concurrencyzfetch_pages failed: %sbatch   HTTP rC   z successfulzfilter_unchanged failed: %sdedupz skipped (unchanged)extractzextracting contentextraction returned Nonez
 extractedchunkchunkingrB   zchunk_batch failed: %sz chunks createdembed	embedding2   
batch_sizezembed_batch failed: %sz	 embeddedrE   storezupserting to Qdrantzupsert_vectors failed: %sqdrant_upsertrF    z5Batch PG upsert failed (%s), retrying with fresh connz Batch PG upsert retry failed: %s	pg_upsertz vectors upserteddonezComplete in rI   z.1fspipelineorchestratorfailedstatus)2timer   
ValueErrorr   r:   namer   r   r    sitemap_urlr   r.   r/   r0   appendstrdocs_base_urllenurl_patternsexclude_patternsr   	max_pagesr   setkeysinfor,   r+   r-   r   rM   status_coderL   r   get_finalize_stats_log_complete_safecloser   zipr   r   r   r   rY   
source_urltitler   htmlr"   ))rA   rB   ru   force_refreshprogress_callbackt_startconfigstatsconnlog_idcandidate_urlsr8   effective_maxknown_hashespre_filter_count
known_urlsuskipped_prefetchr,   fetched_pages
good_pagespagechanged_pagescontent_skippedextracted_rawe	extractedresultchunkstagged_chunksembeddedrF   url_chunk_count	url_titleecrL   page_tupleschunk_countr   content_hash	retry_excs)                                            r9   ingest_platformr   q   s
    @ iikG (4H'=F~.xj 9''5'7&8: ; 	; VQZ}H#EGXY KK"
E D F{d$T6;;D %'Y1f.@.@BST'4V5G5G'H!H %223NAqS-@,A*MO`a &"9"9(##''N &/%:	@P@P~.+N];N3~#6N8K/00ABDU	W (*I1$[Q
 ">2\..01J)7OA1J;NaONO/#n2EE%5E/"JL\]&'7&88YZ]^lZmYnnrs4 	!S%8:JL]^+-	"-n!"LLM )+
! 	D3&!!$'h1$o&-- HHg %d&6&6%78:	 "%Zo#m"4c.6I
O,K8:K	M +&6z<&P P
 'Mj/C,>>!&?A!>!Po#m"4c*o+,,@ACT	V E7+!"47T 

 O 	As='9;OQbc%m48E,W1Q,W	,W}= 	LD&~h1$o&-- HHiB\]	 	C	NC4F	N+:68I	K 	!S^ZARS 	 FLF 	#f+s6{K=8:K	M #66;;D 	!S%7FWX	"=R@H 	#h-]1CM?)46G	I #&h- 	!Q(=?PQ#1(#;  %5 ! +-$&	 	,B((%%C#2#6#6sA#>#BOC XX^^IcN	,	JJL  ! 	D)--dhh:KMM$((B/E/		:LTXXul[$  			t[1$ 	!Q,-->?AR	T 	w'tVU3A'.@(A#'FaHJ[	] 

 [ "I @&BTBTVYZh1$o&--"..	CPSHU B  IBCHHI P M 	LL137(Oq O/"))3s8D 	4 !Q +<cB *+x  Q -X"  	LL137(Oq O/"))3s8D 	"  	LL137H(Oq O/"))3s8D 		&  8#>h1$o&--#_s3xP ,  		"  	NNRTWX

 %'"45 A9Mh1$o&--#[3y>R 	X    	w'h1o%%SJ	
 $"2&E(OtVU3	 

  s  A2i/5Ag% 	Z- !Z*"Z- &B6g% \ 4,g%  	]
*]
.A3g% "] 2]3] 7B%g% ^0 ,^--^0 1A4g% %i/(_8i/:'g% !_.)_.-g% A8g% =_3 Ag% (a 5Ag% b+ Ag% 'd 7Bg% d Ag% i/g(i/*Z- -	\6A\g% \g% 	]&]<g% ]g% ] 	^*A^%g% %^**g% -^0 0	_9_g% _g% 	_+(i/*_++i/.g% 3	a<Aa g% ag% 	b(Ab#g% #b((g% +	d4Ac>8g% >dg% 	dg% dg% 	gg6eg	egege.-g.	g7Ag ;g ggg% gg% 	g"i/!g""i/%	i.Ah??ii i,ii,	i(%i,'i((i,,i/c                  K   t        j                          }t        |      }|t        d| dt                      |j                  || dddddg dd
}d}	 t               }t        |       t        | gd       d{   }|sG|d	xx   dz  cc<   |d
   j                  | ddd       t        ||       ||	 |j                          S S |d   }|j                  dk7  rT|d	xx   dz  cc<   |d
   j                  | dd|j                   d       t        ||       ||	 |j                          S S d|d<   t        |g      }	|	D 
cg c]  }
|
|
	 }}
|sG|d	xx   dz  cc<   |d
   j                  | ddd       t        ||       ||	 |j                          S S t        |||      }t        ||j                  |      }	 t!        |d      }t)        |      |d<   |rx	 t+        |      |d<   	 |j                          t               }t-        |j.                        }|r|d   j0                  nd}	 t3        ||j                  | ||t)        |      |       |	 |j                          t        ||       |S 7 # t        $ r Y S w xY w# t        $ r Y S w xY wc c}
w # t        $ r Y S w xY w# t        $ rQ}g }|d	xx   dz  cc<   |d
   j                  | dt#        |      d       t$        j'                  d|       Y d}~3d}~ww xY w# t        $ r9}|d	xx   dz  cc<   |d
   j                  | dt#        |      d       Y d}~Yd}~ww xY w# t        $ r Y \w xY w# t        $ rO}t$        j'                  d|       |d	xx   dz  cc<   |d
   j                  | dt#        |      d       Y d}~[d}~ww xY w# t        $ r Y \w xY w# |!	 |j                          w # t        $ r Y w w xY ww xY ww)u  
    Ingest a single URL into the KB.

    Runs the full pipeline (fetch → extract → chunk → embed → store) for
    exactly one URL.  Does NOT check content hashes (always re-ingests).

    Returns
    -------
    dict with the same keys as ingest_platform().
    Nr<   r=   r   r@   )
rA   rB   rL   rC   rD   rE   rF   rG   rH   rI   r?   rQ   rG   rH   rO   zNo pages returnedrK   rT   rU   rC   rW   rX   r[   r^   r_   r\   zEmbed failed: %srE   rF   rb   rc   zupsert_page failed: %srd   )rk   r   rl   r   rm   r   r   r   ro   r{   r}   r.   ry   r   r   r   r   rp   r/   r0   rr   r   r   r   r   r!   )rL   rA   rB   r   r   r   r   pagesr   r   r   r   r   r   r   r8   r   r   s                     r9   
ingest_urlr     s     iikG'3H'=F~.xj 9''5'7&8: ; 	; KK"E D_d "3%Q77(Oq O/"))W7JK E7+^ 

 [ Qxs"(Oq O/"))W!$"2"2!346 E7+J 

 G "#o &tf- -?1Q?	?(Oq O/"))Y9ST E7+p 

 k YKH"66;;D	4"=R@H #&h- ,:8,D()

 "#D/		:L*3IaL&&EKK M +" 

 E7#L{ 8r  9 E @D  e  	4H(Oq O/"))Ws3x@ NN-s33	4  h1$o&--3s8L      7=h1$o&--s3xH    

  s  AO5&O =I;>8O 6O59I>	O5AO O5!J1O53O JJ6O 
O5J!O5%O J0 O #L 2M 2O 5%M" O5N= -O5;O >	J
O5	J

O5	JO5JO5O !	J-*O5,J--O50	L
9AL?O L

O 	M.M
O 
MO 	MO MO "	N:+AN5/O 5N::O =	O
O5	O

O5O2O"!O2"	O.+O2-O..O22O5c                `    t        t        j                         |z
  d      | d<   d| vrd| d<   yy)z;Stamp duration and default status onto stats dict in-place.   rI   rj   	completedN)roundrk   )r   r   s     r9   r{   r{   9  s5     %diikG&;Q ?E
u%h     c                x    	 t        | ||       y# t        $ r }t        j                  d|       Y d}~yd}~ww xY w)z8Call log_ingestion_complete(), swallowing any exception.z!log_ingestion_complete failed: %sN)r   r.   r/   r0   )r   r   r   r8   s       r9   r|   r|   @  s8    AtVU3 A:C@@As    	949c                 h   t        j                  dd      } | j                  dd      }|j                  dd	       |j                  d
d	      }|j	                  dd	       |j	                  ddd       |j	                  dt
        dd       |j	                  ddd       |j                  dd	      }|j	                  dd	       |j	                  ddd        |j	                  ddd       |j                  d!d"	      }|j	                  dd#	       |j	                  d$t
        d%d&       | S )'z)Build and return the CLI argument parser.zpython3 -m core.kb.orchestratorz!Genesis KB Ingestion Orchestrator)progdescriptioncommandCOMMAND)destmetavarlistzList all registered platforms)helpingestzIngest a full platform KBrA   z$Platform name (e.g. hubspot, telnyx)z--customer-idNz/Optional customer ID for multi-tenant isolation)defaultr   z--max-pageszLimit number of pages to ingest)typer   r   z--force-refresh
store_truez0Re-ingest all pages, ignoring content hash cache)actionr   
ingest-urlzIngest a single URL into the KBrL   zFull URL to ingestz
--platformTz"Platform name for metadata tagging)requiredr   rj   z%Show ingestion history for a platformzPlatform namez--limit
   z+Number of recent runs to show (default: 10))argparseArgumentParseradd_subparsers
add_parseradd_argumentr*   )parsersubp_ingestp_urlp_statuss        r9   _build_parserr   L  sj   $$.7F 

Y	

BC NN6 ?N@ ~~h-H~IH*+QR/4P  R-c4@  B+LQ  S NN<.ONPE	u#78	|d@  B	M  O ~~h-T~UH*?;)#rL  N Mr   c                     t               } | st        dt        j                         yt        dt	        |        dt        j                         | D ]  }t        d|         y)z$Print all registered platform names.zNo platforms registered.r(   NzRegistered platforms (z):z  )r   r+   r,   r-   rr   )	platformsrm   s     r9   	_cli_listr   r  sY     I(szz:	"3y>"2"
5CJJG 4&kr   c                   t        |       }|4t        d|  dt        j                         t        j                  d       d}	 t               }t        ||j                  |      }|r|j                          	 |s t        d|  dt        j                         yt        d|j                   d	t        |       d
t        j                         |D ]  }t        d|j                  dd       d|j                  d       d|j                  d       d|j                  d       d|j                  d       d|j                  d       t        j                          y# |r|j                          w w xY w)z'Print ingestion history for a platform.Nr<   'r   r?   )limitzNo ingestion history for 'zIngestion history for 'z' (last z runs):z  [
started_at?z	] status=rj   z pages=rC   z chunks=rE   z	 vectors=rF   z errors=rG   )r   r+   r,   r-   exitr   r   rm   r}   display_namerr   rz   )rA   r   r   r   historyruns         r9   _cli_statusr   }  sa   (#F~#H:Q/cjjAD'fkkGJJL*8*A6SZZH	#F$7$7#8Wg
Vzz 	
#'',,- .ggh'( )WW_-. /gg./0 1ww123 4ggh'(* 	
	
 JJL s   "E! !E6c                    t               } | j                         }|j                  4| j                  t        j
                         t	        j                  d       |j                  dk(  rt                y|j                  dk(  rq	 t        j                  t        |j                  |j                  |j                  |j                              }t        t!        j"                  |dt$                     y|j                  d
k(  rf	 t        j                  t)        |j*                  |j                  |j                              }t        t!        j"                  |dt$                     y|j                  dk(  r"t-        |j                  t/        |dd             yt        d|j                   t        j
                  	       t	        j                  d       y# t&        $ r=}t        d| t        j
                  	       t	        j                  d       Y d}~yd}~ww xY w# t&        $ r=}t        d| t        j
                  	       t	        j                  d       Y d}~yd}~ww xY w)zCLI entry point.Nr?   r   r   )rA   rB   ru   r      )indentr   zError: r   r   )rL   rA   rB   rj   r   r   zUnknown command: )r   
parse_argsr   
print_helpr,   r-   r   r   asyncior   r   rA   rB   ru   r   r+   jsondumpsrp   rl   r   rL   r   getattr)r   argsr   r8   s       r9   _mainr     s   _FD||#**%||v		!	KK!]] $ 0 0"nn"&"4"4	E $**U1c:;
 
	%	KK!]] $ 0 0E $**U1c:;
 
	!DMM74"#=> 	!$,,0szzB/  	GC5/

3HHQKK	  	GC5/

3HHQKK	s2   A/G A$H" 	H"3HH"	I(+3I##I(__main__)levelstream)rc   N)r1   rp   r2   r*   r3   r*   r4   rp   r5   Optional[Callable]returnNone)NNFN)rA   rp   rB   Optional[str]ru   zOptional[int]r   boolr   r   r   Dict[str, Any])N)rL   rp   rA   rp   rB   r   r   r   )r   r   r   floatr   r   )r   r*   r   r   r   r   )r   zargparse.ArgumentParser)r   r   )r   )rA   rp   r   r*   r   r   )@__doc__
__future__r   r   r   r   loggingr,   rk   typingr   r   r   r   r   core.kb.contractsr	   r
   r   r   r   core.kb.platform_registryr   r   core.kb.fetcherr   r   r   r   r   core.kb.extractorr   core.kb.chunkerr   r   core.kb.embedderr   r   core.kb.qdrant_storer   core.kb.pg_storer   r   r   r   r   r    r!   r"   	getLogger__name__r/   r:   r   r   r{   r|   r   r   r   r   basicConfigINFOr-    r   r9   <module>r      s  . #     
  6 6  C  , 3 > /	 	 	 
		8	$ #'"@
"@"@ "@ 	"@
 !"@ 
"@V "&#,0ttt t 	t
 *t t|	 "&F	FF F 	FZ&A#L
B.b zGgll3::>	G r   