
    i\c                        S r SSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKrSSK	r	SSK
J
r
Jr  SSKJr  SSKJrJrJrJrJr  \" S5      r\S-  S-  r\R,                  R/                  S\" \S	-  S
-  5      5        \R2                  " \R4                  SSS9  \R6                  " S5      rSrSrSr\R@                  RC                  SS5      r"Sr#S\S\\   4S jr$S\S\4S jr%S r& S8S\\   S\'S\\\\4      4S jjr(S9S\S\\   SS4S jjr)S\S\\\\4      4S jr*\4S \S!\'S\\   4S" jjr+S#\\   S\\\,      4S$ jr-\4S \S%\'S\\,   4S& jjr.S\S'\'S\'4S( jr/S:S) jr0S\S*\\   S+\\\,      S,\\\4   S\'4
S- jr1S. r2S/\\\4   S0\'SS4S1 jr3   S;S2\\\\4      S3\4S4\4S\\\4   4S5 jjr5S6 r6\7S7:X  a  \6" 5         gg)<aQ  
YouTube Transcript Ingester
=============================
Takes a list of YouTube video URLs/IDs, fetches transcripts via youtube-transcript-api,
chunks them into ~500-token segments, embeds via Gemini, stores in Qdrant
'research_reports' collection (768-dim), and writes KG entities.

This is the INGESTION side of the YouTube Intel Pipeline. It reads from
the youtube_intel PostgreSQL table (populated by youtube_watchlist_scraper.py)
or accepts explicit video IDs/URLs from the CLI.

Storage targets:
    - Qdrant:      'research_reports' collection (768-dim cosine, nomic-embed / Gemini)
    - PostgreSQL:  youtube_intel.transcript_ingested = TRUE (marks completion)
    - KG entities: E:/genesis-system/KNOWLEDGE_GRAPH/entities/youtube_intel_YYYY-MM-DD.jsonl

Usage:
    # Ingest all pending (not yet ingested) videos from youtube_intel table
    python youtube_transcript_ingester.py --pending

    # Ingest Watch Later priority first (Kinan flagged = important)
    python youtube_transcript_ingester.py --pending --source watchlater

    # Ingest specific video URLs or IDs
    python youtube_transcript_ingester.py --urls https://youtu.be/dQw4w9WgXcQ abc123xyz

    # Ingest from a JSON file of video IDs (output from scraper)
    python youtube_transcript_ingester.py --from-json scraped_videos.json

    # Dry run (fetch transcripts, show stats, don't store)
    python youtube_transcript_ingester.py --pending --dry-run

    # Skip Qdrant, only write KG entities (useful for testing)
    python youtube_transcript_ingester.py --pending --no-qdrant

Author: Genesis System
Version: 1.0.0
Date: 2026-02-23
    N)datetimetimezone)Path)AnyDictListOptionalTuplezE:/genesis-systemKNOWLEDGE_GRAPHentitiesdatazgenesis-memoryz1%(asctime)s [%(levelname)s] %(name)s: %(message)sz%Y-%m-%d %H:%M:%S)levelformatdatefmtyt_transcript_ingesterresearch_reportsi     GEMINI_API_KEY'AIzaSyCT_rx0NusUJWoqtT7uxHAKEfHo129SJb8zmodels/text-embedding-004	url_or_idreturnc                     [         R                  " SU 5      (       a  U $ / SQnU H3  n[         R                  " X 5      nU(       d  M"  UR                  S5      s  $    g)zKExtract 11-char YouTube video ID from URL or return as-is if already an ID.z[a-zA-Z0-9_-]{11})zL(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})z"youtube\.com/v/([a-zA-Z0-9_-]{11})z'youtube\.com/shorts/([a-zA-Z0-9_-]{11})   N)re	fullmatchsearchgroup)r   patternspatms       8E:/genesis-system/scripts/youtube_transcript_ingester.pyextract_video_idr"   S   sS    	||()44H
 IIc%1771:      video_idc                     SU  3$ )Nz https://www.youtube.com/watch?v= )r$   s    r!   build_video_urlr'   c   s    -hZ88r#   c                  V    SSK n SSKJn  U R                  " S0 UR	                  5       D6$ )zConnect to Elestio PostgreSQL.r   N)PostgresConfigr&   )psycopg2elestio_configr)   connectget_connection_params)r*   r)   s     r!   get_db_connectionr.   k   s%    -EnBBDEEr#   sourcelimitc           
         S/n/ nU(       a"  UR                  S5        UR                  U5        SR                  U5      nSU S3nUR                  U5        U R                  5        nUR                  Xd5        UR                   Vs/ s H  oS   PM	     n	nUR                  5        V
s/ s H  n
[        [        X5      5      PM     sn
sSSS5        $ s  snf s  sn
f ! , (       d  f       g= f)zp
Fetch pending (not yet ingested) records from youtube_intel.
Watch Later items (position priority) come first.
ztranscript_ingested = FALSEzsource = %sz AND zt
        SELECT video_id, title, channel_name, source, position, video_url
        FROM youtube_intel
        WHERE z
        ORDER BY
            CASE source WHEN 'watch_later' THEN 0 ELSE 1 END,
            COALESCE(position, 9999) ASC
        LIMIT %s
    r   N)appendjoincursorexecutedescriptionfetchalldictzip)connr/   r0   
conditionsparamswheresqlcurdcolsrows              r!   get_pending_recordsrC   r   s     00JF-(fLL$E g C MM%	#C !oo.o!o.03?S^$? 
.? 
s*   ' CCC)CC
C
C+c                     U(       a  SnX4nOSnU4nU R                  5        nUR                  X45        SSS5        U R                  5         g! , (       d  f       N= f)z5Mark a video as transcript_ingested in youtube_intel.z
            UPDATE youtube_intel
            SET transcript_ingested = TRUE
            WHERE video_id = %s AND source = %s
        zu
            UPDATE youtube_intel
            SET transcript_ingested = TRUE
            WHERE video_id = %s
        N)r4   r5   commit)r:   r$   r/   r>   r<   r?   s         r!   mark_ingestedrF      sS    
 #
 	#C  
KKM 
s   A
Ac           	          SSK Jn  SSKJn  U" 5       nSnSn/ SQS4 He  n U(       a  UR                  XS9nOAUR                  U 5      nU H*  n UR                  XR                  /S9nUR                  n  O   U(       a    OMg     U(       d  gU" 5       n	U	R                  U5      n
/ nU H  nUR                  [        US	5      (       a  UR                  O[        UR!                  S	S
5      5      [        US5      (       a  UR"                  OUR!                  SS5      [        US5      (       a  UR$                  OUR!                  SS5      S.5        M     U U
U['        U
R)                  5       5      US.$ ! [         a    [
        R                  S5         gf = f! [         a     GM]  f = f! [         a    Uc     GM8   GM  f = f)z
Fetch transcript using youtube-transcript-api.

Returns dict with keys: video_id, transcript (str), language, word_count,
segments (list of {text, start, duration}).
Returns None if no transcript is available.
r   )YouTubeTranscriptApi)TextFormatterzPyoutube-transcript-api is not installed.
Fix: pip install youtube-transcript-apiNen)rJ   zen-USzen-AUzen-GB)	languagestext startduration)rL   rN   rO   )r$   
transcriptlanguage
word_countsegments)youtube_transcript_apirH   !youtube_transcript_api.formattersrI   ImportErrorloggererrorfetchlistlanguage_code	Exceptionformat_transcriptr2   hasattrrL   strgetrN   rO   lensplit)r$   rH   rI   ytttranscript_datarQ   	lang_list	availablet	formatter	full_textrS   entrys                r!   fetch_transcriptrk      s   ?C 
 COH 8>		"%))H)"J  HHX.	"A!*-))HHY)*Z#$??	 #   ?* I++O<IH ")%"8"8EJJc%))FTVBW>X$+E7$;$;U[[7TUAV*1%*D*D%))T^`aJb
 	 ! )//+, _  6	
 0 % ! !  	 	sF   E7 -F.&F7F.7FF
F+&F.*F++F..GGrL   	max_wordsc                 :   U R                  5       nU(       d  U R                  5       (       a  U /$ / $ / n[        S[        U5      U5       H@  nSR	                  X$XA-    5      nUR                  5       (       d  M/  UR                  U5        MB     U=(       d    U SS /$ )zn
Split transcript text into chunks of approximately max_words words.
Each chunk will be its own Qdrant point.
r    Ni  )rb   striprangera   r3   r2   )rL   rl   wordschunksichunks         r!   
chunk_textru      s    
 JJLEv-2-F1c%j),1=12;;==MM%  - "d5Dk]"r#   textsc           	      L    SSK Jn  UR                  [        S9  / nU  H,  nUR	                  [
        USS9nUR                  US   5        M.     U$ ! [         aH  n[        R                  SU S35        U  Vs/ s H  n[        U[        S	9PM     Os  snf sns SnA$ SnAff = f)
z
Generate 768-dim embeddings using Gemini text-embedding-004.
Falls back to deterministic hash-based vectors if Gemini is unavailable.
r   N)api_keyretrieval_document)modelcontent	task_type	embeddingzGemini embedding failed (z%). Using hash-based fallback vectors.)dim)google.generativeaigenerativeai	configurer   embed_contentGEMINI_EMBEDDING_MODELr2   r\   rW   warning_hash_vector
VECTOR_DIM)rv   genaivectorsrL   resulterg   s          r!   embed_textsr     s    
@+/D((,. ) F
 NN6+./   @21#5Z[\9>?AQJ/??@s*   AA 
B#B8BBB#B#r~   c                 P   [         R                  " U R                  S5      5      R                  5       nUS-  [	        U5      -  S-   nX#-  n[
        R                  " SU S3USUS-   5      n[        S U 5       5      S-  nUS	:X  a  S
/U-  $ U Vs/ s H  owU-  PM	     sn$ s  snf )zIDeterministic pseudo-vector from SHA-512 hash. Last-resort fallback only.utf-8   r   >fNc              3   *   #    U  H	  oU-  v   M     g 7fNr&   ).0vs     r!   	<genexpr>_hash_vector.<locals>.<genexpr>#  s     %f1ufs   g      ?r   g        )hashlibsha512encodedigestra   structunpacksum)rL   r~   hrepeatexpandedvaluesnormr   s           r!   r   r     s    t{{7+,335AAgQ1$FzH]]Qse1:x	#'':;F%f%%,Dqyus{$%fHf%%%s   B#	chunk_idxc                     U  SU 3R                  S5      n[        R                  " U5      R                  5       n[	        USS S5      S-  $ )z<Deterministic positive 64-bit integer ID for a Qdrant point._r   N   l    )r   r   md5	hexdigestint)r$   r   rawr   s       r!   	_point_idr   )  sL    Ja	{
#
*
*7
3CC""$Aq"vr?///r#   c                 j   SSK JnJn  U R                  5       R                   Vs1 s H  o3R
                  iM     nn[        U;  aL  U R                  [        U" [        UR                  S9S9  [        R                  S[         S[         S35        g
[        R                  S[         S	35        g
s  snf )z:Create 'research_reports' collection if it does not exist.r   )VectorParamsDistance)sizedistance)collection_namevectors_configzCreated Qdrant collection 'z' (z-dim cosine).zQdrant collection 'z' already exists.N)qdrant_client.modelsr   r   get_collectionscollectionsnameQDRANT_COLLECTIONcreate_collectionr   COSINErW   infodebug)clientr   r   cexistings        r!   ensure_qdrant_collectionr   4  s    ; & 6 6 8 D DE D1 DHE(  -'Z(//R 	! 	
 	12C1DC
|S`ab*+<*==NOP Fs   B0rr   r   metadatac                    SSK Jn  / nUR                  SS5      n[        [	        X#5      5       H  u  nu  pUR                  U" [        X5      U
UU[        U5      U	UR                  SS5      UR                  SS5      UUR                  S[        U5      5      UR                  S	S
5      UR                  SS5      [        R                  " [        R                  5      R                  5       SS.S95        M     U(       a9  U R                  [        US9  [         R#                  S[        U5       SU S35        [        U5      $ )zN
Upsert transcript chunks as Qdrant points.
Returns count of points upserted.
r   )PointStructr/   youtubetitlerM   channel_name	video_urlrQ   rJ   rR   youtube_intel)r$   chunk_indextotal_chunksrL   r   r   r/   r   rQ   rR   ingested_atcollection_type)idvectorpayload)r   pointsz	Upserted z chunks for z into Qdrant.)r   r   r`   	enumerater9   r2   r   ra   r'   r   nowr   utc	isoformatupsertr   rW   r   )r   r$   rr   r   r   r   r   r/   rs   rt   r   s              r!   upsert_to_qdrantr   C  s    1F\\(I.F'F(<=?EX) (#$$'K!%\\'26$,LL$D$!)k?8;T!U (Z >"*,,|Q"?#+<<#=#G#G#I'6	
 >, &7GyV\(=QRv;r#   c                  ^    SSK Jn   SSKJn  U" 5       nU " UR                  UR
                  S9$ )z1Get configured Qdrant client from elestio_config.r   )QdrantClient)QdrantConfig)urlrx   )qdrant_clientr   r+   r   r   rx   )r   r   cfgs      r!   get_qdrant_clientr   o  s$    *+
.CCGGS[[99r#   recordchunks_countc                    [         R                  " [        R                  5      R	                  S5      n[
        SU S3-  n[
        R                  SSS9  SU S    S[        [        R                  " 5       5       3S	U S   U R                  S
S5      U R                  SS5      U R                  S[        U S   5      5      U R                  SS5      U R                  S5      U R                  SS5      U R                  SS5      U[        [         R                  " [        R                  5      R                  5       SSU R                  SS5      /S.n[        USSS9 nUR                  [        R                   " USS9S-   5        SSS5        ["        R%                  SU S    S UR&                   35        g! , (       d  f       N7= f)!zi
Write a KG entity JSONL entry for this video to
KNOWLEDGE_GRAPH/entities/youtube_intel_YYYY-MM-DD.jsonl
z%Y-%m-%dyoutube_intel_z.jsonlT)parentsexist_ok	yt_intel_r$   r   youtube_video_intelr   rM   r   r   r/   unknownpositionrQ   rJ   rR   r   r   intel)r   typer$   r   r   r   r/   r   rQ   rR   chunks_storedqdrant_collectionr   tagsar   encodingF)ensure_ascii
NzKG entity written for z -> )r   r   r   r   strftimeKG_ENTITIES_DIRmkdirr   timer`   r'   r   r   openwritejsondumpsrW   r   r   )r   r   todayentity_fileentityr   s         r!   write_kg_entityr   |  s~   
 LL&//
;E!nUG6$BBK$6 &,-Qs499;/?.@A%:&GR(

>26ZZ_VJ=O-PQ**Xy1JJz*JJz40jjq1%.||HLL1;;=GVZZ"%=>F" 
k3	1Q	

66=> 
2 LL)&*<)=T+BRBRASTU 
2	1s   (F44
Gvideo_records
use_qdrantdry_runc                 *   [        U 5      SSSSS/ S.nSnU(       a  U(       d   [        5       n[        U5        USL nU(       a  U(       d
  [        5       n [        U SS9 GHk  u  pU	R                  S	5      n
U
(       d  [        R                  S
U S35        M9  U	R                  SS5      nU	R                  SS5      n[        R                  SU S[        U 5       SU
 SU SU	R                  SS5       SUSS  35        Sn [        U
5      nU(       d@  [        R                  SU
 S35        US==   S-  ss'   US   R                  U
USS.5        M  US ==   S-  ss'   US!   nUS"   n[        R                  S#U S$U 35        U(       a  US   R                  U
US%UUS&.5        GMK  [        US'   5      n[        R                  S([        U5       S)35        SnU(       aW  U(       aP  U(       aI   [        U5      n0 U	EUUS*.En[        XZUUU5      nUS+==   U-  ss'   [        R                  S,U S-35         0 U	EUUS*.En[!        UU=(       d    [        U5      S/9  US0==   S-  ss'   U(       a   [#        X:US:w  a  UOSS29  US   R                  U
US5UUUS6.5        U[        U 5      :  d  GMU  [$        R&                  " S75        GMn     U(       a  U(       a  UR)                  5         U$ ! [         a&  n[        R                  SU S35        Sn SnAGNSnAff = f! [         a&  n[        R                  SU
 SU 35         SnAGN@SnAff = f! [         a&  n[        R                  S.U
 SU 35         SnAGNFSnAff = f! [         a&  n[        R                  S1U
 SU 35         SnAGNFSnAff = f! [         a&  n[        R                  S3U
 S4U 35         SnAGN_SnAff = f! U(       a  U(       a  UR)                  5         f f f = f)8a  
Core ingestion loop.

Args:
    video_records: List of dicts with at minimum 'video_id'. May also include
                   title, channel_name, source, position, video_url.
    use_qdrant:    Whether to embed and store in Qdrant.
    dry_run:       If True, fetch transcripts but do not store anything.
    conn:          Open PostgreSQL connection. If None, opens its own.

Returns:
    Summary dict.
r   )totaltranscripts_fetchedno_transcriptqdrant_chunkskg_entitiesfailedvideo_resultsNzQdrant init failed: z. Will skip vector storage.Fr   )rN   r$   zRecord z missing video_id, skipping.r   rM   r/   r   [/z] Ingesting z (z pos=r   ?z): <   z  Transcript fetch error for z: z  No transcript available for .r  r  )r$   r   statusr  rR   rQ   z  Transcript: z words, lang=
dry_run_ok)r$   r   r  rR   rQ   rP   z  Chunked into z
 segments.)rQ   rR   r  z
  Qdrant: z chunks stored.z  Qdrant upsert failed for )r   r  z  KG entity write failed for )r/   z  Failed to mark z as ingested: success)r$   r   r  rR   rQ   rr   g?)ra   r   r   r\   rW   rX   r.   r   r`   r   r   rk   r2   ru   r   r   r   rF   r   sleepclose)r   r   r   r:   resultsr   r   	owns_connrs   r   r$   r   r/   rd   rR   rQ   rr   chunks_upsertedr   meta	kg_records                        r!   ingest_videosr    sR   ( ]# G M'	-/M$]3
 I "f"=:IAzz*-H+GHIJJw+EZZ)4FKKA3aM*+<z B85J!< =SsN #OP"28"< #!?zKL(A-((// ("-1 
 )*a/*(6J&z2HKK.M(LM(// ("*", (1     =>FKK/#f+jAB  OmP)&1G $,&0D
 '7%$'O O,?,KK*_,=_ MN
	P (",	
  	8V3v;W&!+&
 T!$6YCV\`a O$++$#($)-  3}%%

3C ;H JJLNa  	LL/s2MNOJ	4  P!>xj1#NOOP^ ! PLL#>xj1#!NOOP  P!>xj1#NOOP ! TNN%6xjqc#RSST" JJL 9s   K1 B,O0 L$C$O0 4AM=2N
/O0 7N=	(O0 6O0 1
L!;LL!$
M.M	O0 MO0 
N!N<O0 NO0 

N:N5/O0 5N::O0 =
O-O("O0 (O--O0 0"Pc                     [         R                  " S[         R                  S9n U R                  SS9nUR	                  SSSS9  UR	                  S	S
SSS9  UR	                  SSSS9  U R	                  SSS/SS9  U R	                  S[
        SSS9  U R	                  SSSS9  U R	                  SSSS9  U R                  5       n/ nUR                  (       ar  [        R                  S5         [        5       n[        XBR                  UR                  S 9nUR                  5         [        R                  S#['        U5       S$35        GOUR(                  (       a  UR(                   HO  n[+        U5      nU(       a"  UR-                  US%S%S&[/        U5      S'.5        M7  [        R1                  S(U 35        MQ     [        R                  S)['        U5       S*35        GOMUR2                  (       Ga;  [5        UR2                  5      nUR7                  5       (       d.  [        R!                  S+U 35        ["        R$                  " S"5        [9        US,S-9 n	[:        R<                  " U	5      n
S S S 5        [?        W
[@        5      (       a  U
nOs[?        U
[B        5      (       a3  S.U
;   a-  U
S.    H#  nUR-                  US%S%S/[/        U5      S'.5        M%     O+[        R!                  S05        ["        R$                  " S"5        [        R                  S1['        U5       S2URD                   S335        U(       d9  [        R1                  S45        [G        [:        RH                  " S5S5S5S6.5      5        g [K        UURL                  (       + URN                  S79nURQ                  5        VVs0 s H  u  pUS8:w  d  M  X_M     nnn[G        [:        RH                  " US9S:95        [        R                  S;US<    S=US>    S?US@    SAUSB    SC3	5        g ! [         a9  n[        R!                  S!U 35        ["        R$                  " S"5         S nAGN S nAff = f! , (       d  f       GN= fs  snnf )DNzGIngest YouTube transcripts into Genesis Qdrant 'research_reports' + KG.)r6   formatter_classT)requiredz	--pending
store_truezIIngest all videos from youtube_intel table with transcript_ingested=FALSE)actionhelpz--urls+	URL_OR_IDz,Specific YouTube URLs or video IDs to ingest)nargsmetavarr  z--from-jsonFILEzOJSON file with list of video records (output from youtube_watchlist_scraper.py))r!  r  z--sourcewatch_historywatch_laterz9Filter --pending by source (watch_history or watch_later))choicesr  z--limitr   z8Max videos to ingest when using --pending (default: 500))r   defaultr  z	--dry-runz:Fetch transcripts but do not store to Qdrant or PostgreSQLz--no-qdrantz.Skip Qdrant embedding (only write KG entities)z2Loading pending videos from youtube_intel table...)r/   r0   z Failed to load pending records: r   zFound z pending videos.rM   manual)r$   r   r   r/   r   z!Could not extract video ID from: z	Prepared z videos from CLI arguments.zJSON file not found: r   r   	video_idsjson_importz>JSON file must be a list of records or have a 'video_ids' key.zLoaded z videos from r  zNo videos to ingest.r   )r  r  r  )r   r   r   r     )indentzIngestion complete: r  r	  r  z transcripts, r  z Qdrant chunks, r  z KG entities.))argparseArgumentParserRawDescriptionHelpFormatteradd_mutually_exclusive_groupadd_argumentr   
parse_argspendingrW   r   r.   rC   r/   r0   r  r\   rX   sysexitra   urlsr"   r2   r'   r   	from_jsonr   existsr   r   load
isinstancerZ   r8   r   printr   r  	no_qdrantr   items)parsersource_groupargsr   r:   r   r   vid	json_pathr   r   r  kr   summarys                  r!   mainrD  9  s   $$] <<F 666ELX  
 ;	   ^    -0H  
 G	   I  
 =   D +-M||HI	$&D/[[

M JJL 	fS/00@AB	I"9-C$$ #$&&!0!5&  !B9+NO # 	iM 233NOP	(	!!LL0<=HHQK)g.!99Q<D / dD!! Md##t(;K($$ #$&+!0!5&  ) LLYZHHQKgc-01y~~>NaPQ-.djj1QQRSTU #~~%G !(H13GtqtGH	$**WQ
'(
KK
w'<=>a@P?QQ_?#
$$4W]5K4LM	[y  	LL;A3?@HHQKK	2 /.@ Is0   +8P QQ/Q
Q.QQ
Q__main__)Nr   r   )r   N)TFN)8__doc__r,  r   r   loggingosr   r   r3  r   r   r   pathlibr   typingr   r   r   r	   r
   GENESIS_ROOTr   pathinsertr_   basicConfigINFO	getLoggerrW   r   r   CHUNK_MAX_TOKENSenvironr`   r   r   r"   r'   r.   r   rC   rF   rk   ru   floatr   r   r   r   r   r   r   boolr  rD  __name__r&   r#   r!   <module>rV     s  &P     	 	  
  '  3 3
 '(!22Z? 3|f,/??@ A   
,,>
 
		3	4
 ' 
  02[\4    9c 9c 9F 69@3-@/2@	$sCx.@@# x}  4@s @xS#X'? @N ,< #S #S #S	 #(@tCy @T$u+%6 @, (2 	&s 	& 	&d5k 	&0 0 0 0Q)) I) $u+	)
 38n) 	)X:VDcN V# V$ VL 		RS#X'RR R
 
#s(^Rr~B zF r#   