
    ic                     R   d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m	Z	m
Z
mZ ddlmZ ddlmZmZmZmZmZ ej(                  j+                  dd        ej,                  ej.                  dd	        ej0                  d
      Zej4                  j7                  dd      ZdZ ed      ZdZdZ d Z!d Z"dee#   de$fdZ%dee#ef   de&fdZ'd5dee#   dee#   fdZ(de#deee#ef      fdZ)de#deee#ef      fdZ*	 d6de#de#de+de,deee#ef      f
d Z-de#d!e#d"ee#ef   fd#Z.d7d$e#d%e,dee#   fd&Z/d'ee#   deee0      fd(Z1d8d$e#d)e,dee0   fd*Z2de#d+e,de,fd,Z3dee#ef   fd-Z4	 	 	 d9dee#   d.e&d/e&d0e&dee#ef   f
d1Z5dee#   dee#ef   fd2Z6d3 Z7e8d4k(  r e7        yy):a  
YouTube Transcript Extractor
=============================
Extracts transcripts for videos identified by the watch history fetcher.

Two-tier extraction:
    1. PRIMARY: youtube-transcript-api (free, no API key needed)
    2. FALLBACK: Supadata.ai API (paid, handles edge cases, AI-generated transcripts)

Stores results in:
    - PostgreSQL: yt_transcripts table (structured storage)
    - Qdrant: Semantic vector embeddings for similarity search
    - Local JSON: backup copies for offline access

Usage:
    # Extract transcripts for specific video IDs
    python youtube_transcript_extractor.py --video-ids dQw4w9WgXcQ abc123 xyz789

    # Extract all videos from today's watch history (reads from PostgreSQL)
    python youtube_transcript_extractor.py --today

    # Extract from a date range
    python youtube_transcript_extractor.py --date 2026-02-15

    # Dry run (don't store, just print)
    python youtube_transcript_extractor.py --today --dry-run

Author: Genesis System
Version: 1.0.0
    N)datetime	timedeltatimezone)Path)ListDictAnyOptionalTuplez)/mnt/e/genesis-system/data/genesis-memoryz1%(asctime)s [%(levelname)s] %(name)s: %(message)sz%Y-%m-%d %H:%M:%S)levelformatdatefmtyt_transcript_extractorSUPADATA_API_KEY#sd_4b8009caa1fd18698793e2a86117b07czhttps://api.supadata.ai/v1z./mnt/e/genesis-system/data/youtube_transcriptsyt_transcriptsztext-embedding-3-smallc                  V    ddl } ddlm}  | j                  di |j	                         S )z/Get PostgreSQL connection using Elestio config.r   N)PostgresConfig )psycopg2elestio_configr   connectget_connection_params)r   r   s     =/mnt/e/genesis-system/scripts/youtube_transcript_extractor.pyget_db_connectionr   J   s'    -8EnBBDEE    c                     | j                         5 }|j                  d       | j                          ddd       t        j	                  d       y# 1 sw Y   xY w)z1Create the transcripts table if it doesn't exist.a  
            CREATE TABLE IF NOT EXISTS yt_transcripts (
                id SERIAL PRIMARY KEY,
                video_id VARCHAR(20) NOT NULL,
                transcript TEXT,
                language VARCHAR(10) DEFAULT 'en',
                word_count INTEGER,
                extraction_method VARCHAR(20) DEFAULT 'youtube_api',
                extracted_topics TEXT[],
                extracted_insights JSONB,
                created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
                UNIQUE(video_id)
            );

            CREATE INDEX IF NOT EXISTS idx_yt_transcript_video
                ON yt_transcripts(video_id);
            CREATE INDEX IF NOT EXISTS idx_yt_transcript_date
                ON yt_transcripts(created_at);
        Nz#Transcript schema verified/created.)cursorexecutecommitloggerinfo)conncurs     r   ensure_transcript_schemar%   Q   sM    	 #  	& 	)* KK56+ s   "AA	video_idsreturnc                     |s
t               S | j                         5 }|j                  d|f       |j                         D ch c]  }|d   	 c}cddd       S c c}w # 1 sw Y   yxY w)z/Check which video IDs already have transcripts.z<SELECT video_id FROM yt_transcripts WHERE video_id = ANY(%s)r   N)setr   r   fetchall)r#   r&   r$   rows       r   get_existing_transcriptsr,   k   sd    u	 2#JL	
 #&,,.13A12 2
 22 2s   &A AA A  A)recordc                    	 | j                         5 }|j                  d|d   |d   |j                  dd      |j                  dd      |j                  dd	      |j                  d
g       t        j                  |j                  di             f       | j                          	 ddd       y# 1 sw Y   yxY w# t        $ r8}t        j                  d|d    d|        | j                          Y d}~yd}~ww xY w)z(Store a transcript record in PostgreSQL.a  
                INSERT INTO yt_transcripts
                    (video_id, transcript, language, word_count,
                     extraction_method, extracted_topics, extracted_insights)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (video_id) DO UPDATE SET
                    transcript = EXCLUDED.transcript,
                    language = EXCLUDED.language,
                    word_count = EXCLUDED.word_count,
                    extraction_method = EXCLUDED.extraction_method,
                    extracted_topics = EXCLUDED.extracted_topics,
                    extracted_insights = EXCLUDED.extracted_insights
            video_id
transcriptlanguageen
word_countr   extraction_methodunknownextracted_topicsextracted_insightsNTzFailed to store transcript for : F)
r   r   getjsondumpsr    	Exceptionr!   errorrollback)r#   r-   r$   es       r   store_transcriptr@   w   s    [[] 	cKK  z"|$

:t,

<+

.	:

-r2

6::&:B?@* KKM/	 	 	0  6vj7I6J"QCPQs5   B; BB/&B; /B84B; 8B; ;	C<.C77C<date_strc                    |r5t        j                  |d      j                  t        j                        }n#t        j
                  t        j                        }|j                  dddd      }|t        d      z   }| j                         5 }|j                  d||f       |j                         D cg c]  }|d   	 c}cddd       S c c}w # 1 sw Y   yxY w)	z2Get video IDs from watch history for a given date.z%Y-%m-%d)tzinfor   )hourminutesecondmicrosecond   )daysz
            SELECT DISTINCT video_id FROM yt_watch_history
            WHERE watched_at >= %s AND watched_at < %s
            ORDER BY watched_at
        N)
r   strptimereplacer   utcnowr   r   r   r*   )r#   rA   targetstartendr$   r+   s          r   get_video_ids_for_daterQ      s    ""8Z8@@@Uhll+NN!A1NEE
)#
#C	 2#  S\		
 #&,,.13A12 2 22 2s   'C6CCCCr/   c           	         	 ddl m} ddlm} 	  |       }d}d}	 |j                  | g d      }|st
        j                  d	|  d
       y |       }|j                  |      }	g }
|D ]  }|
j                  t        |d      r|j                  nt!        |j#                  dd            t        |d      r|j$                  n|j#                  dd      t        |d      r|j&                  n|j#                  dd      d        t)        |	j+                               }| |	|
||ddS # t        $ r t
        j                  d       Y yw xY w# t        $ rj 	 |j                  |       }|rB|D ]=  }	 |j                  | |j                  g      }|j                  } n# t        $ r Y ;w xY w n# t        $ r Y nw xY wY w xY w# t        $ r%}t
        j                  d|  d|        Y d}~yd}~ww xY w)z
    Extract transcript using the youtube-transcript-api package.
    Free, no API key needed, works for most videos with captions.
    r   )YouTubeTranscriptApi)TextFormatterzSyoutube-transcript-api required. Install with:
  pip install youtube-transcript-apiNr2   )r2   zen-USzen-AUzen-GB)	languageszNo transcript available for z via youtube-transcript-apitext rO   duration)rV   rO   rX   youtube_transcript_apir/   r0   segmentsr1   r3   r4   z"youtube-transcript-api failed for r8   )rY   rS   !youtube_transcript_api.formattersrT   ImportErrorr!   r=   fetchr<   listlanguage_codewarningformat_transcriptappendhasattrrV   strr9   rO   rX   lensplit)r/   rS   rT   ytt_apitranscript_datar1   transcript_listt	formatter	full_textr[   entryr3   r?   s                 r   extract_via_youtube_apiro      s   
?C7&( 	%mmH@ambO" NN9(C^_` "O	//@	 $ 	EOO&-eV&<

#eiiPVXZF[B\(/w(?UYYwXYEZ.5eZ.HENNeiiXbdeNf 	 *+
 !#  $!9
 	
g  3	
    	"),,x"8", %%.5mmHQRQ`Q`Pam.bO'(H!( %$%%  	X  ;H:RsKLs   D% F? E	 F? CF? %EE		F<F),*FF)	F$!F)#F$$F)(F<)	F52F<4F55F<8F? ;F<<F? ?	G-G((G-c                    	 ddl }t        rt        j                  d      r	 t        dd}d|  dd	d
}	 |j                  t         d||d      }|j                  dk(  r0|j                         }|j                  d      }|rt        || |      S |j                  dk(  rt        j                  d|         y|j                  dk(  rt        j                  d|         y|j                  dk(  r+t        j                  d       t        j                  d       y|j                  dk7  r3t        j                  d|j                   d|j                  dd         y|j                         }|j                  dd      }|st        j                  d|         yt        |t               rdj#                  d |D              }|}	n|}g }	t%        |j'                               }
|j                  dd       }| ||	||
d!d"S # t        $ r t        j                  d       Y yw xY w# t(        $ r%}t        j                  d#|  d|        Y d}~yd}~ww xY w)$zv
    Extract transcript using Supadata.ai API.
    Paid service, handles edge cases, can generate AI transcripts.
    r   Nz/requests library required: pip install requestssd_zapplication/json)z	x-api-keyzContent-Typez https://www.youtube.com/watch?v=autotrue)urlmoderV   z/transcript<   )headersparamstimeout   jobId   z%Supadata: transcript unavailable for i  zSupadata: video not found i  z"Supadata: rate limited, waiting...      zSupadata error r8   contentrW   z$Supadata returned empty content for  c              3   @   K   | ]  }|j                  d d        ywrV   rW   Nr9   ).0segs     r   	<genexpr>z'extract_via_supadata.<locals>.<genexpr>?  s       !(+#!   langr2   supadatarZ   zSupadata failed for )requestsr]   r!   r=   r   
startswithr9   SUPADATA_BASE_URLstatus_coder:   _poll_supadata_jobra   timesleeprV   
isinstancer_   joinrf   rg   r<   )r/   r   rw   rx   responsedatajob_idr   rm   r[   r3   r1   r?   s                r   extract_via_supadatar      s^   

 /::5A &*G 2(<F?<< !-	   
 3&==?DXXg&F)&(GDD3&NNB8*MN3&NN7zBC3&NN?@JJqM3&NN_X-A-A,B"X]]SWTWEXDYZ[}}((9b)NNA(LM gt$ !/6! I HIH*+
88FD) !#  $!+
 	
O  FG`  +H:Rs;<sO   G< AH  'H  8'H   9H  AH  <H  A"H  <HH 	I)I		Ir   rw   max_attemptsc           	         ddl }t        |      D ]  }t        j                  d       	 |j	                  t
         d|  |d      }|j                  dk(  r|j                         }|j	                  d      }|d	v rl|d
k(  rt        j                  d|         y|j	                  dd      }	t        |	t              rdj                  d |	D              }
n|	}
||
t        |	t              r|	ng |j	                  dd      t        |
j                               ddc S |j                  dk(  r t        j                  d|        y# t        $ r#}t        j                  d|        Y d}~`d}~ww xY w)z'Poll Supadata async job until complete.r   NrH   z/transcript/   )rw   ry   r~   status)queuedactivefailedzSupadata job failed for r   rW   r   c              3   @   K   | ]  }|j                  d d        ywr   r   )r   ss     r   r   z%_poll_supadata_job.<locals>.<genexpr>q  s     (Lqvr):(Lr   r   r2   r   rZ   rz   zJob poll error: zSupadata job timed out for )r   ranger   r   r9   r   r   r:   r!   ra   r   r_   r   rf   rg   r<   )r   r/   rw   r   r   attemptrespr   r   r   rm   r?   s               r   r   r   X  su    & "3

1 	3<<$%\&:   D
 3&yy{(+11X%NN%=hZ#HI((9b1gt, #(LG(L LI 'I !)"++5gt+D" $ 6"%ioo&7"8)3  !!S( )?"3H NN0
;<	  	3NN-aS122	3s+   AE	<E	A>E	E			E5E00E5r0   metadatac                    	 ddl m} ddlm}m}m}m} ddlm} 	  |       }	 ||	j                  |	j                        }
|
j                         j                  D cg c]  }|j                    }}t"        |vrE|
j%                  t"         |d|j&                  	      
       t        j)                  dt"                t+        |d      }t-        |      }|st        j                  d|  d       yg }t/        t1        ||            D ]f  \  }\  }}t3        | |      }|j5                   |||| |||j7                  dd      |j7                  dd      |j7                  dd      dd             h |
j9                  t"        |       t        j)                  dt;        |       d|  d       y# t        $ r t        j                  d       Y yw xY wc c}w # t<        $ r%}t        j?                  d|  d|        Y d}~yd}~ww xY w)z}
    Embed transcript chunks into Qdrant for semantic search.
    Uses Gemini embedding or falls back to simple hashing.
    r   )QdrantClient)VectorParamsDistancePointStructCollectionStatus)QdrantConfigz7Qdrant client not available. Skipping vector embedding.F)rt   api_key   )sizedistance)collection_namevectors_configzCreated Qdrant collection:   )	max_wordszNo embeddings generated for z. Skipping Qdrant.titlerW   channel_name
watched_atyoutube_pipeline)r/   chunk_indexrV   r   channelr   source)idvectorpayload)r   pointsz	Embedded z chunks for z into Qdrant.TzQdrant embedding failed for r8   N) qdrant_clientr   qdrant_client.modelsr   r   r   r   r   r   r]   r!   ra   rt   r   get_collectionscollectionsnameQDRANT_COLLECTIONcreate_collectionCOSINEr"   _chunk_text_generate_embeddings	enumeratezip_generate_point_idrc   r9   upsertrf   r<   r=   )r/   r0   r   r   r   r   r   r   r   configclientcr   chunksvectorsr   ichunkr   point_idr?   s                        r   embed_transcript_to_qdrantr     s   
	.	
 	
 	0
1&**fnnE (.'='='?'K'KL!qvvLLK/$$ 1+%__  %  KK56G5HIJ Z37 'v.NN9(CUVW "+C,@"A 	Av)(A6HMM+ (#$!%\\'26'||NB?"*,,|R"@0 	  	&7GiF}L
-PQc  PQ MT  3H:RsCDsC   F/ AG G/BG 2B<G /GGG 	H!HHrV   r   c                     | j                         }g }t        dt        |      |      D ];  }dj                  ||||z          }|j	                         s+|j                  |       = |r|S | dd gS )z2Split text into chunks of approximately max_words.r   r   Ni  )rg   r   rf   r   striprc   )rV   r   wordsr   r   r   s         r   r   r     ss    JJLEF1c%j), !qY/0;;=MM% ! 6.$u+.r   r   c                 z   	 ddl m} t        j                  j	                  dd      }|j                  |       g }| D ]*  }|j                  d|d      }|j                  |d	          , |S # t        $ rD}t        j                  d
| d       | D cg c]  }t        |d       nc c}w c}cY d}~S d}~ww xY w)z
    Generate vector embeddings for text chunks.
    Tries Gemini embedding API first, falls back to simple hash-based vectors.
    r   NGEMINI_API_KEY'AIzaSyCT_rx0NusUJWoqtT7uxHAKEfHo129SJb8)r   zmodels/text-embedding-004retrieval_document)modelr   	task_type	embeddingzGemini embedding failed: z. Using fallback hash vectors.r   )dim)google.generativeaigenerativeaiosenvironr9   	configureembed_contentrc   r<   r!   ra   _hash_vector)r   genair   r   r   resultr?   s          r   r   r     s    
B+**..!13\]( 	0E((1. ) F
 NN6+./	0  B21#5STU:@AU,AAABs*   A*A- -	B:6B5B('B5/B:5B:r   c                 B   t        j                  | j                               j                         }||dz  t	        |      z  dz   z  }ddl}|j                  d| d|d|dz         }t        d |D              dz  }|dk(  rd	g|z  S |D cg c]  }||z  	 c}S c c}w )
zEGenerate a deterministic pseudo-vector from text hash. Fallback only.   rH   r   N>fc              3   &   K   | ]	  }||z    y wNr   )r   vs     r   r   z_hash_vector.<locals>.<genexpr>  s     %q1u%s         ?g        )hashlibsha512encodedigestrf   structunpacksum)rV   r   hexpandedr   valuesnormr   s           r   r   r     s    t{{}%,,.AS1WA&!+,H]]Qse1:xq'9:F%f%%,Dqyus{$%AH%%%s   Br   c                     |  d| j                         }t        j                  |      j                         }t	        |dd d      dz  S )z7Generate a deterministic integer ID for a Qdrant point._N   l    )r   r   md5	hexdigestint)r/   r   rawr   s       r   r   r     sJ    Ja}
%
,
,
.CC""$Aq"vr?///r   c                     t         j                  dd       t         | d    dz  }t        |dd      5 }t        j                  | |dd	
       ddd       t
        j                  d|        y# 1 sw Y   "xY w)z9Save a JSON backup of the transcript to local filesystem.T)parentsexist_okr/   z.jsonwzutf-8)encoding   F)indentensure_asciiNzBackup saved: )TRANSCRIPT_BACKUP_DIRmkdiropenr:   dumpr!   debug)r-   filepathr   s      r   save_transcript_backupr    st    t<$&*<)=U'CCH	hg	. ;!		&!AE:;
LL>(,-; ;s   A00A9skip_existingembed_vectorsdry_runc           
         t               }	 t        |       |r[t        ||       }| D cg c]	  }||vs| }}t        j	                  dt        |        dt        |       dt        |       d       n| }t        ||       }t        |       t        |       t        |      z
  dddddg d}	t        |d      D ]  \  }
}t        j	                  d	|
 d
t        |       d| d       t        |      }|$t        j	                  d| d       t        |      }|>t        j                  d|        |	dxx   dz  cc<   |	d   j                  |ddd       |	dxx   dz  cc<   |	d   j                  |d   d      dz   |	d   |d   <   |rBt        j	                  d|d    d|d           |	d   j                  |d|d   |d   d       t        ||       t        |       |r"|j                  |i       }t        ||d   |       |	d   j                  |d|d   |d   d       |
t        |      k  st!        j"                  d        |	|j%                          S c c}w # |j%                          w xY w)zr
    Extract transcripts for a list of video IDs.

    Returns:
        Summary dict with counts and results.
    zVideos: z total, z already have transcripts, z to extract.r   )rY   r   )totalskipped_existing	extractedr   methodsvideo_resultsrH   [/z] Extracting transcript for z...z  Falling back to Supadata for z&  FAILED: No transcript available for r   r  no_transcript_available)r/   r   reasonr  r  r4   z  OK: r3   z words via extracted_dry_run)r/   r   r3   methodr0   successr   )r   r%   r,   r!   r"   rf   _load_video_metadatar   ro   r   ra   rc   r9   r@   r  r   r   r   close)r&   r  r  r  r#   existingvidpendingmetadata_mapresultsr   r/   r-   metas                 r   extract_transcriptsr*    s    DY & /i@H&/Gs3h3FsGGGKK3y>*(3x=/A\w<..
  G ,D)< ^ #IW =23C
 %Wa0 :	 KAxKK!A3aG~-I(SVWX -X6F ~=hZsKL-h7~!GzRS!Q&!(// (&71 
 K A% 	"&&v.A'BAFJ Iv&9:; VL12+fEX>Y=Z[ (// (1"("6$%89	1   T6* #6* #''"5*8VL5I4PO$++$#$\2 !45	-  3w<

3u:	 x  	

g Hf 	

s'   I& 	I!I!G>I& 7I& !I& &I8c                 l   |si S i }	 | j                         5 }|j                  d|f       |j                         D ]0  }|d   |d   |d   |d   r|d   j                         ndd||d   <   2 	 d	d	d	       |S # 1 sw Y   |S xY w# t        $ r#}t
        j                  d
|        Y d	}~|S d	}~ww xY w)z-Load video metadata from watch history table.a  
                SELECT DISTINCT ON (video_id)
                    video_id, title, channel_name, channel_id, watched_at
                FROM yt_watch_history
                WHERE video_id = ANY(%s)
                ORDER BY video_id, watched_at DESC
            rH   r     r   rW   )r   r   
channel_idr   r   NzFailed to load video metadata: )r   r   r*   	isoformatr<   r!   ra   )r#   r&   r   r$   r+   r?   s         r   r"  r"    s    	H>[[] 	cKK   ||~  V$'F"%a&8;A#a&"2"2"4B	$Q 	$ O%	$ O  >8<==O>s5   B AA:0B :B?B B 	B3B..B3c                     t        j                  d      } | j                  ddd       | j                  ddd	
       | j                  dd       | j                  dddd       | j                  ddd
       | j                  ddd
       | j                  ddd
       | j                         }g }|j                  r|j                  }n[|j
                  s|j                  r2t               }	 t        ||j                        }|j                          n| j                  d       |s8t        j                  d       t        t        j                  dddd             y t        j!                  dt#        |       d       t%        ||j&                   |j(                   |j*                        }t        t        j                  |dt,                      y # |j                          w xY w)!Nz7Extract YouTube transcripts for Genesis memory pipeline)descriptionz--video-ids+zSpecific video IDs to extract)nargshelpz--today
store_truez=Extract transcripts for all videos from today's watch history)actionr3  z--datezFExtract transcripts for videos watched on a specific date (YYYY-MM-DD))r3  z--skip-existingTz9Skip videos that already have transcripts (default: True))r5  defaultr3  z--forcez$Re-extract even if transcript existsz--no-vectorszSkip Qdrant vector embeddingz	--dry-runz Extract but don't store anythingz,Must specify --video-ids, --today, or --datezNo video IDs to process.r   )r  r  r   zProcessing z
 videos...)r&   r  r  r  r  )r	  r6  )argparseArgumentParseradd_argument
parse_argsr&   todaydater   rQ   r#  r=   r!   ra   printr:   r;   r"   rf   r*  force
no_vectorsr  re   )parserargsr&   r#   r(  s        r   mainrB    s   $$MF ,  
 L  
 U   H	   3  
 +  
 /   D I~~NN		tyy "	.tTYY?IJJLCD12djj11BCD
KK+c)n-Z89!**n//)	G 
$**WQ
45% JJLs   1G G,__main__r   )rv   )r   )r   )TTF)9__doc__r7  r   r:   loggingr   resysr   r   r   r   pathlibr   typingr   r   r	   r
   r   pathinsertbasicConfigINFO	getLoggerr!   r   r9   r   r   r  r   EMBEDDING_MODELr   r%   re   r)   r,   boolr@   rQ   ro   r   dictr  r   r   r   floatr   r   r   r  r*  r"  rB  __name__r   r   r   <module>rT     s  >     	 	 
  2 2  3 3 > ?   
,,>
 
		4	5 ::>>)  1 MN $ *F74	2d3i 	2C 	24S> d B28C= 2DI 2.Fc FhtCH~.F FZY3 Y8DcN+C Yz DF+++)-+=@+d38n+dA A# AcSVh AH/c /c /DI /Bc BtDK/@ B4&s & &tE{ &0 03 03 0.4S> . 	fCyff f 	f
 
#s(^fR$s) S$Y @E6P zF r   