
    i&c                        S r SSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKJrJ	r	  SSK
Jr  SSKJrJrJrJr  \" S5      r\R$                  R'                  S\" \S-  S-  5      5        \R*                  " \R,                  S	S
S9  \R.                  " S5      rSrSr\" \S-  S-  5      rSrSrS rS3S jrS\\\\4      S\ 4S jr!S4S\\   S\ S\\   4S jjr"S\S\\   4S jr#S\S\4S jr$S\S\4S jr%S5S \ S!\ SS4S" jjr&S6S#\ S\'4S$ jjr(S\'4S% jr)\4S\ S\\\\4      4S& jjr*\4S\ S\\\\4      4S' jjr+\\\SS(S4S)\\   S*\ S+\ S,\S-\\   S.\'S/\\   S\\\4   4S0 jjr,S1 r-\.S2:X  a  \-" 5         gg)7aJ  
YouTube Watchlist Scraper
==========================
Uses Playwright to scrape YouTube Watch History and Watch Later playlist
from an authenticated browser session (sunvision07@gmail.com).

This is the BROWSER-FIRST strategy - it expects the browser to already be
authenticated. It works with the Genesis Collaborative Browser Handoff protocol:
  1. Human logs into sunvision07@gmail.com in Chrome
  2. This script attaches to that session (or uses persistent profile)
  3. Scrapes Watch History + Watch Later
  4. Persists to PostgreSQL youtube_intel table

Usage:
    # Scrape both Watch History and Watch Later (default)
    python youtube_watchlist_scraper.py

    # Scrape only Watch History (recent 50 videos)
    python youtube_watchlist_scraper.py --source history --limit 50

    # Scrape only Watch Later
    python youtube_watchlist_scraper.py --source watchlater

    # Use a specific browser profile directory (for persistent auth)
    python youtube_watchlist_scraper.py --profile-dir "E:/genesis-system/config/browser_profile_yt"

    # Output to JSON file only (no DB write)
    python youtube_watchlist_scraper.py --dry-run --output-file scraped_videos.json

    # Connect to existing Chrome session via CDP
    python youtube_watchlist_scraper.py --cdp-url http://localhost:9222

Author: Genesis System
Version: 1.0.0
Date: 2026-02-23
    N)datetimetimezone)Path)ListDictAnyOptionalzE:/genesis-systemdatazgenesis-memoryz1%(asctime)s [%(levelname)s] %(name)s: %(message)sz%Y-%m-%d %H:%M:%S)levelformatdatefmtyt_watchlist_scraperz$https://www.youtube.com/feed/historyz(https://www.youtube.com/playlist?list=WLconfigbrowser_profile_ytd      c                  V    SSK n SSKJn  U R                  " S0 UR	                  5       D6$ )zConnect to Elestio PostgreSQL.r   N)PostgresConfig )psycopg2elestio_configr   connectget_connection_params)r   r   s     6E:/genesis-system/scripts/youtube_watchlist_scraper.pyget_db_connectionr   L   s%    -EnBBDEE    returnc                     SnU R                  5        nUR                  U5        SSS5        U R                  5         [        R	                  S5        g! , (       d  f       N4= f)z0Create youtube_intel table if it does not exist.a  
        CREATE TABLE IF NOT EXISTS youtube_intel (
            id            SERIAL PRIMARY KEY,
            video_id      VARCHAR(20)  NOT NULL,
            title         TEXT         NOT NULL,
            channel_name  TEXT,
            channel_url   TEXT,
            video_url     TEXT,
            source        VARCHAR(30)  NOT NULL,  -- 'watch_history' | 'watch_later'
            position      INTEGER,                -- order in list (1 = most recent/top)
            thumbnail_url TEXT,
            duration_text TEXT,                   -- raw text like "12:34" or "1:02:45"
            scraped_at    TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
            transcript_ingested BOOLEAN DEFAULT FALSE,
            UNIQUE (video_id, source)
        );

        CREATE INDEX IF NOT EXISTS idx_yt_intel_video_id
            ON youtube_intel (video_id);
        CREATE INDEX IF NOT EXISTS idx_yt_intel_source
            ON youtube_intel (source);
        CREATE INDEX IF NOT EXISTS idx_yt_intel_scraped_at
            ON youtube_intel (scraped_at);
        CREATE INDEX IF NOT EXISTS idx_yt_intel_ingested
            ON youtube_intel (transcript_ingested);
    Nzyoutube_intel schema ready.)cursorexecutecommitloggerinfo)connsqlcurs      r   ensure_youtube_intel_schemar'   S   sE    C4 
#C 
KKM
KK-. 
s   A
A!recordsc           	      
   U(       d  gSnSnU R                  5        nU H?  n UR                  X%5        UR                  5       nU(       a  US   (       a  US-  nM=  M?  MA     SSS5        U R                  5         [        R                  S[        U5       S	U S
35        U$ ! [         aE  n[        R                  SUR                  S5       SU 35        U R                  5          SnAM  SnAff = f! , (       d  f       N= f)zCUpsert video records into youtube_intel. Returns count of new rows.r   au  
        INSERT INTO youtube_intel
            (video_id, title, channel_name, channel_url, video_url,
             source, position, thumbnail_url, duration_text)
        VALUES
            (%(video_id)s, %(title)s, %(channel_name)s, %(channel_url)s,
             %(video_url)s, %(source)s, %(position)s, %(thumbnail_url)s,
             %(duration_text)s)
        ON CONFLICT (video_id, source) DO UPDATE SET
            title         = EXCLUDED.title,
            channel_name  = EXCLUDED.channel_name,
            position      = EXCLUDED.position,
            scraped_at    = NOW()
        RETURNING (xmax = 0) AS is_new
       zUpsert failed for video_id: Nz	Upserted z
 records (z new).)r   r    fetchone	Exceptionr"   warninggetrollbackr!   r#   len)r$   r(   r%   	new_countr&   recrowes           r   upsert_video_recordsr7   u   s    C  I	#CC%lln3q6NI "3	  
 	KKM
KK)CL>I;fEF  !3CGGJ4G3H1#NO 
s4   C47B"C4"
C1,:C,&C4,C11C44
Dsourcelimitc           
         S/n/ nU(       a"  UR                  S5        UR                  U5        SR                  U5      nSU S3nUR                  U5        U R                  5        nUR                  Xd5        UR                   Vs/ s H  oS   PM	     n	nUR                  5        V
s/ s H  n
[        [        X5      5      PM     sn
sSSS5        $ s  snf s  sn
f ! , (       d  f       g= f)zLReturn video records not yet ingested, ordered by position (priority first).ztranscript_ingested = FALSEzsource = %sz AND zt
        SELECT video_id, title, channel_name, source, position, video_url
        FROM youtube_intel
        WHERE z
        ORDER BY
            CASE source WHEN 'watch_later' THEN 0 ELSE 1 END,
            position ASC NULLS LAST
        LIMIT %s
    r   N)appendjoinr   r    descriptionfetchalldictzip)r$   r8   r9   
conditionsparamswherer%   r&   dcolsr5   s              r   get_pending_video_idsrF      s    /0JF-(fLL$E g C MM%	#C !oo.o!o.03?S^$? 
.? 
s*   ' CCC)CC
C
C+urlc                     U (       d  g/ SQnU H3  n[         R                  " X 5      nU(       d  M"  UR                  S5      s  $    g)z5Extract 11-char YouTube video ID from any URL format.N)zL(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})z"youtube\.com/v/([a-zA-Z0-9_-]{11})z'youtube\.com/shorts/([a-zA-Z0-9_-]{11})r*   )researchgroup)rG   patternspatms       r   extract_video_idrO      sA    H
 IIc1771:  r   r+   c                     SU  3$ )Nz https://www.youtube.com/watch?v=r   r+   s    r   build_video_urlrR      s    -hZ88r   c                     SU  S3$ )Nzhttps://i.ytimg.com/vi/z/hqdefault.jpgr   rQ   s    r   build_thumbnail_urlrT      s    $XJn==r   timesdelay_msc                     [        U5       HD  nU R                  S5        U R                  U5        [        R	                  SUS-    SU S35        MF     g)z7Scroll the page multiple times to trigger lazy-loading.z.window.scrollTo(0, document.body.scrollHeight)zScroll r*   /z
 complete.N)rangeevaluatewait_for_timeoutr"   debug)pagerU   rV   is       r   _scroll_pager_      sJ    5\FGh'wqse1UG:67 r   
timeout_msc                 p     U R                  SUS9  g! [         a    [        R                  S5         gf = f)z2Wait until YouTube video renderer elements appear.zKytd-video-renderer, ytd-playlist-video-renderer, ytd-compact-video-renderertimeoutTz.Timed out waiting for YouTube video renderers.F)wait_for_selectorr.   r"   r/   )r]   r`   s     r   _wait_for_youtube_contentre      sD    Y 	 	
  GHs    55c                 N    SU R                   ;   a  gU R                  S5      nUSL$ )z=Return True if the page is showing a logged-in YouTube state.zaccounts.google.comFz'button#avatar-btn, yt-img-shadow#avatarN)rG   query_selector)r]   avatars     r   _check_logged_inri      s/     (  !JKFr   c                     [         R                  SU S35        U R                  [        SSS9  U R	                  S5        [        U 5      (       d  [         R                  S5        / $ U R                  S5      nU(       a`  [        US	5      (       a  UR                  5       OS
nSUR                  5       ;   d  SUR                  5       ;   a  [         R                  S5        [        SUS-  S-   5      n[        XSS9  [        U 5        / nU R                  S5      n[         R                  S[!        U5       S35        [#        USU SS9 GH  u  px UR                  S5      n	U	(       d  UR                  S5      n	U	(       d  M9  U	R%                  S5      =(       d    S
n
['        U
5      nU(       d  UR%                  S5      nU(       d  M  U	R%                  S5      =(       d    U	R                  5       =(       d    S
R)                  5       nS
nS
nUR                  S5      nU(       a8  UR                  5       R)                  5       nUR%                  S5      =(       d    S
nUR                  S5      nS
nU(       a;  UR%                  S 5      =(       d#    UR%                  S!5      =(       d    [+        U5      nU(       a  UR-                  S"5      (       a  [+        U5      nS
nUR                  S#5      nU(       a  UR                  5       R)                  5       nUR/                  UUUU[1        U5      S$UUUS%.	5        GM     [         R                  S([!        U5       S)35        U$ ! [2         a'  n[         R5                  S&U S'U 35         SnAGM@  SnAff = f)*z[
Scrape YouTube Watch History from youtube.com/feed/history.

Returns list of video dicts.
z#Navigating to Watch History (limit=)...domcontentloaded0u  
wait_untilrc     zNot logged in. Please authenticate in the browser first.
Use --profile-dir with a directory where sunvision07@gmail.com is logged in.z%yt-empty-state-renderer, #empty-state
inner_text zhistory is offpausedzFWatch history appears to be paused. Enable it at myaccount.google.com.           rU   rV   zytd-video-rendererFound z' video renderers on Watch History page.Nr*   starta#video-titleza.ytd-video-rendererhrefzdata-video-idtitlerytd-channel-name a, .ytd-channel-name a, a.yt-simple-endpoint[href*='/channel/'], a.yt-simple-endpoint[href*='/@']$img.yt-img-shadow, ytd-thumbnail imgsrcz
data-thumbdata:`span.ytd-thumbnail-overlay-time-status-renderer, ytd-thumbnail-overlay-time-status-renderer spanwatch_history	r+   r~   channel_namechannel_url	video_urlr8   positionthumbnail_urlduration_textz#Error parsing renderer at position r,   Scraped z videos from Watch History.)r"   r#   gotoYOUTUBE_HISTORY_URLr[   ri   errorrg   hasattrrq   lowerr/   maxr_   re   query_selector_allr2   	enumerateget_attributerO   striprT   
startswithr;   rR   r.   r\   )r]   r9   history_offmsgscrolls_neededr(   	renderersr   rendererlink_elr}   r+   r~   r   channel_url_raw
channel_elthumb_elr   r   duration_elr6   s                        r   scrape_watch_historyr      sC    KK5eWDABII!.@%IP$D!![	
 	 %%&MNK*1+|*L*Lk$$&RTsyy{*h#))+.ENNcd ERK!O,Nd; d# %'G''(<=I
KK&Y((OPQ'	&5(9C@	--o>G"112HI((06BD'-H#11/B**73Qw7I7I7KQrXXZE L O!003J )446<<>",":":6"B"Hb  ../UVHM**51 5--l;5*84 
 !M$<$<W$E$E 3H = M"11BK  + 6 6 8 > > @NN$ ,.,X6)$!.!.
 
g DF KK(3w<.(CDEN  	LL>xj1#NO	s&   0MAME,M
N&NNc                    [         R                  SU S35        U R                  [        SSS9  U R	                  S5        [        U 5      (       d  [         R                  S5        / $ U R                  S5      nU(       a  [         R                  S	5        / $ [        S
US-  S-   5      n[        XSS9   U R                  SSS9  U R                  S5      n[         R                  S[        U5       S35        / n[        USU SS9 GH  u  pg UR                  S5      nU(       d  M!  UR!                  S5      =(       d    Sn	[#        U	5      n
U
(       d'  SU	;   a  U	R%                  S5      S   OU	n[#        U5      n
U
(       d  M}  UR!                  S5      =(       d    UR'                  5       =(       d    SR)                  5       nSnSnUR                  S5      nU(       a8  UR'                  5       R)                  5       nUR!                  S5      =(       d    SnUR                  S5      n[+        U
5      nU(       a9  UR!                  S 5      =(       d    SnU(       a  UR-                  S!5      (       d  UnSnUR                  S"5      nU(       a  UR'                  5       R)                  5       nUR/                  U
UUU[1        U
5      S#UUUS$.	5        GM     [         R                  S'[        U5       S(35        U$ ! [         a    [         R                  S5        / s $ f = f! [         a'  n[         R3                  S%U S&U 35         SnAGMD  SnAff = f))z
Scrape YouTube Watch Later playlist from youtube.com/playlist?list=WL.

Returns list of video dicts, position 1 = top of queue (highest priority).
z*Navigating to Watch Later playlist (limit=rk   rl   rm   rn   rp   z2Not logged in. Cannot access Watch Later playlist.zyt-empty-state-rendererz)Watch Later playlist appears to be empty.rt      rv   rw   rx   zytd-playlist-video-rendereri'  rb   z<No playlist video renderers found. Watch Later may be empty.ry   z) video renderers in Watch Later playlist.Nr*   rz   r|   r}   rr   &r   r~   r   r   r   r   r   watch_laterr   z&Error parsing WL renderer at position r,   r   z videos from Watch Later.)r"   r#   r   YOUTUBE_WATCH_LATER_URLr[   ri   r   rg   r   r_   rd   r.   r/   r   r2   r   r   rO   splitrq   r   rT   r   r;   rR   r\   )r]   r9   empty_elr   r   r(   r   r   title_elr}   r+   
clean_hrefr~   r   r   r   r   r   r   r   dur_elr6   s                         r   scrape_watch_laterr   i  s    KK<UG4HIII%2DeIT$D!!IJ	 ""#<=H?@	 ERK!O,Nd;<eL
 ''(EFI
KK&Y((QRS$&G'	&5(9C;	..?H))&17RD'-H36$;TZZ_Q/D
+J7++G4S8K8K8MSQSZZ\E L O!00\J )446<<>",":":6"B"Hb  ../UVH/9M,,U39rs~~g66$'M M,,BF  & 1 1 3 9 9 ;NN$ ,.,X6'$!.!.
 
] D| KK(3w<.(ABCNQ  UV	F  	LLA(2aSQR	s8   3K6 L&ALEL6!LL
M'M		MFsourceshistory_limitwl_limitprofile_dircdp_urldry_runoutput_filec           	      	    SSK Jn  [        U5      R                  SSS9  / nW" 5        n	U(       a  [        R                  SU 35         U	R                  R                  U5      n
U
R                  (       a  U
R                  S   OU
R                  5       nUR                  (       a  UR                  S   OUR                  5       nU(       di  [        R                  SU 35        U	R                  R#                  US/ SQSSS.S9nUR                  (       a  UR                  S   OUR                  5       n [        R                  S5        WR%                  SSSS9  UR'                  S5        [)        U5      (       d  [        R+                  S5        SU ;   a<  [-        XS9nUR/                  U5        [        R                  S[1        U5       S35        SU ;   a<  [3        XS9nUR/                  U5        [        R                  S[1        U5       S35        U(       d  WR5                  5         [        R                  S5         S
S
S
5        [1        U5      n[        R                  S U 35        U(       ap  [        U5      nUR6                  R                  SSS9  [9        US!S"S#9 n[:        R<                  " UUS$S[>        S%9  S
S
S
5        [        R                  S&U S'U 35        SnU(       d9  U(       a2   [A        5       n[C        U5        [E        UU5      nUR5                  5         0 nU H!  nURG                  US)   S5      S-   UUS)   '   M#     UUUU Vs/ s H  nUS*   PM
     sn[H        RJ                  " [L        RN                  5      RQ                  5       US+.n[S        [:        RT                  " US$S,95        U$ ! [         a/    [        R	                  S5        [
        R                  " S5         GNf = f! [          a:  n[        R	                  SU 35        [        R                  S	5        S
n S
nAGNsS
nAff = f! U(       d  WR5                  5         [        R                  S5        f = f! , (       d  f       GN5= f! , (       d  f       GN= f! [          a#  n[        R	                  S(U 35         S
nAGNS
nAff = fs  snf )-a  
Run the watchlist scraper and persist results.

Args:
    sources:       List of 'history' and/or 'watchlater'
    history_limit: Max videos to scrape from Watch History
    wl_limit:      Max videos to scrape from Watch Later
    profile_dir:   Chromium persistent profile with Google login
    cdp_url:       CDP URL for connecting to existing Chrome (e.g. http://localhost:9222)
    dry_run:       If True, print results without storing to DB
    output_file:   Optional path to save JSON output

Returns:
    Summary dict
r   )sync_playwrightz[Playwright is required.
Install with: pip install playwright && playwright install chromiumr*   T)parentsexist_okz'Connecting to existing Chrome via CDP: zCDP connection failed: z,Falling back to persistent profile launch...Nz!Launching Chromium with profile: F)z---disable-blink-features=AutomationControlledz--disable-dev-shm-usagez--no-sandboxi   i   )widthheight)user_data_dirheadlessargsviewportz Checking authentication state...zhttps://www.youtube.comrl   i N  rn   i  zBrowser is not logged in to Google.
Please log in manually and re-run, or use --profile-dir with a directory where you are already logged in as sunvision07@gmail.com.history)r9   zWatch History: z videos scraped.
watchlaterzWatch Later: zBrowser context closed.zTotal videos scraped: wzutf-8)encodingrv   )indentensure_asciidefaultzSaved z records to zDatabase write failed: r8   r+   )total_scraped	new_to_db	by_source	video_ids
scraped_atr   )r   )+playwright.sync_apir   ImportErrorr"   r   sysexitr   mkdirr#   chromiumconnect_over_cdpcontextsnew_contextpagesnew_pager.   launch_persistent_contextr   r[   ri   r/   r   extendr2   r   closeparentopenjsondumpstrr   r'   r7   r0   r   nowr   utc	isoformatprintdumps)r   r   r   r   r   r   r   r   all_recordspbrowsercontextr]   r6   history_records
wl_recordstotalout_pathfr3   r$   r   r4   rsummarys                            r   run_scraperr     s
   07 	D48(*K		aKKA'KL**55g>181A1A'**1-wGZGZG\+2==w}}Q'g>N>N>P KK;K=IJjj::)
 $(37 ; 	G (/}}7==#':J:J:LD	3KK:;II/<NX]I^!!$'#D))\ G#"6t"Q""?3oc/.B-CCSTUw&/E
"":.mC
O+<<LMN KK12s 
x E
KK(01 $dT:(C'2aIIk1QUCP 3fUG<z:; I{	8$&D'-,T;?IJJL
 !#I#,==X#BQ#F	#h-   -89[a
m[9ll8<<0::<G 
$**WQ
'(N[  R	
 	(  6qc:;JK^ KK12s 
	F 32  	8LL21#677	8 :s   N>  Q2A;O:A0Q27C Q,Q2&R71R S
>5O76O7:
P>/P93Q29P>>Q2.Q//Q22
R
R
S R>>Sc            
         [         R                  " S[         R                  S9n U R                  S/ SQSSS9  U R                  S[        [
        S	[
         S
3S9  U R                  S[        [        S[         S
3S9  U R                  S[        S[         S
3S9  U R                  SSS9  U R                  SSSS9  U R                  SSS9  U R                  5       nUR                  S:X  a  SS/nOUR                  /n[        UUR                  UR                  UR                  UR                  UR                  UR                   S9  g )NzScrape YouTube Watch History and Watch Later using Playwright.
Requires browser already authenticated as sunvision07@gmail.com.)r=   formatter_classz--source)r   r   bothr   z'Which list(s) to scrape (default: both))choicesr   helpz--history-limitz-Max Watch History videos to scrape (default: ))typer   r   z
--wl-limitz+Max Watch Later videos to scrape (default: z--profile-dirz0Chromium persistent profile directory (default: )r   r   z	--cdp-urlz?Connect to existing Chrome via CDP (e.g. http://localhost:9222))r   z	--dry-run
store_truez#Scrape but do not write to database)actionr   z--output-filez&Save scraped records to this JSON filer   r   )r   r   r   r   r   r   r   )argparseArgumentParserRawDescriptionHelpFormatteradd_argumentintDEFAULT_HISTORY_LIMITDEFAULT_WL_LIMITDEFAULT_PROFILE_DIR
parse_argsr8   r   r   r   r   r   r   r   )parserr   r   s      r   mainr  a  s   $$O !<<F 16	   %<=R<SSTU	    :;K:LAN	   #?@S?TTUV  
 N   2  
 5  
 D {{fl+;;-(($$$$r   __main__)r   N)Ni  )   i  )i:  )/__doc__r   r   loggingosrI   r   timer   r   pathlibr   typingr   r   r   r	   GENESIS_ROOTpathinsertr   basicConfigINFO	getLoggerr"   r   r   r   r   r   r   r'   r   r7   rF   rO   rR   rT   r_   boolre   ri   r   r   r   r  __name__r   r   r   <module>r     sz  #J    	 	 
  '  , ,
 '( 3|f,/??@ A   
,,>
 
		1	2
 = D ,14HHI   F/D#T#s(^(< # #L@ @S @SWX\S] @># (3-  9c 9c 9># ># >8c 8 8 8
 
 
d  -B ic id4PSUXPX>FZ i` +; cC ctDcN?S cX /$*!!%G#YGG G 	G
 c]G G #G 
#s(^G\=@ zF r   