
    	&i6                     8   d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddl
mZ ddlmZmZ ddlZ ej                   d      ZdZej&                  j)                  dd	      Zd
diZ e	d      Zd-dededee   fdZd.dededee   fdZd.dededee   fdZdede	dee	   fdZdede	dee	   fdZdedee   fdZ d/dedee	   defdZ!e"dk(  r ejF                  ejH                  d        e%ejL                        dkD  rejL                  d   ndZ' e	d      Z(e(jS                  d d !        e!e'e(      Z* e+d"        e+d#e*d$            e+d% e%e*d&                  e+d' e%e*d(                  e+d)e*d*            e+d+e*d&   dd,         yy)0u  
Genesis Clone Pipeline — Content Extraction Module
===================================================
Extracts content from target websites using a layered fallback approach:
  Layer 1: Jina Reader API (free, fast, LLM-ready markdown)
  Layer 2: Crawl4AI (open-source, deep crawl, images/CSS/structure)
  Layer 3: Playwright screenshot (visual reference)
  Layer 4: Raw requests fallback (basic HTML)

No SQLite. Output is file-based or passed in-memory.
    N)Path)Optional)urlparseurljoinzclone_pipeline.extractorzhttps://r.jina.ai/JINA_API_KEY z
User-AgentzoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36z3/mnt/e/genesis-system/scripts/clone_pipeline/outputurltimeoutreturnc                 8   t          |  }ddd}t        rdt         |d<   	 t        j                  d|         t	        j
                  |||      }|j                          |j                         }t        |t              r|j                  di       j                  d	      xs$ |j                  d      xs |j                  d	      }t        |t              r2t        |      d
kD  r$t        j                  dt        |       d       |S |j                  r$t        |j                        d
kD  r|j                  S t        j                  dt        |      dd         y# t        j                  j                  $ r t        j                  d| d|         Y yt        j                  j                   $ r9}t        j                  d|j"                  j$                   d|         Y d}~yd}~wt&        $ r"}t        j                  d|        Y d}~yd}~ww xY w)u   
    Extract page content as clean Markdown via Jina Reader API.

    Prepends https://r.jina.ai/ to the target URL — no SDK required.
    Returns markdown string on success, None on failure.
    zapplication/jsonmarkdown)AcceptzX-Return-FormatzBearer Authorizationz[Jina] Fetching: headersr
   datacontentd   z[Jina] Extracted z charsz&[Jina] Unexpected response structure: N   z[Jina] Timeout after zs for z[Jina] HTTP z for z[Jina] Failed: )	JINA_BASEr   loggerinforequestsgetraise_for_statusjson
isinstancedictstrlentextwarning
exceptionsTimeout	HTTPErrorresponsestatus_code	Exception)r	   r
   jina_urlr   respr   r   es           9/mnt/e/genesis-system/scripts/clone_pipeline/extractor.pyextract_with_jinar-   2   s    SE"H$%G %,\N#;  'u-.||HgwGyy{dD!$((3 '88F#'88I& 
 '3'CL3,>/G~VDE 99TYY#-99?D	$3?PQR&& .wivcUCD(( ajj&<&<%=U3%HI ,-s6   C(E! /E! <$E! !8HH7/G++H7HH	max_pagesc           
      v  K   	 ddl m}m}m} ddlm}  |dd      } |ddd      }d	g g i dd
} ||      4 d{   }	|	j                  | |       d{   }
|
j                  r7|
j                  xs d	|d<   |
j                  xs i |d<   d|d<   |
j                  rJ|
j                  j                  dg       D ]+  }|j                  dd	      }|s|d   j                  |       - |
j                  rc|
j                  j                  dg       }|dd D cg c]5  }|j                  dd	      j                  d      r|j                  dd	      7 c}|d<   t        j!                  dt#        |d          dt#        |d          d       |cddd      d{    S ddd      d{    t        j%                  d       y7 7 rc c}w 7 67 '# 1 d{  7  sw Y   7xY w# t&        $ r t        j%                  d       Y yt(        $ r"}t        j%                  d|        Y d}~yd}~ww xY ww)z$Internal async crawl using Crawl4AI.r   )AsyncWebCrawlerBrowserConfigCrawlerRunConfig)LLMExtractionStrategyTF)headlessverbose2   )word_count_thresholdexclude_external_linksremove_overlay_elementsr   )r   imageslinksmetadatapages_crawled)configN)r	   r>   r   r<      r=   r:   srcinternal   hrefhttpr;   z[Crawl4AI] OK: z chars, z imagesz%[Crawl4AI] Crawler returned no resultz3[Crawl4AI] Not installed. Run: pip install crawl4aiz[Crawl4AI] Failed: )crawl4air0   r1   r2   crawl4ai.extraction_strategyr3   arunsuccessr   r<   mediar   appendr;   
startswithr   r   r    r"   ImportErrorr(   )r	   r.   r0   r1   r2   r3   browser_cfgrun_cfgresult_datacrawlerresultimgr@   rA   lnkr+   s                   r,   _crawl4ai_asyncrT   h   s5    8MMF#T5A"!##($(
 
 #+6 	# 	#'"<<C<@@F~~*0//*?RJ'*0//*?RJ'/0O, <<%||//"= >!ggeR0'188=> <<%||//
B?H7?},037762.99&A +,K(
 %c+j*A&B%C8;x012'; #7	# 	# 	# 	# 	#: 	>?=	#@",%	# 	# 	# 	# 	#@  LM ,QC01s   H9:G, GG, GGA;GAG:G;GG, GG, !H9"G, -G.G, H9G, GGG, G, G)G G)%G, ,H6
H9H6H1,H91H66H9c                 b   	 t        j                         }|j                         rdddl}|j                  j                         5 }|j                  t         j                  t        | |            }|j                  d      cddd       S |j                  t        | |            S # 1 sw Y   yxY w# t        $ rt}t        j                  d|        	 t        j                  t        | |            cY d}~S # t        $ r&}t        j                  d|        Y d}~Y d}~yd}~ww xY wd}~ww xY w)aZ  
    Deep crawl a website using Crawl4AI.

    Returns dict with keys:
      - markdown: str (extracted text as markdown)
      - images: list[str] (image URLs)
      - links: list[str] (internal page links)
      - metadata: dict (title, description, etc.)
      - pages_crawled: int

    Returns None if Crawl4AI is not installed or fails.
    r   N<   r
   z[Crawl4AI] Event loop error: z"[Crawl4AI] Final fallback failed: )asyncioget_event_loop
is_runningconcurrent.futuresfuturesThreadPoolExecutorsubmitrunrT   rQ   run_until_completer(   r   r"   )r	   r.   loop
concurrentpoolfuturer+   e2s           r,   crawl_with_crawl4airf      s    %%'??%##668 1DKKi!@ }}R}0	1 1 **?3	+JKK1 1  6qc:;	;;sI>?? 	NN?tDE4		sf   AB1 <B% 	B1 
B1 %B.*B1 .B1 1	D.:D)C71D.7	D& D!D)!D&&D))D.output_pathc                 `  K   	 ddl m}  |       4 d{   }	 |j                  j                  dddg       d{   }|j                  d
dd       d{   }|j                  | dd       d{    t        j                  d       d{    |dz  }|j                  t        |      d       d{    |j                          d{    t        j                  d|        |cddd      d{    S 7 7 # t        $ r, |j                  j                  ddddg	       d{  7  }Y w xY w7 7 7 7 7 s7 K# 1 d{  7  sw Y   yxY w# t        $ r t        j                  d       Y yt        $ r"}t        j                  d|        Y d}~yd}~ww xY ww)z-Take a full-page screenshot using Playwright.r   )async_playwrightNTz--no-sandboxz--disable-dev-shm-usage)r4   argsz/usr/bin/chromium-browser)r4   executable_pathrj   i  i  )widthheight)viewporti0u  networkidle)r
   
wait_untilr?   zscreenshot_original.png)path	full_pagez[Playwright] Screenshot saved: z[Playwright] Not installedz [Playwright] Screenshot failed: )playwright.async_apiri   chromiumlaunchr(   new_pagegotorX   sleep
screenshotr   closer   r   rL   r"   )r	   rg   ri   pbrowserpagescreenshot_pathr+   s           r,   _playwright_screenshot_asyncr      s    $9#% 	# 	# !

 1 1!(*CD !2 !  !))#'37 *  D ))C=)III--"""),EEO//s?';t/LLL--/!!KK9/9JKL"5	# 	# 	#   !

 1 1!$?(*CD !2 !   J" M!/	# 	# 	# 	#8  34 9!=>s  F.E! DE! E"DDDEE E6E7EE(E<E=EEE2E! >E
?E! F.E! D,D=4D75D=:E<D==EEEEE
E! EEEE! F.E! !F+?F.F+	F&!F.&F++F.
output_dirc           	         |j                  dd       	 t        j                  t        | |            S # t        $ r ddl}|j                  j                         5 }|j                  t        j                  t        | |            }	 |j                  d      cddd       cY S # t        $ r+}t        j                  d|        Y d}~ddd       Y yd}~ww xY w# 1 sw Y   Y yxY ww xY w)z
    Take a full-page screenshot of a URL using Playwright.

    Saves to output_dir/screenshot_original.png
    Returns the path if successful, None otherwise.
    Tparentsexist_okr   NrV   rW   z%[Playwright] Thread fallback failed: )mkdirrX   r_   r   RuntimeErrorr[   r\   r]   r^   rQ   r(   r   r"   )r	   r   rb   rc   rd   r+   s         r,   screenshot_with_playwrightr      s     TD1{{7ZHII !224 	[[9#zJF}}R}0	 	  !FqcJK	 		 	sQ   4 'C)+CB$
C)$	C-CC	C)CCC%	 C)%C)c                     	 t         j                  d|         t        j                  | t        d      }|j                          |j                  S # t        $ r"}t         j                  d|        Y d}~yd}~ww xY w)za
    Basic HTML extraction via requests as last resort.
    Returns raw HTML string or None.
    z[Raw] Fetching HTML: rB   r   z[Raw] Failed: N)	r   r   r   r   DEFAULT_HEADERSr   r!   r(   r"   )r	   r*   r+   s      r,   extract_raw_htmlr     si    
+C512||C"Eyy s+,s   AA 	A=A88A=c           	      8   ddg g i dd| d}t         j                  d|         t        |       }|r<t        |      dkD  r.||d<   d|d	<   t         j                  d
t        |       d       n]t         j                  d       t	        |       }|r|j                  d      rt        |d         dkD  rr|d   |d<   |j                  dg       |d<   |j                  dg       |d<   |j                  di       |d<   d|d	<   t         j                  dt        |d          d       nt         j                  d       t        |       }|rm||d<   ddl}|j                  dd|      }|j                  dd|      j                         }|dd |d<   d|d	<   t         j                  dt        |       d       nt         j                  d|         |r	 t        | |      }|rt        |      |d<   |S |S # t        $ r#}	t         j                  d|	        Y d}	~	|S d}	~	ww xY w)a&  
    Extract content from URL using layered fallback strategy.

    Priority:
      1. Jina Reader (fast, free, LLM-ready markdown)
      2. Crawl4AI (deep crawl if Jina fails or returns too little)
      3. Raw HTML (last resort for basic content)

    Screenshot is always attempted independently for visual reference.

    Returns dict:
      - markdown: str
      - html: str (raw HTML if available)
      - images: list[str]
      - links: list[str]
      - metadata: dict
      - screenshot_path: str or None
      - extraction_method: str
    r   Nnone)r   htmlr:   r;   r<   r~   extraction_methodr	   z%[Extractor] Starting extraction for:   r   jinar   z[Extractor] Jina succeeded (z chars)z1[Extractor] Jina insufficient, trying Crawl4AI...i,  r:   r;   r<   rE   z [Extractor] Crawl4AI succeeded (z5[Extractor] Crawl4AI insufficient, trying raw HTML...r   r   z<[^>]+> z\s+i@  raw_htmlz[Extractor] Raw HTML fallback (z.[Extractor] ALL extraction methods failed for r~   z [Extractor] Screenshot skipped: )r   r   r-   r    rf   r   r   resubstriperrorr   r   r(   r"   )
r	   r   rQ   jina_content
crawl_datar   r   r!   r~   r+   s
             r,   extract_contentr   #  s9   * #	F KK7u=>$S)LL)C/)z&,"#23|3D2EWMNGH )-
*..4Z
=S9TWZ9Z!+J!7F:)~~h;F8(nnWb9F7O!+
B!?F:*4F&'KK:3vj?Q;R:SSZ[\ KKOP#C(D!%vvvj#t4vvfc40668%)%4[z".8*+=c$i[PQMcUST 	C8jIO,/,@() M6M  	CNN=aSABBM	Cs   G- -	H6HH__main__z%(levelname)s %(message)s)levelformatr?   zhttps://example.comz8/mnt/e/genesis-system/scripts/clone_pipeline/output/testTr   z
Extraction result:z
  Method: r   z  Markdown length: r   z  Images found: r:   z  Screenshot: r~   z
First 500 chars of content:
r   )   )   )N),__doc__ossysr   timeloggingrX   hashlibpathlibr   typingr   urllib.parser   r   r   	getLoggerr   r   environr   r   r   SCREENSHOT_DIRr   intr-   r   rT   rf   r   r   r   r   __name__basicConfigINFOr    argvr	   outputr   rQ   print     r,   <module>r      sB  
 
 
        * 			5	6 !	zz~~nb1 	) KL/3 / /hsm /l:s :s :8D> :zS S $ L&C &d &xPT~ &RC T htn 8# (3- &L L(4. LD Lf zGgll3NOSXX*#((1+0ECLMF
LLL-S&)F	 "	Jv123
45	F:$6 78
9:	S!123
45	N6"345
67	+F:,>t,D+E
FG r   