
    ğiB                     (   d Z ddlZddlZddlZddlZddlmc mZ ddl	m	Z	m
Z
 ddlmZ ddlmZ ddlmZ ddlZddlmZmZ  ej(                  e      Z	 	 	 d(ded	ed
edeej2                     def
dZddiZdedee   fdZdedee   fdZ	 	 	 	 d)dee   deded	ededee   fdZ	 	 	 d*dededededee   f
dZ dee   dee   dee   dee   fdZ!	 d+d ed
ede"e   fd!Z# ejH                  d"      Z%d#edefd$Z&d%ee   d&e'eef   dee   fd'Z(y),at  
MODULE 1: URL Fetcher
======================
Fetches web pages, parses sitemaps, handles concurrency, deduplication,
robots.txt, and optional Browserless fallback.

Stories:
  1.01 - HTTP Fetcher (aiohttp)
  1.02 - Sitemap Parser (XML)
  1.03 - Concurrent Page Fetcher (Semaphore + polite delay)
  1.04 - Browserless Fallback Fetcher (Playwright stub)
  1.05 - URL Filter and robots.txt
  1.06 - Content Hash Deduplication
  1.07 - Integration Tests (see tests/kb/test_m1_fetcher_integration.py)

VERIFICATION_STAMP
Story: 1.01 - 1.07
Verified By: parallel-builder
Verified At: 2026-02-26T00:00:00Z
Tests: 15/15
Coverage: ~95%
    N)datetimetimezonefnmatch)Optional)urlparse)FetchedPagePlatformConfigurltimeout
user_agentsessionreturnc           	        K   t        j                  t        j                        j	                  d      }d|i}|du }	 |rBt        j                  d      }t        j                  ||t        j                  |            }	 |j                  |       4 d{   }|j                  }	|j                  j                  dd	      }
t        |j                        }	 |j                  d
       d{   }t        | ||	|
||      cddd      d{    |r||j!                          d{    S S S 7 7 F# t        $ r d	}Y Qw xY w7 87 # 1 d{  7  sw Y   nxY wn# t"        j$                  $ rI t&        j)                  d|        t        | d	dd	i |      cY |r||j!                          d{  7   S S S t
        j*                  $ rQ}t&        j)                  d| |       t        | d	dd	i |      cY d}~|r||j!                          d{  7   S S S d}~ww xY w	 |r||j!                          d{  7   yyy# |r||j!                          d{  7   w w w xY ww)u  Fetch a single URL and return FetchedPage.

    Handles HTTP errors (4xx, 5xx) gracefully — returns FetchedPage with
    error status and empty html.  Callers may pass an existing session for
    connection reuse; if none is supplied a temporary session is created.
    %Y-%m-%dT%H:%M:%SZz
User-AgentNF)ssltotal)	connectorheadersr   zContent-Type replaceerrorsr   htmlstatus_codecontent_typer   
fetched_atzTimeout fetching %si  zClient error fetching %s: %sr   )r   nowr   utcstrftimeaiohttpTCPConnectorClientSessionClientTimeoutgetstatusr   dicttext	Exceptionr	   closeasyncioTimeoutErrorloggerwarningClientError)r   r   r   r   r   r   _owns_sessionr   respr   r   resp_headersr   excs                 (/mnt/e/genesis-system/core/kb/fetcher.py
fetch_pager7   -   sp     hll+445IJJZ(GtOM0",,7I++#--G<G$	{{3'  4"kk#||//C#DLL1!%)!<<D # +!-()  J W0--/!! 1=K
 =  DL "M     ## 		NN0#6% & W0--/!! 1= "" 		NN93D%  W0--/!! 1=		5J W0--/!! 1==W0--/!! 1=s+  ;I*AI E$ D5E$ >ED90D71D95EE$ E
E$ I*-E.I*5E$ 7D99EEEE
E$ I*E EE E$ #I $:H"I I*6F97I*?H"'H8H"9I =I*HI*H""I &I*=I >I*I'I!I''I*smz+http://www.sitemaps.org/schemas/sitemap/0.9xml_contentc                    | r| j                         sg S 	 t        j                  |       }|j                  }d|v r|j                  d      d   n|}g }|dk(  r|j                         D ]r  }d|j                  v r|j                  j                  d      d   n|j                  }|dk(  sA|j                  xs dj                         }|sb|j                  |       t n|j                         D ]r  }d|j                  v r|j                  j                  d      d   n|j                  }	|	dk(  sA|j                  xs dj                         }|sb|j                  |       t t               }
g }|D ])  }||
vs|
j                  |       |j                  |       + |S # t        j                  $ r t        j                  d       g cY S w xY w)zExtract URLs from sitemap XML.

    Handles:
    - Standard <urlset> sitemaps (<url><loc>)
    - Sitemap index files (<sitemapindex><sitemap><loc>)
    - Malformed XML (returns empty list)
    u.   Malformed sitemap XML — returning empty list}sitemapindexlocr   )stripET
fromstring
ParseErrorr/   r0   tagsplititerr*   appendsetadd)r9   rootrC   localurls
sitemap_elsitemap_localr>   url_el	url_localseendedupedus                r6   _parse_sitemap_xmlrS   y   s    k//1	}}[) ((C"%*CIIcN2#ED))+ 	%J=@JNN=RJNN005b9XbXfXfM%!,"335KK$	% iik 	%F58FJJ5F

((-b1FJJIE!{{(b//1KK$	% UDG D=HHQKNN1 NE == GH	s   F *GGsitemap_urlc                   K   t        |        d{   }|j                  dk7  s|j                  s#t        j	                  d| |j                         g S t        |j                        }|sg S 	 t        j                  |j                        }d|j                  v r|j                  j                  d      d   n|j                  }|dk(  }|rVg }t               }|D ]C  }t        |       d{   }	|	D ])  }
|
|vs|j                  |
       |j                  |
       + E |S |S 7 # t        j                  $ r d}Y tw xY w7 Vw)zFetch and parse XML sitemap, return list of URLs.

    Handles sitemap index files by recursively fetching child sitemaps.
    Returns deduplicated list of page URLs (not sitemap URLs).
    N   z&Failed to fetch sitemap %s (status %d)r;   r<   r=   F)r7   r   r   r/   r0   rS   r@   rA   rC   rD   rB   rG   fetch_sitemaprH   rF   )rT   pageraw_urlsrI   rJ   is_indexall_page_urlsrP   	child_urlchild_pagesrR   s              r6   rW   rW      s0     K((D3dii?dN^N^_	!$)),H	
}}TYY'+.$((?s#B'^+ #%! 	,I -i 88K  ,D=HHQK!((+,	, O? )  ==  9sG   ED$AE.AD' 
!E+E,E:+E'D?<E>D??ErK   concurrencydelay_ms
batch_sizec                 t  K   g }|dz  t        |       }t        d||      D ]K  }| |||z    }t        j                  |      t	        j
                  d|dz         }	t	        j                  |	t	        j                              4 d{   d	t        d
t        ffd}
|D cg c]
  } |
|       }}t        t        j                  |  d{         }|j                  |       ddd      d{    |t              z   }t        d |D              }t        j                  d|dz   ||||       ddl}t#        d| d| d| dt        |       d	|j$                  d       N t        j                  dt        |      |       |S 7 c c}w 7 7 # 1 d{  7  sw Y   xY ww)a2  Fetch multiple URLs with bounded concurrency, polite delay, and batched progress.

    Uses asyncio.Semaphore to limit concurrent connections. Adds a polite
    delay between each request to avoid hammering servers.  Processes URLs
    in batches of *batch_size* to limit peak memory and log progress.
         @@r   F   )r   limitr   )r   r   Nr   r   c                    K   4 d {    t        |        d {   }dkD  rt        j                         d {    |cd d d       d {    S 7 O7 ;7 7 # 1 d {  7  sw Y   y xY ww)N)r   r   r   )r7   r-   sleep)r   rX   delay_s	semaphorer   r   s     r6   
_fetch_onezfetch_pages.<locals>._fetch_one   sn     $    !+C'!RRD{%mmG444	     R4       sf   A7AA7A"A!A"AA"A7A A7A"A" A7"A4(A+)A40A7c              3   @   K   | ]  }|j                   d k(  sd  yw)rV      N)r   ).0ps     r6   	<genexpr>zfetch_pages.<locals>.<genexpr>   s     BqQ]]c-ABs   z=fetch_pages: batch %d-%d done (%d/%d total, %d ok this batch)rk   z  [fetch-batch] /z
 fetched (z ok)T)fileflushz+fetch_pages: fetched %d URLs (%d requested))lenranger-   	Semaphorer#   r$   r%   r&   strr	   listgatherextendsumr/   infosysprintstderr)rK   r^   r_   r   r`   all_resultsr   batch_start
batch_urlsr   ri   r   tasksbatch_resultsdoneokr{   rg   rh   r   s      `             @@@r6   fetch_pagesr      s     &(KGIEQz2  
+j(@A
%%k2	((U+/J	(())8
 	. 	.  c  k     1;;Z_;E; w~~u'=!=>M}-	. 	.  S//BMBBK!OT4	

 	tfAeWJrd!C<N;OtT4	
; 
D KK=s;?OQVW=	. <!=	. 	. 	. 	.sg   B
F8FF8F#.F=F#FF#3F8>F!?BF8F#!F8#F5	)F,*F5	1F8browserless_urlbrowserless_tokenwait_msc           	        K   	 ddl m}  |       4 d{   }|r| d| n|}	 |j                  j                  |       d{   }|j                          d{   }		 t        j                  t        j                        j                  d      }
|	j                  | d       d{   }t        j                  |d	z         d{    |	j!                          d{   }|r|j"                  nd
}d}|r*|j%                          d{   }|j'                  dd      }t)        | |||i |
      |	j+                          d{    |j+                          d{    cddd      d{    S 7 b7 6# t        $ r2}t
        j                  d| |       Y d}~ddd      d{  7   yd}~ww xY w7 _7 7 7 7 7 ~7 h7 Y# |	j+                          d{  7   |j+                          d{  7   w xY w# 1 d{  7  sw Y   yxY w# t,        $ r t
        j/                  d       Y yt        $ r!}t
        j                  d| |       Y d}~yd}~ww xY ww)u   Fetch a URL using headless Chrome via Browserless CDP.

    Falls back gracefully if Playwright/Browserless is unavailable — returns None.
    Primary path is aiohttp fetcher; this is the fallback for SPAs.
    r   )async_playwrightNz?token=z%Browserless connect failed for %s: %sr   i0u  )r   rb   rV   r   zcontent-typer   u=   Playwright not installed — Browserless fallback unavailablez#Browserless fetch failed for %s: %s)playwright.async_apir   chromiumconnect_over_cdpr+   r/   r0   new_pager   r    r   r!   r"   gotor-   rf   contentr(   all_headersr'   r	   r,   ImportErrordebug)r   r   r   r   r   pwws_endpointbrowserr5   rX   r   r3   r   r   r   headers_dicts                   r6   fetch_page_browserlessr     s    '9#% 	& 	&L]_-W5F4GHcrK " < <[ II
 !))++D&%\\(,,7@@AUV
!YYsFY;;mmGf$4555!\\^+-1dkks!)-)9)9);#;L#/#3#3NB#GL" +!-) jjl""mmo%%;	& 	& 	& J FSQ	& 	& 	& , <5+ $< #%;	&8 jjl""mmo%%;	& 	& 	&>  TU <c3Gs  I%H E7H HE=E:E=	HF;H"AG*F>+G
GG"G#+GG&G5HG	H G	!H%H 1G2H 6I%7H :E==	F8F3H!H ,F/-H 2I%3F88H>GGGGH	HH H!G$
"H:G=
;HHH
HHH I%H I"7I%9I"II%I""I%include_patternsexclude_patternsc                     g }| D ]`  t              }|j                  xs d|rt        fd|D              }|s8t        fd|D              }|rP|j                         b |S )zFilter URLs by include/exclude glob patterns.

    Patterns are matched against the URL path (not the full URL).
    If include_patterns is empty, all URLs are included by default.
    Exclude patterns take precedence over include patterns.
    ro   c              3   R   K   | ]  }t        |      xs t        |         y wNr   rl   patpathr   s     r6   rn   zfilter_urls.<locals>.<genexpr>]  s/       S!7WT3%77   $'c              3   R   K   | ]  }t        |      xs t        |         y wr   r   r   s     r6   rn   zfilter_urls.<locals>.<genexpr>e  s/      
 C3s!33
r   )r   r   anyrF   )	rK   r   r   resultparsedincludedexcludedr   r   s	          @@r6   filter_urlsr   J  s     F #{{!c  + H   
'
 
 c-. M    base_urlc                   K   t        |       }|j                   d|j                   d}t        |       d{   }|j                  dk7  s|j
                  s
t               S t               }g }|j                         |j
                  j                         D ]  }|j                         }|r|j                  d      r'd|vr,|j                  d      \  }	}
}|	j                         j                         }	|j                         }|	dk(  r;|j                  d      D cg c]   }|j                         j                         " }}|	d	k(  st        fd
|D              }|s|s|j                  |        |S 7 Dc c}w w)zFetch and parse robots.txt, return set of disallowed paths.

    Handles:
    - User-agent specific rules (matched case-insensitively)
    - Wildcard user-agent (*)
    - Missing robots.txt (returns empty set)
    z://z/robots.txtNrV   #:z
user-agent,disallowc              3   \   K   | ]#  }|d k(  xs |k(  xs j                  |       % yw)*N)
startswith)rl   aua_lowers     r6   rn   z#check_robots_txt.<locals>.<genexpr>  s9       SCAMCX-@-@-CCs   ),)r   schemenetlocr7   r   r   rG   lower
splitlinesr?   r   	partitionrD   r   rH   )r   r   r   
robots_urlrX   
disallowedcurrent_agentsraw_lineline	directive_valuer   appliesr   s                 @r6   check_robots_txtr   p  s]     hFMM?#fmm_K@JJ''D3diiu5J "N!HII((* &~~ts+d?"nnS1	1eOO%++-	$9>S9IJAaggioo/JNJ*$ ' G 5u%+&. ? (* Ks4   6FE=C&F %F FF%F(F Fz\s+r   c                     t         j                  d|       j                         }t        j                  |j                  dd            j                         S )zSHA-256 hash of normalized HTML content.

    Normalization collapses all whitespace runs to a single space and
    strips leading/trailing whitespace, so minor formatting changes do
    not produce a different hash.
     zutf-8r   r   )_WHITESPACE_REsubr?   hashlibsha256encode	hexdigest)r   
normalizeds     r6   compute_content_hashr     sG      ##C.446J>>*++GI+FGQQSSr   pagesknown_hashesc                    K   g }| D ]i  }t        |j                        }|j                  |j                        }||k7  r|j	                  |       Jt
        j                  d|j                         k |S w)u   Filter out pages whose content hash matches known hashes.

    Returns only changed or new pages.

    Args:
        pages:        List of freshly fetched FetchedPage objects.
        known_hashes: Mapping of url → sha256 hash from the PG store.
    zSkipping unchanged page: %s)r   r   r'   r   rF   r/   r   )r   r   changedrX   current_hashprevious_hashs         r6   filter_unchangedr     sn      "$G B+DII6$((2=(NN4 LL6AB Ns   A3A5)   zGenesisBot/1.0N)   rV   r   2   )z/wss://browserless-genesis-u50607.vm.elestio.appr   i  )
GenesisBot))__doc__r-   r   loggingrexml.etree.ElementTreeetreeElementTreer@   r   r   r   typingr   urllib.parser   r#   core.kb.contractsr	   r
   	getLogger__name__r/   ru   intr%   r7   _SITEMAP_NSrv   rS   rW   r   r   r   rG   r   compiler   r   r)   r    r   r6   <module>r      s'  .    	 " " '   !  9			8	$ &/3	@"	@"@" @" g++,	@"
 @"P 	
7
.C .DI .b%S %T#Y %\ 4
s)44 4 	4
 4 
+4z M	2	22 2 	2
 k2r#
s)#3i# 3i# 
#Y	#P #--- 	X-h F#Ts Ts TsCx. 
+r   