
    /i+                      d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlmZmZmZ ddl
mZ ddlmZmZmZmZmZmZmZ ddlmZmZ ddlZddlZ	 ddlmZ  dd	lm!Z! d
Z"	 ddl$Z$d
Z%	 ddl&Z&d
Z'ddl(m)Z)m*Z* ddlm+Z+ e+rddl,m-Z- dZ. ed      Z/ ej`                  dejb                        Z2 ej`                  dejb                        Z3 ej`                  dejb                  ejh                  z        Z5 ej`                  dejb                        Z6ddZ7	 d 	 	 	 	 	 	 	 	 	 d!dZ8d"dZ9	 	 	 	 	 	 	 	 	 	 d#dZ:d$dZ;d%dZ< G d d      Z=y# e#$ r dZ"Y w xY w# e#$ r dZ%Y w xY w# e#$ r dZ'Y w xY w)&u  
async_url_seeder.py
Fast async URL discovery for Crawl4AI

Features
--------
* Common-Crawl streaming via httpx.AsyncClient (HTTP/2, keep-alive)
* robots.txt → sitemap chain (.gz + nested indexes) via async httpx
* Per-domain CDX result cache on disk (~/.crawl4ai/<index>_<domain>_<hash>.jsonl)
* Optional HEAD-only liveness check
* Optional partial <head> download + meta parsing
* Global hits-per-second rate-limit via asyncio.Semaphore
* Concurrency in the thousands — fine on a single event-loop
    )annotationsN)datetime	timedeltatimezone)Path)AnyDictIterableListOptionalSequenceUnion)quoteurljoin)html)etreeTF   )AsyncLoggerBaseAsyncLogger)TYPE_CHECKINGSeedingConfigz+https://index.commoncrawl.org/collinfo.json   )dayszu<meta\s+(?:[^>]*?(?:name|property|http-equiv)\s*=\s*["\']?([^"\' >]+)[^>]*?content\s*=\s*["\']?([^"\' >]+)[^>]*?)\/?>z&<meta\s+[^>]*charset=["\']?([^"\' >]+)z<title>(.*?)</title>z=<link\s+[^>]*rel=["\']?([^"\' >]+)[^>]*href=["\']?([^"\' >]+)c                    	 t         r3t        j                  |       }|j                  d      }|rt	        |      S y# t
        $ r Y yw xY w)z1Extract the most recent lastmod from sitemap XML.z"//*[local-name()='lastmod']/text()N)LXMLr   
fromstringxpathmax	Exception)xml_contentrootlastmodss      U/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/crawl4ai/async_url_seeder.py_parse_sitemap_lastmodr%   Q   sR    	##K0Dzz"FGH8}$   s   8< 	AAc                J   | j                         sy	 t        | d      5 }t        j                  |      }ddd       j	                  d      dk7  ry|dkD  ret        j                  |d   j                  dd	            }t        j                  t        j                        |z
  j                         d
z  }||kD  ry|r|r|j	                  d      }|r||kD  ry|j	                  dd      dk(  ryy# 1 sw Y   xY w# t        j                  t        t        t        f$ r Y yw xY w)z
    Check if sitemap cache is still valid.

    Returns False (invalid) if:
    - File doesn't exist
    - File is corrupted/unreadable
    - TTL expired (if ttl_hours > 0)
    - Sitemap lastmod is newer than cache (if validate_lastmod=True)
    FrNversionr   r   
created_atZz+00:00i  sitemap_lastmod	url_countT)existsopenjsonloadgetr   fromisoformatreplacenowr   utctotal_secondsJSONDecodeErrorKeyError
ValueErrorIOError)	
cache_path	ttl_hoursvalidate_lastmodcurrent_lastmodfdatar)   	age_hourscached_lastmods	            r$   _is_cache_validrC   `   s    *c" 	 a99Q<D	  88I!# q=!//\0B0J0J3PX0YZJ!hll3j@OOQTXXI9$ !XX&78N/N"B 88K#q(1	  	 4   (J@ s:   C< C0C< A)C< <C< C< 0C95C< <#D"!D"c                    	 t        | d      5 }t        j                  |      }ddd       j                  dg       S # 1 sw Y   xY w# t        $ r g cY S w xY w)z7Read URLs from cache file. Returns empty list on error.r'   Nurls)r.   r/   r0   r1   r    )r;   r?   r@   s      r$   _read_cacherF      s[    *c" 	 a99Q<D	 xx##	  	   	s%   A
 >A
 AA
 
AAc                   dt        j                  t        j                        j	                         ||t        |      |d}	 t        | d      5 }t        j                  ||       ddd       y# 1 sw Y   yxY w# t        $ r Y yw xY w)z"Write URLs to cache with metadata.r   )r(   r)   r+   sitemap_urlr,   rE   wN)
r   r4   r   r5   	isoformatlenr.   r/   dumpr    )r;   rE   rH   r+   r@   r?   s         r$   _write_cacherM      s|     ll8<<0::<*"YD*c" 	aIIdA	 	 	 s0   A; A/&A; /A84A; 8A; ;	BBc                    t        j                   | |      ry| j                  dd      d   }t        j                   ||      xs, |j                  d      xr t        j                   |dd  |      S )NT://r   www.   )fnmatchsplit
startswith)urlpatterncanons      r$   _matchrY      sf    sG$IIeQ#EOOE7+ R  (PW__U12Y-PS    c                b	   t         r	 t        | t              r| j                  dd      } t	        j
                  |       }|j                  d      -|j                  d      j                  xs dj                         nd d i i g d}|j                  d      D ]  }|j                  j                  d      xs8 |j                  j                  d      xs |j                  j                  d	      }|r1|j                  j                  d
d      |d   |j                         <   d|j                  v s|j                  d   j                         |d<    |j                  d      D ]  }|j                  j                  dd      }|s"|j                         j!                         }dD ci c]   }||j                  v r||j                  |   " }}|D ]&  }	|d   j#                  |	g       j%                  |       (  |j                  d      D ]Q  }
|
j                  s	 t'        j(                  |
j                  j                               }|d   j%                  |       S |j                  d      }||j                  j                  dd      |d<   |S d d i i g dd}t,        j/                  |       }|r|j1                  d      j                         nd |d<   t2        j5                  |       D ]  \  }}||d   |j                         <    t6        j/                  |       }|r|j1                  d      j                         nd |d<   t8        j5                  |       D ]9  \  }	}|d   j#                  |	j                         g       j%                  d|i       ; t;        j<                  dt:        j>                  t:        j@                  z        }|j5                  |       D ]:  }	 t'        j(                  |j                               }|d   j%                  |       < t;        j.                  d| t:        j>                        }|r|j1                  d      |d<   |S # t        t        j                  f$ r i cY S w xY wc c}w # t&        j*                  $ r Y w xY w# t&        j*                  $ r Y w xY w)Nutf-8r3   z.//title )titlecharsetmetalinkjsonldz.//metanamepropertyz
http-equivcontentr`   r_   z.//linkrel)hrefastypehreflangra   z&.//script[@type="application/ld+json"]rb   z.//htmllang)r^   r_   r`   ra   rb   rk   r   r^   rg   zC<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>z <html[^>]*lang=["\']?([^"\' >]+))!r   
isinstancestrencode	lxml_htmlr   r9   r   ParserErrorfindtextstripr   attribr1   lowerrT   
setdefaultappendr/   loadsr7   	_title_rxsearchgroup_meta_rxfindall_charset_rx_link_rxrecompileIS)srcdocinfoelkrel_attr
rel_valuesaentryrf   scriptjsonld_data	html_elemmvrg   jsonld_patternmatch
lang_matchs                      r$   _parse_headr      sI   	#s#jj)4&&s+C
 xx
#/ hhz*//52<<>59b	 
 ))I& 	?B		f% ;* ;!yy}}\: *,))--	2*FVQWWY'bii'"$))I"6"<"<">Y	? ))I& 
	?Byy}}UB/H!)//1J/2 E56"))^ 		!_ EE E " ?V''R077>?
	? ii HI 	F{{"&**V[[->->-@"AKN))+6		 HHY'	 $++//;DL%)d$&b"NDA*+AGGAJ$$&DM  % $1"#VQWWY$3A,-aggaj&&(4DO%%c* H	TV		R077GH ZZNPRPTPTWYW[W[P[]N'', 	**U[[]3KN!!+. >RTTJJ!''*VK E--. 	I	*E ++ 2 ## 		s<   7Q (%Q9AQ>"7RQ65Q6>RRR.-R.c                  b   e Zd ZdZeddddf	 	 	 	 	 	 	 	 	 ddZdddZddZd dZd!dZ		 	 	 	 	 	 d"dZ
	 	 	 	 	 	 d#d	Z	 	 	 d$	 	 	 	 	 	 	 	 	 d%d
Zd&dZd'dZd(dZd)d(dZd*dZd+dZ	 	 	 d,	 	 	 	 	 	 	 	 	 	 	 	 	 d-dZd.dZ	 	 	 d/	 	 	 	 	 	 	 	 	 d0dZd1dZd2dZd3dZd4dZd Zd Zd Zd5dZy)6AsyncUrlSeedera  
    Async version of UrlSeeder.
    Call pattern is await/async for / async with.

    Public coroutines
    -----------------
    await seed.urls(...)
        returns List[Dict[str,Any]]  (url, status, head_data)
    await seed.many_urls(...)
        returns Dict[str, List[Dict[str,Any]]]
    await seed.close()
        closes the HTTP client if owned by seeder
    
    Usage examples
    --------------
    # Manual cleanup:
    seeder = AsyncUrlSeeder()
    try:
        urls = await seeder.urls("example.com", config)
    finally:
        await seeder.close()
    
    # Using async context manager (recommended):
    async with AsyncUrlSeeder() as seeder:
        urls = await seeder.urls("example.com", config)
    
    # Reusing existing client:
    client = httpx.AsyncClient()
    seeder = AsyncUrlSeeder(client=client)
    urls = await seeder.urls("example.com", config)
    # No need to close seeder, as it doesn't own the client
    Nc                   || _         |d u | _        |xs t        j                  ddddi      | _        || _        t        j                  |xs( t        j                  dt        j                                     | _        | j                  dz  dz  | _        | j                  j                  dd	       | j                  d
z  | _        d | _        d | _        t        t        j"                  j%                  |xs d            | _        | j&                  dz  j                  dd	       | j&                  dz  j                  d       y )NT   z
User-AgentzpMozilla/5.0 (Windows NT 10.0; Win64; x64) +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36)http2timeoutheadersCRAWL4_AI_BASE_DIRECTORYz	.crawl4aiseeder_cache)parentsexist_okzlatest_cc_index.txtz~/.cache/url_seederlivehead)r   )ttl_owns_clienthttpxAsyncClientclientloggerpathlibr   osgetenvhomebase_directory	cache_dirmkdirindex_cache_pathindex_id	_rate_sempath
expanduser
cache_root)selfr   r   r   r   r   s         r$   __init__zAsyncUrlSeeder.__init__'  s-    "dN  1 1b  MS
 ! %ll> ,5RYY&		>5 6,,{:TD9 $!!" (,6: rww11// 1 2	6	!(((E	6	!(($(7rZ   c                    | j                   r6t        | j                   |d      }|r ||||j                  di              yyy)z?Helper to log messages using the provided logger, if available.Nparams)messagetagr   )r   getattrr1   )r   levelr   r   kwargs
log_methods         r$   _logzAsyncUrlSeeder._logH  sB    ;; eT:J7"(**Xr":<  rZ   c                    t        j                  |j                               j                         }| j                  |z  | dz  S )N.json)hashlibsha1rn   	hexdigestr   )r   kindrV   hs       r$   _cache_pathzAsyncUrlSeeder._cache_pathS  s9    LL&002%1#U33rZ   c                  K   | j                  ||      }|j                         sy t        j                         |j                         j                  z
  | j
                  j                         kD  ry 	 t        j                  |d      4 d {   }t        j                  |j                          d {         cd d d       d {    S 7 @7 7 	# 1 d {  7  sw Y   y xY w# t        $ r Y y w xY ww)Nr'   )r   r-   timestatst_mtimer   r6   aiofilesr.   r/   rx   readr    )r   r   rV   pr?   s        r$   
_cache_getzAsyncUrlSeeder._cache_getW  s     T3'xxz99;qvvx(((488+A+A+CC	}}Q, 2 2zz.12 2 2"02 2 2 2 		s   A/C72C( CC( #C3C
4C;C( CC( C7C( CC( C%CC%!C( $C7%C( (	C41C73C44C7c                <  K   	 t        j                  | j                  ||      d      4 d {   }|j                  t	        j
                  |d             d {    d d d       d {    y 7 C7 7 	# 1 d {  7  sw Y   y xY w# t        $ r Y y w xY ww)NrI   ),:)
separators)r   r.   r   writer/   dumpsr    )r   r   rV   r@   r?   s        r$   
_cache_setzAsyncUrlSeeder._cache_setc  s     	}}T%5%5dC%@#F G G!ggdjj*EFFFG G GFG G G G 		s   B*B A2B *A8A4A8!B ,A6-B 1B2B 4A86B 8B
>B?B
B 	B
B 	BBBBc                    !K   |j                   xs d|j                  }|j                  |j                  |j                  }d|j
                  }|j                   _        |j                  |j                  |j                  n# j                  r j                  j                  nd!|j                  |j                  nd|j                  |j                  |j                  t        |dd       _        t        |dd	       _         j                  r=t!         j                  d
      r'|j                  |j                   j                  _         j"                   j%                          d{    _        |j'                  d      ddh}D ]&  }||vst)        d| ddj+                  |              |r<|dk  r j-                  ddd       d _        n"t1        j2                  |       _        nd _         j-                  dd|dd        fdt5        dt7        d|dz              }t1        j8                  |      t1        j:                         t1        j:                          t=               |j>                    fd}	d,  !fd }
g }t1        j@                   |	             }tC        |      D cg c]  }t1        j@                   |
|             }}t1        jD                  |g|  d{    j+                          d{     j-                  dd!tG        |      d"d       rrd#k(  r jI                  ||       d{   }btG        |      }|D cg c]  }|jK                  d$d      k\  s| }}|tG        |      kD  r$ j-                  dd%|tG        |      z
  d&d       |jM                  d' d	(        j-                  dd)tG        |      d*d       nrs j-                  dd+d       dkD  r|d S |S 7 c c}w 7 '7 7 c c}w w)-a.  
        Fetch URLs for a domain using configuration from SeedingConfig.

        Parameters
        ----------
        domain : str
            The domain to fetch URLs for (e.g., "example.com")
        config : SeedingConfig
            Configuration object containing all seeding parameters
        *   NFrP   cache_ttl_hours   validate_sitemap_lastmodTverbose+ccsitemapzInvalid source 'z'. Valid sources are: z, r   warning7hits_per_sec must be positive. Disabling rate limiting.URL_SEEDr   r   z6Starting URL seeding for {domain} with source={source})domainsourcer   r   c                  K   dv r6j                  ddd       j                        2 3 d {   } |  dv r7j                  ddd       j                        2 3 d {   } |  y y 7 F6 @7 6 
w)Nr   debugzFetching from sitemaps...r   r   r   zFetching from Common Crawl...)r   _from_sitemaps_from_cc)ur   forcerW   r   sourcess    r$   genz AsyncUrlSeeder.urls.<locals>.gen  s     G#		'#>J	O#2267EJ  !Gw		'#B(  *#}}VWeD  !G	 J
DsI   ,BA<A:A<2B)B -A>.B 1	B:A<<B>B  B'    d   maxsizec            	       K   	         2 3 d {   } | v rj                  ddd| id       %j                         rj                  ddd        n+j                  |        j                  |        d {    vj                          j                  ddd       y 7 7 ,6 +# t        $ r*}j                  d	d
d	t        |      id       Y d }~Yd }~ww xY w# j                          j                  ddd       w xY ww)Nr   Skipping duplicate URL: {url}rV   r   r   r   z(Producer stopping due to max_urls limit.r   errorz&Producer encountered an error: {error}zProducer finished.)r   is_setaddputr    rm   set)r   er   producer_donequeueseenr   
stop_events     r$   producerz%AsyncUrlSeeder.urls.<locals>.producer  s     I"u 
' 
'!Dy		'+J*/ " E !((*		"$NT^ " `HHQK))A,&&
 !!#		'#7Z	H!
' '  %  <		'#K!3q6U+0:  < << !!#		'#7Z	Hsm   DB- B+B'B+A(B- <B)=B- %D'B+)B- +B- -	C 6 CC# C  C# #&D		Dc                  K   	 
j                         rj                         ry 	 t        j                  
j	                         d       d {   }dkD  rt        |       k\  rwj                  ddd	       j                          
j                          
j                         s2	 
j                          
j                          
j                         s2y j                  rHj                  4 d {    j                  || 	
       d {    d d d       d {    n"j                  || 	
       d {    
j                          `7 # t        j
                  $ r Y zt        $ r+}j                  dddt        |      id       Y d }~d }~ww xY w# t        j                  $ r Y y w xY w7 7 7 # 1 d {  7  sw Y   xY w7 w)
Nr   r   z,Worker failed to get URL from queue: {error}r   r   r   r   z&Worker stopping due to max_urls limit.r   )emptyr   asynciowait_forr1   TimeoutErrorr    r   rm   rK   r   	task_done
get_nowait
QueueEmptyr   	_validate)res_listrV   r   extract_headfilter_nonsensehead_timeout
live_checkmax_urlsr   queryr   score_thresholdscoring_methodr   r  r   s      r$   workerz#AsyncUrlSeeder.urls.<locals>.worker  s    ;;=]%9%9%; ' 0 0a @@C a<CMX$=II@&  
 NN$ OO% $kkm"!,,.!OO- $kkm >>#~~ > >"nnS(J-97E?\j-<> > >> > >
 ..h
L)5wXf)8: : : !Y  A++   IIg'U%s1v_/4>  @,  '11 "!"
>>> > > >
:s   #G*'E( E%E( AG*+ F4 G*G*9G:G*=GGG G*+G,"G*G(G*%E( (F1;G*>F1 F,&G*,F11G*4G
G*	G

G*GG*G%GG%!G*z6Finished URL seeding for {domain}. Total URLs: {count})r   countbm25relevance_scorez:Filtered {filtered} URLs below score threshold {threshold})filtered	thresholdc                &    | j                  dd      S )Nr          r1   xs    r$   <lambda>z%AsyncUrlSeeder.urls.<locals>.<lambda>  s    quu->'D rZ   keyreversez;Sorted {count} URLs by relevance score for query: '{query}')r  r  zTQuery provided but extract_head is False. Enable extract_head for relevance scoring.r  List[Dict[str, Any]])'rW   r   r  r  concurrencyhits_per_secr   r   r   r  r  r  r  r   _cache_ttl_hours_validate_sitemap_lastmodhasattrr   _latest_indexrT   r9   joinr   r   r  	Semaphoreminr   QueueEventr   filter_nonsense_urlscreate_taskrangegatherrK   _apply_bm25_scoringr1   sort)"r   r   configr   r&  r'  valid_sourcess
queue_sizer  r  results	prod_task_workersoriginal_countr'   r  r  r   r   r  r  r  rW   r   r  r   r  r  r   r   r  r   s"   ``               @@@@@@@@@@@@@@@@@r$   rE   zAsyncUrlSeeder.urlsl  s7     ..'C&&
**((**\\
$*NN$>&..#';;DKKE 	&,oo&A6??r 00.. !(0A2 F)09SUY)Z& ;;74;;	:v~~?Y"(..DKK == "&"4"4"66DM ,,s#y) 	\A% &qc)?		-@X?YZ\ \	\
 q 		X^h  j!%!(!2!2<!@!DN		&R$*f=: 	 	O		 		 D+*; <=
j1]]_
 55	I 	I(-	" -	" -	"` )+''
3	!+.0 &&vg7 0 0 nnY1111jjl		&R$*S\B
 	 	T \n&> 44WfEEG *!$W&-`7H!1LP_1_1``!CL0IIf&b2@3w<2O^m%nt~  @ LLDdLSIIf['*7|eD*  V<IIq  xB  C &.\wy!>w>a 7b0 	2 F
 asu   E2QQ'Q0D9Q)"QQ&Q'Q>Q?AQQQQ3Q7BQQQQQc                  K   | j                  dddt        |      id       | j                  r=t        | j                  d      r'|j                  |j                  | j                  _        |D cg c]  }| j                  ||       }}t        j                  |  d{   }t        t        ||            }| j                  ddd	       |S c c}w 7 4w)
a>  
        Fetch URLs for many domains in parallel.

        Parameters
        ----------
        domains : Sequence[str]
            List of domains to fetch URLs for
        config : SeedingConfig
            Configuration object containing all seeding parameters

        Returns a {domain: urls-list} dict.
        r   z+Starting URL seeding for {count} domains...r  r   r   r   Nz*Finished URL seeding for multiple domains.r   )
r   rK   r   r*  r   rE   r  r4  dictzip)r   domainsr7  r   tasksr;  final_resultss          r$   	many_urlszAsyncUrlSeeder.many_urls'  s     " 			&G!3w<0j 	 	B ;;74;;	:v~~?Y"(..DKK "
 IIff%
 
  ..S'23		@j 	 	R
 /s   A.C0C	C C!5Cc                   K   ddl m}  |d|d      |_        d_         j	                  ddd	t              id
       j                  rPj                  dk  r j	                  ddd
       d _        n,t        j                  j                         _        nd _        t        dt        d|dz              }t        j                  |      t        j                         t        j                         t               g } fd}d  fd}	t        j                   |             }
g }t!        |      D ].  }t        j                   |	|            }|j#                  |       0 |
 d{    j%                          d{    |D ]  }|j'                           t        j(                  |ddi d{    j*                  r)j,                  dk(  r j/                  |       d{   }j0                  /|D cg c]$  }|j3                  dd      j0                  k\  s#|& }}t5        d |D              r|j7                  d d        j	                  ddt              t        |D cg c]  }|j3                  d      dk(  s| c}      dd
       |S 7 67 !7 7 c c}w c c}w w)!au  
        Extract head content for a custom list of URLs using URLSeeder's parallel processing.
        
        This method reuses URLSeeder's efficient parallel processing, caching, and head extraction
        logic to process a custom list of URLs rather than discovering URLs from sources.
        
        Parameters
        ----------
        urls : List[str]
            List of URLs to extract head content from
        config : SeedingConfig, optional
            Configuration object. If None, uses default settings for head extraction
        concurrency : int, default=10
            Number of concurrent requests
        timeout : int, default=5
            Timeout for each request in seconds
            
        Returns
        -------
        List[Dict[str, Any]]
            List of dictionaries containing url, status, head_data, and optional relevance_score
        Nr   r   TF)r  r&  r   r   z0Starting head extraction for {count} custom URLsr  r   r   r   r   r   r   r   r   r   r   c                   K   	 D ]Z  } | v rj                  ddd| id       j                         r n,j                  |        j                  |        d{    \ j	                          y7 # j	                          w xY ww)z%Producer to feed URLs into the queue.r   r   rV   r   r   N)r   r   r   r   r   )rV   r   r   r   r   r  rE   s    r$   r  z6AsyncUrlSeeder.extract_head_for_urls.<locals>.producer  s     $ )Cd{		'+J*/: " G !((*HHSM))C.(() !!# )!!#s/   BAA7 A5A7 $B5A7 7B		Bc                  K   	 	 t        j                  j                         d       d{   }	 j                  || ddj                  xs dj                  j                  j                  xs dj                  
       d{    j!                          7 s# t         j                  $ r% j	                         rj                         rY yY w xY w7 R# t        $ rJ}j                  dd	|t        |      d
d       | j                  |di t        |      d       Y d}~d}~ww xY w# j!                          w xY ww)z&Worker to process URLs from the queue.T      ?r   NFr  )r   extractr   r   r  r  r  r  r   z$Failed to process URL {url}: {error}rV   r   r   r   failed)rV   status	head_datar   )r  r  r1   r  r   r  r  r   r  r  r  r1  r    r   rm   rw   r  )r  rV   r   r7  r   r   r   r   s      r$   r  z4AsyncUrlSeeder.extract_head_for_urls.<locals>.worker  sB     ' 0 0c JJC&..X" $ ' & 7%$ll(.(>(>'-'<'<'F(.(C(C ) 
 
 
, OO%C  K++ $++-%++-
 ! 	IIg'M-03q6%B
  T OO""*%'!$Q	%  		 OO%s{   E(B$ B"B$ AC! CC! E"B$ $3CECEC! !	D4*A D/*D7 /D44D7 7E		Ereturn_exceptionsr  r  c              3  $   K   | ]  }d |v  
 yw)r  N ).0r'   s     r$   	<genexpr>z7AsyncUrlSeeder.extract_head_for_urls.<locals>.<genexpr>  s     7! A%7s   c                &    | j                  dd      S )Nr  r   r  r  s    r$   r   z6AsyncUrlSeeder.extract_head_for_urls.<locals>.<lambda>  s    quu->'B rZ   r!  z@Completed head extraction for {count} URLs, {success} successfulrO  valid)r  successr$  )async_configsr   r&  r  r   rK   r'  r   r  r-  r.  r   r/  r0  r   r2  r3  rw   r,  cancelr4  r  r  r5  r  r1   anyr6  )r   rE   r7  r&  r   r   r:  r;  r  r  producer_taskworker_tasksr=  worker_tasktaskr'   r   r   r   r  s   ``` `           @@@@r$   extract_head_for_urlsz$AsyncUrlSeeder.extract_head_for_urlsJ  s    < >4"!'F )"		&L!3t9-: 	 	? ""a'		)%^dn	o!%!(!2!263F3F!G!DN D+*; <=
j1]]_
 )+	$ 	$#	& #	&L  ++HJ7 {# 	-A!--fWo>K,	-
  jjl ! 	DKKM	 nnlCdCCC <<F11V; 44WfEEG !!-")cQQUU3Da-HFLbLb-bqcGc 7w77LLBDLQ		&\"4y!$%WAAEE(Ow<Va%W!X $	 	 	% = 	 	 	D F d &Xss   E9KJ9KJ<3KJ?4KKK$K>KAK
K$K(K<K?KK
Kc                  K   t         s| j                  ddd       |S g }g }|D ]  }|j                  d      dk(  rx|j                  d      rg| j                  |d         }|r#|j	                  |       |j	                  |       a| j                  |j                  |d         }t        |      |d	<   |j                  d      dk(  s| j                  |j                  |d         }t        |      |d	<    |rh|rft        j                  | j                  |j                  |       d
{   }t        |      D ]%  \  }	}|	t        |      k  st        ||	         |d	<   ' |S 7 9w)z2Apply BM25 scoring to results that have head_data.r   z2BM25 scoring requested but rank_bm25 not availabler   r   rO  rW  rP  rV   r  N)HAS_BM25r   r1   _extract_text_contextrw   _calculate_url_relevance_scorer  floatr  	to_thread_calculate_bm25_score	enumeraterK   )
r   r;  r7  text_contextsvalid_resultsresulttext_contextscorescoresis
             r$   r5  z"AsyncUrlSeeder._apply_bm25_scoring  sc    IIi!U[eIfN  	9Fzz(#w.6::k3J#99&:MN!((6!((0 !??fUZm\E05eF,-H%0;;FLL&QV-X,1%L()	9  ]",,T-G-GWdeeF&}5 A	6s6{?05fQi0@F,-A  fs   CE&	A!E&*E$+#E&E&c           	       K   	 | j                   j                  |dd       d{   }d|j                  cxk  rdk  rn nt        |j                        S |j                  dv r)|j
                  j                  d      }|rt        ||      S y7 i# t        $ r+}| j                  d	d
|t        |      dd       Y d}~yd}~ww xY ww)z
        HEAD-probe a URL.

        Returns:
            * the same URL if it answers 2xx,
            * the absolute redirect target if it answers 3xx,
            * None on any other status or network error.
        
   Fr   follow_redirectsN   ,  i-  i.  i/  i3  i4  locationr   zHEAD {url} failed: {err})rV   errr   r   )
r   r   status_coderm   rV   r   r1   r   r    r   )r   rV   r'   locr   s        r$   _resolve_headzAsyncUrlSeeder._resolve_head  s     	kk&&sB&OOA amm)c)155z! }} 99iimmJ/"3,, P  	IIg9%(Q8j  J	sJ   C!B B0B C6B CB 	C!C ;C CCc           
    F  K   dd l }t        j                  |j                               j	                         d d } |j
                  dd|      j                  dd      d   j                  dd      d   j                  d      } |j
                  d	d
|      }| j                  | j                   d
| d
| dz  }|j                         rg|se| j                  dd||dd       t        j                  |d      4 d {   }	|	2 3 d {   }
|
j                         }t        ||      s&| ,d|v rd| dnd| d}d| j                   dt!        |d       d}d}| j                  dd||dd       t#        |dz         D ]  \  }}	 | j$                  j'                  d|      4 d {   }|j)                          t        j                  |d      4 d {   }	|j+                         2 3 d {   }
t-        j.                  |
      }|d    }|	j1                  |d!z          d {    t        ||      sL| R y 7 H7 @6 d d d       d {  7   y # 1 d {  7  sw Y   y xY w7 7 7 7 M6 d d d       d {  7   n# 1 d {  7  sw Y   nxY wd d d       d {  7    y # 1 d {  7  sw Y    y xY w# t2        j4                  $ r}|j6                  j8                  d"k(  rP|t;        |      k  rB| j                  d#d$|||   d%d       t=        j>                  ||          d {  7   Y d }~| j                  d&d'|tA        |      d(d        d }~wtB        $ r'}| j                  d&d)|tA        |      d(d        d }~ww xY ww)*Nr      
^https?://r]   #r   ?.[/?#]+r=  .jsonlr   z/Loading CC URLs for {domain} from cache: {path})r   r   r   r   r'   /z*.r   z/*zhttps://index.commoncrawl.org/z-index?url=)safez&output=json)r      r   z<Fetching CC URLs for {domain} from Common Crawl index: {url})r   rV   )rP   GETrI   rV   
i  r   zACommon Crawl API returned 503 for {domain}. Retrying in {delay}s.)r   delayr   z2HTTP error fetching CC index for {domain}: {error})r   r   z-Error fetching CC index for {domain}: {error})"r   r   md5rn   r   subrT   lstripr   r   r-   r   r   r.   rs   rY   r   rh  r   streamraise_for_statusaiter_linesr/   rx   r   r   HTTPStatusErrorresponsery  rK   r  sleeprm   r    )r   r   rW   r   r   digestrawr  r   fplinerV   globretriesro  dr'   recr   r   s                       r$   r   zAsyncUrlSeeder._from_cc*  s    W^^-.88:2A> bff]B/55c6799:<<AE#qM!MMSVTW[ 	 rvvhS)~~4==/4&& HH;;=IIfO(.=:  O}}T3/ " "2"$ " "$**,Cc7+!	 "SjC5{3%rl.t}}o[tZ]I^H__kl		&X$*37Z 	 	Igem, 	DAq;;--eS9 ( (Q&&('}}T37 ( (2*+--/ ( ($"&**T"2C #E
A"$((1T6"222%a1&'	"""" " "
 " " "
 ((( 3 +:( ( ( ( (( ( ( ( ( ( (( ::))S0QW5EIIi)l06%LR\  ^!--
333		'#W,2SV!D*  V 		'#R,2SV!D*  Vs  C<N!>I?N!II	I	
II+I1AN! K.I4/K2+J7I6
J7!J2I<6I87I<:2J,I:-J>JN!	IIN!IN!I1%I(&I1-N!4K6J78I<:J<J=J7J	J7J!JJ!J7$K/J20K5N!7K
	=K >K
	KN!
KN AM+=M >M+N!	"M++N7"NNN!c           	    	  K   t        | dd      }t        | dd      }t        j                  dd|      j                  d      }t        j                  dd	|      }t	        j
                  |j                               j                         d
d }| j                  d| d	| dz  }	| j                  d| d	| dz  }
|
j                         r1	 |
j                          | j                  dddt        |
      id       d
}d
}d
}d}|D ]  }dD ]  }| d| | }| j                  |       d
{   }|s(|}	 | j                  j!                  |dd       d
{   }d|j"                  cxk  rdk  rn n|j$                  }t'        |      } n |s n |sz|	j                         rjt)        |	|||      rE| j                  dddt        |	      id       t+        |	      }|D ]  }t-        ||      s|  y
| j                  ddd|id       g }|rX|rV| j                  ddd|id       | j/                  ||      2 3 d
{   }|j1                  |       t-        ||      s'| -|rU| j                  ddd|id       | j3                  |      2 3 d
{   }|j1                  |       t-        ||      s'| -d | d!}	 | j                  j!                  |d"d       d
{   }d|j"                  cxk  rdk  rn n|j4                  j7                         D cg c]D  }|j9                         j;                  d#      r#|j=                  d$d%      d%   j?                         F }}|D ]?  }| j3                  |      2 3 d
{   }|j1                  |       t-        ||      s'| - n#| j                  d&d'||j"                  d(d       y
	 |r4tA        |	||xs d|       | j                  dd+tC        |      |d,d       y
y
# t        $ r Y w xY w7 7 # t        $ r Y w xY w7 6 b7 6 g7 Tc c}w 7 6 # t        $ r+}| j                  d&d)|t        |      d*d       Y d
}~y
d
}~ww xY ww)-a,  
        Discover URLs from sitemaps with smart TTL-based caching.

        1. Check cache validity (TTL + lastmod)
        2. If valid, yield from cache
        3. If invalid or force=True, fetch fresh and update cache
        4. FALLBACK: If anything fails, bypass cache and fetch directly
        r(  r   r)  Tr~  r]   r  r  r=  Nr}  sitemap_r   r  r   zDeleted old cache format: {p}r   r   r   )httpshttp)/sitemap.xml/sitemap_index.xmlrO      rr  rt  ru  z*Loading sitemap URLs from valid cache: {p}z1Cache invalid/expired, refetching sitemap for {d}r  zFound sitemap at {url}rV   zhttps:///robots.txtrq  zsitemap:r   r   r   z&robots.txt unavailable for {d} HTTP{c})r  cz'Failed to fetch robots.txt for {d}: {e})r  r   zCached {count} URLs for {d})r  r  )"r   r   r  rstripr   r  rn   r   r   r-   unlinkr   rm   r    r{  r   r1   ry  re   r%   rC   rF   rY   _iter_sitemap_contentrw   _iter_sitemaprr   
splitlinesru   rU   rT   rs   rM   rK   )r   r   rW   r   r   r=   host	host_safer  r;   old_cache_pathrH   r+   sitemap_contentschemesschemesuffixsmresolvedr'   cached_urlsrV   discovered_urlsr   robotslsitemap_linesr   s                               r$   r   zAsyncUrlSeeder._from_sitemapsb  s     "$(:B?"4)DdK vvmR077<FF8S$/	W^^-.88:2A>^^1VHE&JJ
 HYKq*OO  "%%'		&"A"%s>':!;  M # 	F@ xs4&1!%!3!3B!77"*K"&++//+r\`/"aa!--5#5./iiO.D_.UO  !	& **,z?<Lo^		&"N"%s:!7Z  I)*5& "Cc7+!	" 		&"U"%vJ  @ ?IIf6{?SYcId  55k?S  a&&q)!W%GIIf6{?SYcId--k:  a&&q)!W%G  v[1F++//&"t/TT!---#-./ff.?.?.A%J)*()	(<(<Z(H &'WWS!_Q%7%=%=%? %JM %J , ('+'9'9"'= ( (!+2215%a1&'	( IIi)Q+1%FJ  X( _k6GRYIIf;'*?';&Iz  [ g   8
 b % 6S: U%J('=  		)%N'-CF!;  MsM  CR0P ;/R*P+R2R5!P$P!3P$
RAR0AR>P7P4P7R%0RP<P9P<R<R	!Q
 *P>+8Q
 #A	Q,Q
 QQ	QQ
 +*Q
 9R	PRPR!P$$	P1-R0P11R4P77R9P<<R>Q
 Q
 QQ
 
	Q>!Q94R9Q>>Rc           	       K   |j                  d      rt        j                  |      n|}|dfd}d}g }g }t        r	 t	        j
                  d      }t	        j                  ||      }	|	j                  d      }
|	j                  d      }|
r/d}|
D ](  } ||j                        }|s|j                  |       * |s-|D ](  } ||j                        }|s|j                  |       * nddlmc m} 	 |j                  |      }	|	j                         D ]4  }d|j                   v s|j                   j#                  d      d   |_        6 |	j%                  d      }|	j%                  d      }|rDd}|D ]=  }|j'                  d      } |||j                  nd      }|s-|j                  |       ? |sB|D ]=  }|j'                  d      } |||j                  nd      }|s-|j                  |       ? |r|r؉ j                  dddt)        |      id       t+        dt)        |      dz        }t-        j.                  |      d}t)        |      }d fd}|D cg c]  }t-        j0                   ||             }}||k  r+j3                          d{   }||dz  }n| ||k  r+t-        j4                  |ddi d{    y|D ]  }| 	 y# t        $ r+} j                  d	d
|t        |      dd       Y d}~yd}~ww xY w# t        $ r+} j                  d	d|t        |      dd       Y d}~yd}~ww xY wc c}w 7 7 w) z+Parse sitemap from already-fetched content..gzc                H    | sy t        | j                               }|sy |S Nr   rs   r  
normalizedbase_urls     r$   _normalize_locz<AsyncUrlSeeder._iter_sitemap_content.<locals>._normalize_loc  '     399;7JrZ   FTrecoverparser1//*[local-name()='sitemap']/*[local-name()='loc']-//*[local-name()='url']/*[local-name()='loc']r   -LXML parsing error for sitemap {url}: {error}rM  r   r   Nr   }r   
.//sitemap.//urlrz  4ElementTree parsing error for sitemap {url}: {error}r   z2Processing sitemap index with {count} sub-sitemapsr  P  r   r   c           	     d  K   	 j                  |       2 3 d {   }j                  |       d {    #7 7 6 n7# t        $ r+}j                  dd| t	        |      dd       Y d }~nd }~ww xY wj                  d        d {  7   y # j                  d        d {  7   w xY ww)Nr   +Error processing sub-sitemap {url}: {error}rM  r   r   )r  r   r    r   rm   rH   r   r   result_queuer   s      r$   process_subsitemapz@AsyncUrlSeeder._iter_sitemap_content.<locals>.process_subsitemap%  s     1#'#5#5k#B 2 2a*..q11121 $C  \IIg'T-83q6%JPZ  \ \\ '**4000,**4000sy   B0? =9=? ;? =? ? B 	A3!A.)B .A33B 6B0
BB0B-&B)'B--B0rQ  r  Optional[str]returnr  rH   rm   )endswithgzip
decompressr   r   	XMLParserr   r   rr   rw   r    r   rm   xml.etree.ElementTreeElementTreeiterr   rT   r}   rq   rK   r.  r  r/  r2  r1   r4  ) r   rV   re   r@   r  is_sitemap_indexsub_sitemapsregular_urlsr  r"   sitemap_loc_nodesurl_loc_nodessitemap_elemrz  loc_elemr   ETelemsitemapsurl_entriesr   url_elemr:  completed_counttotal_sitemapsr  r  rD  itemr   r  r  s    `                             @@r$   r  z$AsyncUrlSeeder._iter_sitemap_content  sr    +.<<+>tw'G	 !6''V<$(JJ/b$c! $

+Z [$'+$(9 5,\->->?(//45
 ($1 5,X]];(//45 /.}}T* IIK :Ddhh#'88>>##6q#9:  <<5"ll84'+$#+ 5#*<<#6,h>RX]]X\](//4	5 ($/ 5#+==#7,h>RX]]X\](//4	5 IIfR%s<'89z  K UC$5$<=J"==<LO .N1 LXXRW(();B)?@XEX!N2)--//<#q(OJ "N2 ..%@4@@@! C  		'#R),s1v!>J  P:  		'#Y),s1v!>J  P2 Y 0 As   =M"A,K+ 0.K+ K+ 2M">2L" 1A9L" +AL" /L" A*M","MM"(M)M" M"M M"+	L4!LM"LM""	M+!MM"M	M" M"c           	    
   K   	  j                   j                  |dd       d {   }|j                          |j                  d      rt        j                  |j                        n|j                  }t        |j                        d*fd}d}g }g }t         r	 t#        j$                  d      }	t#        j&                  ||	      }
|
j)                  d      }|
j)                  d      } j                  dd|t+        |      t+        |      dd       |r/d}|D ](  } ||j,                        }|s|j/                  |       * |sF|D ](  } ||j,                        }|s|j/                  |       * |s j                  ddd|id       nRdd lmc m} 	 |j'                  |      }
|
j5                         D ]4  }d|j6                  v s|j6                  j9                  d      d   |_        6 |
j;                  d      }|
j;                  d      } j                  dd|t+        |      t+        |      dd       |rDd}|D ]=  }|j=                  d       } |||j,                  nd       }|s-|j/                  |       ? |s[|D ]=  }|j=                  d       } |||j,                  nd       }|s-|j/                  |       ? |s j                  ddd|id       |r|r؉ j                  d"d#d$t+        |      id       t?        d%t+        |      d&z        }tA        jB                  |'      d}t+        |      }d+ fd(}|D cg c]  }tA        jD                   ||             }}||k  r+j                          d {   }||dz  }n| ||k  r+tA        jF                  |d)di d {    y |D ]  }| 	 y 7 # t        j                  $ r6} j                  dd||j                  j                  dd       Y d }~y d }~wt        j                  $ r+} j                  dd	|t        |      d
d       Y d }~y d }~wt        $ r+} j                  dd|t        |      d
d       Y d }~y d }~ww xY w# t        $ r+} j                  dd|t        |      d
d       Y d }~y d }~ww xY w# t        $ r+} j                  dd!|t        |      d
d       Y d }~y d }~ww xY wc c}w 7 s7 Fw),Nr  Trr  r   z1Failed to fetch sitemap {url}: HTTP {status_code}rV   ry  r   r   z-Network error fetching sitemap {url}: {error}rM  r   z0Unexpected error fetching sitemap {url}: {error}r  c                H    | sy t        | j                               }|sy |S r  r  r  s     r$   r  z4AsyncUrlSeeder._iter_sitemap.<locals>._normalize_locQ  r  rZ   Fr  r  r  r  r   zYParsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered)rV   sitemap_countr,   zvNo <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.rV   r  r   r  r   r  r  zeElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discoveredrz  r  r   z>Processing sitemap index with {count} sub-sitemaps in parallelr  r  r   r   c           	       K   	 j                  ddd| id       j                  |       2 3 d {   }j                  |       d {    #7 7 6 n7# t        $ r+}j                  dd| t	        |      dd       Y d }~nd }~ww xY wj                  d        d {  7   y # j                  d        d {  7   w xY ww)	Nr   zProcessing sub-sitemap: {url}rV   r   r   r   r  rM  )r   r  r   r    rm   r  s      r$   r  z8AsyncUrlSeeder._iter_sitemap.<locals>.process_subsitemap  s     1II!@%Q\I]cm  o $(#5#5k#B 2 2a*..q11121 $C  \IIg'T-83q6%JPZ  \ \\
 '**4000,**4000s   C(A AAAA 
AA AA A B( 	B
!B B( B

B( C!B$"C(C=C >CCrQ  r  r  )$r   r1   r  r   r  r   r  ry  RequestErrorrm   r    r  r  r  re   rV   r   r   r  r   r   rK   rr   rw   r  r  r  r   rT   r}   rq   r.  r  r/  r2  r4  ) r   rV   r'   r   r@   r  r  r  r  r  r"   r  r  r  rz  r  r  r  r  r  r   r  r:  r  r  r  r  rD  r  r   r  r  s    `                             @@r$   r  zAsyncUrlSeeder._iter_sitemap=  s0    	kkooc2oMMA  .1\\%-@tqyy)aiiquu:	 ! +6''V<$(JJ/b$c! $

+Z [		o"),->)?%(%7
 #  	 %'+$(9 5,\->->?(//45 ($1 5,X]];(//45 (		% U$)3< *	 "  /..}}T* IIK :Ddhh#'88>>##6q#9:
  <<5"ll84		{"),X%(%5
 #  	 '+$#+ 5#*<<#6,h>RX]]X\](//4	5 ($/ 5#+==#7,h>RX]]X\](//4	5
 (		% U$)3< *	 "  IIf^%s<'89z  K
 UC$5$<=J"==<LO .N1   ,- (();B)?@ -E - "N2)--//<#q(OJ "N2 ..%@4@@@ " S N$$ 	IIi!T%(9O9OPV`  b!! 	IIi!P%(3q6:
  L 	IIgQ%(3q6:
  L	~  		'#R),s1v!>J  Pb  		'#Y),s1v!>J  P@-
 0 As  T!O OO A#T!BR 9.R (,R T 2S B$S 8AS <,S (A*T"S<4TTT&T>T?TO R',PTR.!QTR !RTRT	S!R=8T=ST	S9!S4/T4S99	TTc                  K   |
r)| j                  |      r| j                  ddd|id       y |rdnd}t        | d      r| j                  s.| j	                  ||       d {   }|r|j                  |       y |r| j                  dd	d|id       | j                  ||       d {   \  }}}|rd
nd}| j                  |rdndd|j                         |xs |dd       |r"t        j                  t        |       d {   ni }|xs |||d}no|rg| j                  ddd|id       | j                  |       d {   }|rd
nd}| j                  |rdndd|j                         |dd       ||i d}n|di d}|s|r| j                  |||       d {    |j                  |       y 7 U7 7 7 z7  w)Nr   z Filtered out nonsense URL: {url}rV   r   r   r   r   r   zFetching head for {url}rW  	not_validr   r   zHEAD {status} for {final_url})rO  	final_url)rV   rO  rP  zPerforming live check for {url}zLIVE CHECK {status} for {url})rO  rV   unknown)_is_nonsense_urlr   r*  r   r   rw   _fetch_headupperr  rf  r   r{  r   )r   rV   r  r   rL  r   r   r  r  r  r  
cache_kindcachedokr   finalrO  rP  r   s                      r$   r  zAsyncUrlSeeder._validate  s     t44S9IIgA#S\z  ;&VF
 g&4::??:s;;F'IIg8SB"'1  3$($4$4S'$BBOBe "WFIIf	3R(.U\cRXb  d GIg//TBBBbI| &E IIg@SJ"'1  3))#..B "WFIIf	3R(.sC  U6CE  92FE 7//*c5999G < C
 C / :s]   AG!F6"AG(F9)AGF<>GF>AGG  G9G<G>G Gc           	       K   	 | j                   j                  ||ddd       d {   }|j                          y7 # t        j                  $ r+}| j                  dd|t        |      dd	
       Y d }~yd }~wt        j                  $ r6}| j                  dd||j                  j                  dd	
       Y d }~yd }~wt        $ r+}| j                  dd|t        |      dd	
       Y d }~yd }~ww xY ww)Nz	bytes=0-0identity)RangeAccept-Encoding)r   r   Tr   z+HEAD check network error for {url}: {error}rM  r   r   Fz5HEAD check HTTP status error for {url}: {status_code}r  r   z5Unexpected error during HEAD check for {url}: {error})r   r   r  r   r  r   rm   r  r  ry  r    )r   rV   r   r'   r   s        r$   _head_okzAsyncUrlSeeder._head_ok"  s     	kk&&sG9DYc/d ' f fA f !! 	IIgL%(3q6:
  L$$ 	IIgV%(9O9OPV`  b 	IIgV%(3q6:
  L	s_   C:$A  >A  C:A   C7!A94C:9C7,C ;C: C7!C2-C:2C77C:c                  K   t        |dz         D ]  }	 | j                  j                  d||ddid      4 d {   }|j                  dv r|j                  j                  d      }|r@t        ||      }| j                  d	d
|j                  |dd       	 d d d       d {    | j                  dd|j                  |j                  dd       ddt        |j                        fcd d d       d {    c S d|j                  cxk  rdk  sYn | j                  dd|j                  |j                  dd       ddt        |j                        fcd d d       d {    c S t               }	|j                  |      2 3 d {   }
|	j                  |
       |	j                         }d|v st        |	      |k\  s=|j                          d {     |j                  j                  dd      j                         }	 |dk(  r|	d d dk(  rt!        j"                  |	      }	nO|dk(  r$t$        r|	d d dk(  rt'        j"                  |	      }	n&|dv r"| j                  d	d||j                  dd       |	j                         j+                  d      }|d"k(  r7| j                  d	d#d$|j                  id       t        |	      d%k  r|	n|	d d% }n|	d |d&z    }	 |j-                  d'd(      }d,|t        |j                        fcd d d       d {    c S  | j                  dd.||d/d       dd|fS 7 7 7 I7 7 7 q6 q# t(        $ r7}| j                  dd |j                  |t        |      d!d       Y d }~d }~ww xY w# t(        $ rG}| j                  dd)|j                  t        |      d*d       |j-                  d+d(      }Y d }~d }~ww xY w7 # 1 d {  7  sw Y   nxY w# t.        j0                  $ r2}| j                  d	d-|t        |      d*d       dd|fcY d }~c S d }~ww xY ww)0Nr   r  r   r  F)r   r   rs  rv  Locationr   z,Redirecting from {original_url} to {new_url})original_urlnew_urlr   r   r   z>Redirect status {status_code} but no Location header for {url})ry  rV   r]   rt  i  z=Non-success status {status_code} when fetching head for {url}s   </head>zContent-Encodingr     s   brrR   s   l
>   r  r  z#Skipping bogus {encoding} for {url})encodingrV   z3Decompression error for {url} ({encoding}): {error})rV   r  r   rP   z.No </head> tag found in initial bytes of {url}rV   i (  r   r\   r3   z0Failed to decode head content for {url}: {error}rM  zlatin-1Tz+Fetch head network error for {url}: {error}z2Exceeded max redirects ({max_redirects}) for {url})max_redirectsrV   )r3  r   r  ry  r   r1   r   r   rV   rm   	bytearrayaiter_bytesextendru   rK   acloser  r  
HAS_BROTLIbrotlir    rq   decoder   r  )r   rV   r   r	  	max_bytes
chunk_sizer=  r'   rw  bufchunklowencr   idx
html_bytesr   s                    r$   r  zAsyncUrlSeeder._fetch_head5  s     }Q' _	&A^&  ;;--# *: &+ . 	 V2 V2 }}(AA#$99==#<#")#x"8C IIg/]>?eePS-TZd & f$#V2 V2 V2& !IIi1q=>]]STSXSX-Y_i & k $)"c!%%j#8-V2 V2 V24  1==636		)-l9:quu)U[e " g$b#aee*4;V2 V2 V2> $+C'(}}Z'@ " "e

5)!iik%,CI0E"#((*,,!))--(:B?EEGC&=S!W-C"&//#"6C D[ZCGGZ<Z"("3"3C"8C N2 II ' E47'F$.	 & " ))+**:6Cby		'+[*/Z " I -0Hu,<S#fu+
%(#a%[
	G)00)D  s155z1mV2 V2 V2	_	&D 			)Q+8E: 	 	Wb#~AV2 V2 V2 V2@" -	 (A* % 		%Q+,5503c!f$F * "  . % G		%N+,553q6#B *	 "   *00IFGYV2 V2 V2 V2 V2p %% &		'#P),s1v!>J  Pb#~%&s  Q
&PL>P AO+P&M'P+Q
,AO+/P;M<P Q
AO+P)M*P.Q
1O+MM

M4O+	O+M
/O+A2M?A#O+#N5O+PO)P!Q
>PPPP
MO+O+	N,NO+NO+	O&=O!O+!O&&O+)P+O=	1O42O=	9P Q
Q%Q:Q;Q
QQ
c                   g }|j                  d      r|j                  |d          |j                  di       }dD ](  }|j                  |      s|j                  ||          * dD ](  }|j                  |      s|j                  ||          * dD ](  }|j                  |      s|j                  ||          * dD ](  }|j                  |      s|j                  ||          * |j                  dg       D ]  }t        |t              sdD ]^  }||v st        ||   t              r|j                  ||          0t        ||   t
              sD|j                  d	 ||   D               ` d
|v s|t        |d
   t
              s|d
   D ]G  }t        |t              sdD ]/  }||v st        ||   t              s|j                  ||          1 I  dj                  t        d|            S )z9Extract all relevant text from head metadata for scoring.r^   r`   )descriptionkeywordsauthorsubjectsummaryabstract)zog:titlezog:descriptionzog:site_namezarticle:tag)ztwitter:titleztwitter:descriptionztwitter:image:alt)zdc.titlezdc.descriptionz
dc.subjectz
dc.creatorrb   )rc   headliner  r  r  c              3  6   K   | ]  }|rt        |        y wr  )rm   )rT  r  s     r$   rU  z7AsyncUrlSeeder._extract_text_context.<locals>.<genexpr>  s       .Q264 /2$i .Qs   z@graph)rc   r   r   N)	r1   rw   rl   rA  rm   listr  r,  filter)r   rP  
text_partsr`   r"  rb   fieldr  s           r$   rc  z$AsyncUrlSeeder._extract_text_context  s    
 ==!i01 }}VR(Z 	-Cxx}!!$s),	-
 Q 	-Cxx}!!$s),	-
 Q 	-Cxx}!!$s),	-
 N 	-Cxx}!!$s),	-
  mmHb1 	CF&$'X QE%fUmS9&--fUm<'ut<&-- .Q:@-.Q QQ v%*VH5Et*L &x 0 C%dD1)L C#(D=ZUS5Q$.$5$5d5k$BCC	C( xxtZ011rZ   c                   |j                         }|j                         }ddlm}  ||      }|j                  j	                  dd      }|j
                  j                  d      }|j                  d      }	|j                  d      D 
cg c]  }
|
s|
	 }}
|j                  }g }|rC|j                  d      D ]/  }d|v s|j                  dd	      \  }}|j                  ||g       1 |	|z   |z   }g }|j                         }|D ]>  }|j                         }||v r|j                  d
       )||v s.|j                  d       @ |D ]  }g }|D ]r  }|j                         }||v r,t        |      t        |      z  }|j                  d|z         C||v sHt        |      t        |      z  }|j                  d|z         t |s|j                  t        |              dd}dj                  |      j                         }t        |      dk\  r^t        |      dk\  rP ||      } ||      }|r>|r<t        ||z        }t        ||z        }|dkD  r||z  nd}|j                  d|z         |sy|j                  d       d} d}!t        |      D ]  \  }"}#d	|"d	z   z  }$| |#|$z  z  } |!|$z  }! |!dkD  r| |!z  nd}%t!        |%d
      S c c}
w )zFCalculate relevance score between query and URL using string matching.r   urlparserQ   r]   r  r  &=r   rJ  g?gffffff?g333333?r  c                `     t         fdt        t               z
  dz         D              S )Nc              3  .   K   | ]  }||z      y wr  rS  )rT  ro  nrr   s     r$   rU  zTAsyncUrlSeeder._calculate_url_relevance_score.<locals>.get_ngrams.<locals>.<genexpr>  s     AqtAac{As   r   )r   r3  rK   )rr   r.  s   ``r$   
get_ngramszAAsyncUrlSeeder._calculate_url_relevance_score.<locals>.get_ngrams  s%    AE#d)A+a-,@AAArZ   r"        ?r  T)r#  )r  )ru   urllib.parser)  netlocr3   r   rs   rT   r  r  rw   rK   r   r,  r6  rh  r.  )&r   r  rV   query_lower	url_lowerr)  parsedr   r   domain_partsr   
path_partsquery_paramsparam_partsparamr"  value	all_partsrn  query_tokenspart
part_lowertokentoken_scorescoverager/  url_textquery_ngrams
url_ngramsintersectionunionjaccardweighted_scoretotal_weightro  rm  weightfinal_scores&                                         r$   rd  z-AsyncUrlSeeder._calculate_url_relevance_score  s    kkmIIK	 	*#&&vr2{{  % ||C(!%C6AAa6
6 ||%++C0 5%<!&S!!4JC&&U|45 !:-;	 "((*  	#DJj(c"{*c"	# " 	1EL! 8!ZZ\
J&"5zC
O;H ''h75(":U;H ''h78 c,/0	1 	B 88I&,,.{q S]a%7%k2L#H-J
"<*#<=L:5627!),.cGm,  	D!!&) 	#HAu!a%[Fefn,NF"L	#
 8Da7Gn|3Q;$$O 7s   
KKc                   |j                         ddlm}  ||      }|j                  j                         j	                  d      rydv rj	                  d      ryg d}t        fd|D              ryj                  d	      }t        d
 |D              ryg d}t        fd|D              ryt        fddD              ryt        j                  d	            dk  rdvryy)z
        Check if URL is a utility/nonsense URL that shouldn't be crawled.
        Returns True if the URL should be filtered out.
        r   r(  )r  r  r  Tz/sitemap)z.xmlz.xml.gzz.txt)zads.txtz
humans.txtzsecurity.txtz.well-known/security.txtzcrossdomain.xmlzbrowserconfig.xmlzmanifest.jsonzapple-app-site-associationz&.well-known/apple-app-site-associationzfavicon.icozapple-touch-icon.pngzandroid-chrome-192x192.pngc              3  F   K   | ]  }j                  d |         yw)r  N)r  )rT  filer   s     r$   rU  z2AsyncUrlSeeder._is_nonsense_url.<locals>.<genexpr>L  s      CTt}}qZ(Cs   !r  c              3  D   K   | ]  }|s|j                  d         yw)r  N)rU   )rT  r>  s     r$   rU  z2AsyncUrlSeeder._is_nonsense_url.<locals>.<genexpr>x  s     Cdts#Cs     )z	/wp-adminz/wp-includesz/wp-content/uploadsz/adminz/loginz/signinz/signupz	/registerz	/checkoutz/cartz/accountz/profilez/searchz/404z/errorz/.gitz/.svnz/.hgz/cgi-binz/scriptsz	/includesc              3  &   K   | ]  }|v  
 y wr  rS  )rT  ncpr   s     r$   rU  z2AsyncUrlSeeder._is_nonsense_url.<locals>.<genexpr>  s     8ssd{8   c              3  &   K   | ]  }|v  
 y wr  rS  )rT  rW   r4  s     r$   rU  z2AsyncUrlSeeder._is_nonsense_url.<locals>.<genexpr>  s     `w)#`rS  )z?print=z&print=z/print/z_print.r  )r  z/enz/dez/frz/esz/itF)	ru   r1  r)  r   r  r[  rT   rK   rs   )	r   rV   r)  r5  utility_filesr7  non_content_pathsr   r4  s	          @@r$   r  zAsyncUrlSeeder._is_nonsense_url1  s    
 IIK	 	*#{{  " ==NO $--0K"L
 C]CCT ZZ_
C
CC
 8&788 `3_`` tzz#!#4\(\rZ   c           	        t         s#| j                  ddd       dgt        |      z  S |r|sdgt        |      z  S |j                         j	                         }|D cg c]   }|j                         j	                         " }}t        d |D              rdgt        |      z  S 	 ddlm}  ||      }|j                  |      }t        |      dk(  rg S t        |      }	t        |      }
|
|	k(  rd	gt        |      z  S |D cg c]  }||	z
  |
|	z
  z   }}|S c c}w c c}w # t        $ r9}| j                  d
dd
t        |      id       dgt        |      z  cY d}~S d}~ww xY w)z4Calculate BM25 scores for documents against a query.r   z/rank_bm25 not installed. Returning zero scores.r   r   r  c              3  8   K   | ]  }t        |      d k(    yw)r   N)rK   )rT  r   s     r$   rU  z7AsyncUrlSeeder._calculate_bm25_score.<locals>.<genexpr>  s     7s3x1}7s   r   )	BM25Okapir0  r   z&Error calculating BM25 scores: {error}r   N)rb  r   rK   ru   rT   all	rank_bm25rY  
get_scoresr.  r   r    rm   )r   r  	documentsr=  r   tokenized_docsrY  r  rn  	min_score	max_scorerm  normalized_scoresr   s                 r$   rg  z$AsyncUrlSeeder._calculate_bm25_score  s   IILR\  ^53y>))I53y>)) {{}**,9BC##))+++-CC 77753y>))	*+^,D__\2F 6{a	FIFI I%us6{** ]c cSX%)"3	I8M!N c c$$7 D2 !d  	*IIgG%s1v.J  @53y>))	*sB   %D(.D% )D% D% D D%  D% %	E'..E"E'"E'c                   K   | j                   rD| j                  r7| j                  j                          d{    | j                  ddd       yyy7 w)z#Close the HTTP client if we own it.Nr   zClosed HTTP clientr   r   )r   r   r  r   r   s    r$   closezAsyncUrlSeeder.close  sI     ++$$&&&IIg3ID "-&s   6AAAc                   K   | S w)zAsync context manager entry.rS  rc  s    r$   
__aenter__zAsyncUrlSeeder.__aenter__  s     s   c                @   K   | j                          d{    y7 w)zAsync context manager exit.NF)rd  )r   exc_typeexc_valexc_tbs       r$   	__aexit__zAsyncUrlSeeder.__aexit__  s     jjl 	s   c           	       K   | j                   j                         rt        j                         | j                   j                         j                  z
  | j
                  j                         k  rI| j                  ddd| j                   id       | j                   j                         j                         S | j                  dddt        id       	 t        j                         4 d {   }|j                  t        d	       d {   }|j                          |j                         d
   d   }| j                   j!                  |       | j                  ddd|id       |cd d d       d {    S 7 7 q7 	# 1 d {  7  sw Y   y xY w# t        j"                  $ r&}| j                  dddt%        |      id        d }~wt        j&                  $ r1}| j                  ddd|j(                  j*                  id        d }~wt,        $ r&}| j                  dddt%        |      id        d }~ww xY ww)Nr   z*Loading latest CC index from cache: {path}r   r   r   z-Fetching latest Common Crawl index from {url}rV   rq  rK  r   idrX  z4Successfully fetched and cached CC index: {index_id}r   r   z-Network error fetching CC index info: {error}z0HTTP error fetching CC index info: {status_code}ry  z0Unexpected error fetching CC index info: {error})r   r-   r   r   r   r   r6   r   	read_textrs   COLLINFO_URLr   r   r1   r  r/   
write_textr  rm   r  r  ry  r    )r   r  jr  r   s        r$   r+  zAsyncUrlSeeder._latest_index  s      '')tyy{4;P;P;U;U;W;`;`/`dhdldldzdzd|.|IIfJ$d&;&;<*  N((224::<<		&I.J 	 	@	((*  a%%b%99""$ffhqk$'%%005		)%[",c!2
  D  9    !! 	IIgN%s1v.J  @$$ 	IIgQ+QZZ-C-CD*  V 	IIgQ%s1v.J  @	s   CIF .E=/F 2FE?AF+F 7F8F <I=F ?FF F	F
FF IF I +!GI ",HI !H;;I  I)
r   r   r   zOptional[httpx.AsyncClient]r   zOptional[AsyncLoggerBase]r   z"Optional[Union[str, pathlib.Path]]r   zOptional[Union[str, Path]])r   )r   rm   r   rm   r   rm   r   r   )r   rm   rV   rm   r  r   )r   rm   rV   rm   r  zOptional[Dict[str, Any]])r   rm   rV   rm   r@   Dict[str, Any]r  None)r   rm   r7  'SeedingConfig'r  r%  )rC  zSequence[str]r7  rt  r  zDict[str, List[Dict[str, Any]]])Nrq  r   )
rE   	List[str]r7  zOptional['SeedingConfig']r&  intr   rv  r  r%  )r;  r%  r7  rt  r  r%  )rV   rm   r  r  )r   rm   rW   rm   r   bool)F)rV   rm   re   bytes)rV   rm   )NNr  T)rV   rm   r  r%  r   rw  rL  rw  r   rv  r   rw  r  r  r  zOptional[float]r  rm   r  rw  )rV   rm   r   rv  r  rw  )r   i   i   )
rV   rm   r   rv  r	  rv  r  rv  r  rv  )rP  rr  r  rm   )r  rm   rV   rm   r  re  )rV   rm   r  rw  )r  rm   r]  ru  r  zList[float])r  rm   )__name__
__module____qualname____doc__TTLr   r   r   r   r   rE   rF  r`  r5  r{  r   r   r  r  r  r  r  rc  rd  r  rg  rd  rf  rk  r+  rS  rZ   r$   r   r     s   F .2,0=A1588 ,8 *	8 ;8 /8B<4
y?y?*y? -y?v!!  ! 
)	!L -1^^ *^ 	^
 ^ 
^@ D<5pp[dgRl` \`W]044!%4034>B4KX4)84QT4 *.4l. ll l 	l
 l l^32jV%p^@,*^ErZ   r   )r!   rx  r  r  r  )
r;   pathlib.Pathr<   rv  r=   rw  r>   r  r  rw  )r;   r~  r  ru  )
r;   r~  rE   ru  rH   rm   r+   r  r  rs  )rV   rm   rW   rm   r  rw  )r   rm   r  rr  )>r|  
__future__r   r   r  r  r   ior/   r   r   r   r   r   r   r   r   typingr   r	   r
   r   r   r   r   r1  r   r   r   rS   lxmlr   ro   r   r   ImportErrorr  r  r[  rb  async_loggerr   r   r   rY  r   ro  r}  r   r   r|   r~   r   ry   r   r%   rC   rF   rM   rY   r   r   rS  rZ   r$   <module>r     s   #     	  	  	  2 2  G G G '  &DJH 7 !, = Q2::|DD
 bjjBBDDIBJJ.rtt<	2::DbddL& &*	/// / #	/
 
/d
  #	
 
,SFVg g  D
  J
  Hs6   $E 3E' :E4 E$#E$'E10E14E>=E>