
    /iTR                    b   d Z ddlmZmZ ddlmZmZmZmZm	Z	m
Z
mZ ddlmZmZ ddlZddlZddlZddlZddlZddlmZmZ ddlZddlmZ ddlmZ dd	lmZmZmZ dd
l m!Z!m"Z" ddl#Z$e G d d             Z%e G d d             Z& G d de      Z' G d de'      Z( G d de'      Z) G d d      Z*y)a  
Adaptive Web Crawler for Crawl4AI

This module implements adaptive information foraging for efficient web crawling.
It determines when sufficient information has been gathered to answer a query,
avoiding unnecessary crawls while ensuring comprehensive coverage.
    )ABCabstractmethod)DictListOptionalSetTupleAnyUnion)	dataclassfieldN)defaultdictCounter)Path)AsyncWebCrawler)CrawlerRunConfigLinkPreviewConfig	LLMConfig)LinkCrawlResultc                      e Zd ZU dZ ee      Zee   e	d<    ee
      Zee   e	d<    ee
      Zee   e	d<   dZee	d<    ee      Zeeef   e	d<    ed	       Zeeef   e	d
<    ed       Zeeef   e	d<    ed       Zeeee   f   e	d<   dZee	d<    ee
      Zee   e	d<    ee
      Zee   e	d<   dZee   e	d<   dZee   e	d<    ee
      Z ee   e	d<   dZ!ee   e	d<    ee
      Z"ee#ee   ef      e	d<   dZ$ee	d<   de%ee&f   fdZ'e(de%ee&f   dd fd       Z)e*dedefd       Z+e*d efd!       Z,y)"
CrawlStatez-Tracks the current state of adaptive crawling)default_factorycrawled_urlsknowledge_basepending_links querymetricsc                       t        t              S Nr   int     U/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/crawl4ai/adaptive_crawler.py<lambda>zCrawlState.<lambda>$   s    [QTEU r%   term_frequenciesc                       t        t              S r!   r"   r$   r%   r&   r'   zCrawlState.<lambda>%   s    UXIY r%   document_frequenciesc                       t        t              S r!   )r   setr$   r%   r&   r'   zCrawlState.<lambda>&   s    kZ]N^ r%   documents_with_termsr   total_documentsnew_terms_historycrawl_orderNkb_embeddingsquery_embeddingsexpanded_queriescoverage_shapesemantic_gapsembedding_modelpathc           
         t        |      }|j                  j                  dd       i dt        | j                        d| j
                  D cg c]  }| j                  |       c}d| j                  D cg c]  }|j                          c}d| j                  d| j                  dt        | j                        d	t        | j                        d
| j                  j                         D ci c]  \  }}|t        |       c}}d| j                   d| j"                  d| j$                  d| j&                  | j&                  j)                         ndd| j*                  | j*                  j)                         ndd| j,                  d| j.                  d| j0                  }t3        |d      5 }t5        j6                  ||d       ddd       yc c}w c c}w c c}}w # 1 sw Y   yxY w)z"Save state to disk for persistenceTparentsexist_okr   r   r   r   r   r(   r*   r-   r.   r/   r0   r1   Nr2   r3   r5   r6   w   )indent)r   parentmkdirlistr   r   _crawl_result_to_dictr   
model_dumpr   r   dictr(   r*   r-   itemsr.   r/   r0   r1   tolistr2   r3   r5   r6   openjsondump)selfr7   crlinkkv
state_dictfs           r&   savezCrawlState.save5   s   Dz$6
D!2!23
H[H[\"t99"=\
 D<N<NODdoo/O
 TZZ	

 t||
 T%:%: ;
 #D)B)B$C
 #D<U<U<[<[<]$^DAqQQZ$^
 t33
  !7!7
 4++
 D<N<N<ZT//668`d
 $BWBWBc 5 5 < < >im
  5 5
  T//!
" t33#

( $_ 	/IIj!A.	/ 	/% ]O
 %_	/ 	/s   G 9G%7G*
>G00G9returnc                    t        |      }t        |d      5 }t        j                  |      }ddd        |        }t	        d         |_        |d   D cg c]  }| j                  |       c}|_        |d   D cg c]  }t        di | c}|_	        |d   |_
        |d   |_        t        t        |d         |_        t        t        |d	         |_        t        t        |d
   j!                         D ci c]  \  }}|t	        |       c}}      |_        |d   |_        |d   |_        |d   |_        |j+                  d      t-        j.                  |d         nd|_        |j+                  d      t-        j.                  |d         nd|_        |j+                  dg       |_        |j+                  dg       |_        |j+                  dd      |_        |S # 1 sw Y   xY wc c}w c c}w c c}}w )zLoad state from diskrNr   r   r   r   r   r(   r*   r-   r.   r/   r0   r1   r2   r3   r5   r6   r   r$   )r   rG   rH   loadr,   r   _dict_to_crawl_resultr   r   r   r   r   r   r#   r(   r*   rE   r-   r.   r/   r0   getnparrayr1   r2   r3   r5   r6   )	clsr7   rP   rO   stated	link_dictrM   rN   s	            r&   rU   zCrawlState.loadR   s    Dz$_ 	&1J	&  N!;<FPQaFbc 9 9! <cBL_B]^Yt0i0^ )"9-!,S*=O2P!Q%0jAW6X%Y"%0ZXnMoMuMuMw6xTQq#a&y6x%y" *+< =",-@"A&}5 HR~~VeGfGrbhhz/'BCx|MW^^\nMoM{*5G*H!I  BF!+0BB!G(nn_bA */@" E1	& 	&
  d^
 7ys   G$G1>G6<G;$G.rK   c                 @   d}t        | d      rN| j                  rBt        | j                  d      r| j                  j                  }nt        | j                        }| j                  |t        | d      r| j
                  ni t        | d      r| j                  dS i dS )z(Convert CrawlResult to serializable dictr   markdownraw_markdownlinksmetadataurlcontentra   rb   )hasattrr_   r`   strrd   ra   rb   )rK   markdown_contents     r&   rB   z CrawlState._crawl_result_to_dictp   s     2z"r{{r{{N3#%;;#;#; #&r{{#3  66'!(W!5RXX2'.r:'>	
 	
 EG	
 	
r%   r\   c           	           G d d       G fdd      } || d   | j                  dd      | j                  di       | j                  d	i       
      S )z Convert dict back to CrawlResultc                       e Zd Zd Zy)6CrawlState._dict_to_crawl_result.<locals>.MockMarkdownc                     || _         y r!   r`   rJ   re   s     r&   __init__z?CrawlState._dict_to_crawl_result.<locals>.MockMarkdown.__init__   
    $+!r%   N__name__
__module____qualname__ro   r$   r%   r&   MockMarkdownrk          ,r%   ru   c                       e Zd Z fdZy)9CrawlState._dict_to_crawl_result.<locals>.MockCrawlResultc                 J    || _          |      | _        || _        || _        y r!   )rd   r_   ra   rb   )rJ   rd   re   ra   rb   ru   s        r&   ro   zBCrawlState._dict_to_crawl_result.<locals>.MockCrawlResult.__init__   s$     ,W 5"
 (r%   Nrq   ru   s   r&   MockCrawlResultrx      s    )r%   r{   rd   re   r   ra   rb   rc   )rW   )r\   r{   ru   s     @r&   rV   z CrawlState._dict_to_crawl_result   sW    	, 	,	) 	) %EE)R(%%$UU:r*	
 	
r%   )-rr   rs   rt   __doc__r   r,   r   r   rg   __annotations__rA   r   r   r   r   r   r   rD   r   r   floatr(   r#   r*   r-   r.   r/   r0   r1   r   r
   r2   r3   r4   r5   r	   r6   r   r   rQ   classmethodrU   staticmethodrB   rV   r$   r%   r&   r   r      s   7"37L#c(7(-d(CND%C %d ;M4:;E3O %d ;GT#u*; (-=U'Vd38nV+0AY+Z$sCx.Z05F^0_$sCH}-_OS $)#>tCy>"48Kc8 $(M8C='&*hsm*"'"=d3i=$(NHSM(5:45PM4d5k5012POS/sDy) /: c4i( \  : 
+ 
$ 
 
" 
 
 
r%   r   c                   
   e Zd ZU dZdZeed<   dZeed<   dZ	eed<   dZ
eed	<   d
Zeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZeed <   dZeeeef      ed!<   d"Zeed#<   d$Z eed%<   dZ!eed&<   d
Z"eed'<   dZ#eed(<   d)Z$eed*<   dZ%eed+<   dZ&eed,<   d$Z'eed-<   d
Z(eed.<   dZ)eed/<   dZ*eed0<   d1Z+eed2<   d3Z,eed4<   d5 Z-e.d6ee   fd7       Z/y)8AdaptiveConfigz#Configuration for adaptive crawlingffffff?confidence_threshold   	max_depth   	max_pages   top_k_links皙?min_gain_thresholdstatisticalstrategy皙?saturation_thresholdconsistency_threshold皙?coverage_weight333333?consistency_weightsaturation_weight      ?relevance_weightnovelty_weight皙?authority_weightF
save_stateN
state_path&sentence-transformers/all-MiniLM-L6-v2r6   embedding_llm_config
   n_query_variations333333?coverage_thresholdalpha_shape_alpha"embedding_min_confidence_thresholdembedding_coverage_radius      ?embedding_k_expembedding_nearest_weightembedding_top_k_weightembedding_overlap_threshold"embedding_min_relative_improvementembedding_validation_min_score embedding_quality_min_confidenceffffff? embedding_quality_max_confidence-?embedding_quality_scale_factorc                 Z   d| j                   cxk  rdk  sJ d        J d       | j                  dkD  sJ d       | j                  dkD  sJ d       | j                  dkD  sJ d       d| j                  cxk  rdk  sJ d        J d       | j
                  | j                  z   | j                  z   }t        |dz
        d	k  s
J d
|        | j                  | j                  z   | j                  z   }t        |dz
        d	k  s
J d|        d| j                  cxk  rdk  sJ d        J d       | j                  dkD  sJ d       d| j                  cxk  rdk  sJ d        J d       d| j                  cxk  rdk  sJ d        J d       t        | j                  | j                  z   dz
        d	k  sJ d       d| j                   cxk  rdk  sJ d        J d       d| j"                  cxk  rdk  sJ d        J d       d| j$                  cxk  rdk  sJ d        J d       d| j&                  cxk  rdk  sJ d        J d       d| j(                  cxk  rdk  sJ d        J d       | j*                  dkD  sJ d       d| j,                  cxk  rdk  sJ d        J d       y)z!Validate configuration parametersr      z,confidence_threshold must be between 0 and 1zmax_depth must be positivezmax_pages must be positiveztop_k_links must be positivez*min_gain_threshold must be between 0 and 1r   gMbP?z$Coverage weights must sum to 1, got z(Link scoring weights must sum to 1, got z1embedding_coverage_radius must be between 0 and 1z embedding_k_exp must be positivez0embedding_nearest_weight must be between 0 and 1z.embedding_top_k_weight must be between 0 and 1zEmbedding weights must sum to 1z3embedding_overlap_threshold must be between 0 and 1z:embedding_min_relative_improvement must be between 0 and 1z6embedding_validation_min_score must be between 0 and 1z8embedding_quality_min_confidence must be between 0 and 1z8embedding_quality_max_confidence must be between 0 and 1z/embedding_quality_scale_factor must be positivez:embedding_min_confidence_threshold must be between 0 and 1N)r   r   r   r   r   r   r   r   absr   r   r   r   r   r   r   r   r   r   r   r   r   r   )rJ   
weight_sums     r&   validatezAdaptiveConfig.validate   s(   D--22b4bb2b4bb2~~!?#??!~~!?#??!!#C%CC#D++0q0^2^^0^2^^0 ))D,C,CCdF\F\\
:#$u,a0TU_T`.aa,**T-@-@@4CXCXX
:#$u,e0XYcXd.ee, 4115A5j7jj5j7jj5##a'K)KK'D116Q6j8jj6j8jj6D//414f6ff4f6ff440043N3NNQTTUX]]  	A  `A  	A]D4499p;pp9p;pp94::>Q>|@||>|@||>D77<1<v>vv<v>vv<D99>Q>z@zz>z@zz>D99>Q>z@zz>z@zz>22Q6i8ii6D;;@q@~B~~@~B~~@r%   rR   c                 "   | j                   yt        | j                   t              r| j                   S | j                   j                  | j                   j                  t        | j                   dd      t        | j                   dd      t        | j                   dd      t        | j                   dd      t        | j                   dd      t        | j                   dd      t        | j                   dd      t        | j                   d	d      d

S )z<Convert LLMConfig to dict format for backward compatibility.Nbase_urltemperature
max_tokenstop_pfrequency_penaltypresence_penaltystopn)
provider	api_tokenr   r   r   r   r   r   r   r   )r   
isinstancerD   r   r   getattrrJ   s    r&   _embedding_llm_config_dictz)AdaptiveConfig._embedding_llm_config_dict   s     $$,d//6,,, 11::22<< 9 9:tL"4#<#<mTR!$";";\4PT66F!()B)BDWY]!^ '(A(ACUW[ \D55vtD22C>
 	
r%   )0rr   rs   rt   r|   r   r~   r}   r   r#   r   r   r   r   rg   r   r   r   r   r   r   r   r   r   boolr   r   r6   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   propertyr   r$   r%   r&   r   r      s   -"%%%IsIsK ##!Hc! #&%%#&5& OU  ##"u" "e!NE!e! J $J$ DOSC=A(5D#9:A   $$"u" 14&3 (+u* !OU  '*e)$'E'
 *.-
 14&3 -0"E/
 /2$e1.2$e2,1"E1: 
HTN 
 
r%   r   c            
           e Zd ZdZededefd       Zededede	e
eef      fd       Zedededefd       Zedede	e   dd	fd
       Zy	)CrawlStrategyz+Abstract base class for crawling strategiesr[   rR   c                    K   yw)z@Calculate overall confidence that we have sufficient informationNr$   )rJ   r[   s     r&   calculate_confidencez"CrawlStrategy.calculate_confidence        	   configc                    K   yw)z/Rank pending links by expected information gainNr$   rJ   r[   r   s      r&   
rank_linkszCrawlStrategy.rank_links  r   r   c                    K   yw)!Determine if crawling should stopNr$   r   s      r&   should_stopzCrawlStrategy.should_stop$  r   r   new_resultsNc                    K   yw)#Update state with new crawl resultsNr$   )rJ   r[   r   s      r&   update_statezCrawlStrategy.update_state)  r   r   )rr   rs   rt   r|   r   r   r~   r   r   r   r	   r   r   r   r   r   r   r$   r%   r&   r   r     s    5
 u   j . TRWX\^cXcRdMe   z > d   
 kAR W[  r%   r   c            	          e Zd ZdZd ZdedefdZdedefdZdedefdZ	dedefdZ
ded	edeeeef      fd
ZdededefdZdededefdZdedefdZded	edefdZdedee   ddfdZdedee   fdZdedee   fdZy)StatisticalStrategyz1Pure statistical approach - no LLM, no embeddingsc                 .    i | _         d| _        d| _        y )Ng333333?g      ?)	idf_cachebm25_k1bm25_br   s    r&   ro   zStatisticalStrategy.__init__2  s    r%   r[   rR   c                 
  K   |j                   sy| j                  |      }| j                  |      }| j                  |      }||j                  d<   ||j                  d<   ||j                  d<   d|z  d|z  z   d|z  z   }|S w)z@Calculate confidence using coverage, consistency, and saturation        coverageconsistency
saturationr   r   )r   _calculate_coverage_calculate_consistency_calculate_saturationr   )rJ   r[   r   r   r   
confidences         r&   r   z(StatisticalStrategy.calculate_confidence7  s     ##++E211%8//6
 %-j!'2m$&0l# 8^cK&77#
:JJ
s   BBc                    |j                   r|j                  dk(  ry| j                  |j                   j                               }|syg }|j                  r#t        |j                  j                               nd}|D ]  }|j                  j                  |d      }|j                  j                  |d      }|dkD  rd||j                  z  }|dkD  r1t        j                  d|z         t        j                  d|z         z  nd}	|dd|	z  z   z  }
|j                  |
       |j                  d        t        |      t        |      z  }t        dt        j                  |            S )zCoverage scoring - measures query term presence across knowledge base
        
        Returns a score between 0 and 1, where:
        - 0 means no query terms found
        - 1 means excellent coverage of all query terms
        r   r   r   r   r   )r   r.   	_tokenizelowerr(   maxvaluesrW   r*   mathlogappendsumlenminsqrt)rJ   r[   query_termsterm_scoresmax_tftermtfdfdoc_coveragefreq_signal
term_scorer   s               r&   r   z'StatisticalStrategy._calculate_coverageJ  sM    {{e33q8nnU[[%6%6%899>9O9OU++2245UV 	(D''++D!4B++//a8BAv!E$9$99 JPRSdhhq2v.!f*1EEYZ *Q{1B-BC
"":.""3'	($ {#c+&66 3		(+,,r%   c                    t        |j                        dk  ryg }t        t        |j                              D ]  }t        |dz   t        |j                              D ]  }t        | j	                  |j                  |               }t        | j	                  |j                  |               }|sT|sWt        ||z        t        ||z        z  }|j                  |         |rt        |      t        |      z  }|S d}|S )zQInformation overlap between pages - high overlap suggests coherent topic coverager=   r   r   r   )r   r   ranger,   _get_document_termsr   r   )	rJ   r[   overlapsijterms_iterms_joverlapr   s	            r&   r   z*StatisticalStrategy._calculate_consistencys  s    u##$q( s5//01 		-A1q5#e&:&:";< -d66u7K7KA7NOPd66u7K7KA7NOPw!'G"34s7W;L7MMGOOG,-		- h-#h-7K  Kr%   c                    |j                   syt        |j                         dk  ry|j                   d   dkD  r|j                   d   nd}|j                   d   dkD  r|j                   d   nd}d||z  z
  }t        dt        |d            S )zIDiminishing returns indicator - are we still discovering new information?r   r=   r   r   r   )r/   r   r   r   )rJ   r[   recent_rateinitial_rater   s        r&   r   z)StatisticalStrategy._calculate_saturation  s    &&u&&'!+ 6;5L5LR5PST5Te--b1Z[5:5L5LQ5ORS5Su..q1YZ +45
3J,--r%   r   c                 ^  K   g }|j                   D ]  }|j                  |j                  v r| j                  ||      }| j	                  ||      }d}|j
                  |z  |j                  |z  z   |j                  |z  z   }|j                  ||f        |j                  d d       |S w)z'Rank links by expected information gainr   c                     | d   S Nr   r$   xs    r&   r'   z0StatisticalStrategy.rank_links.<locals>.<lambda>  s
    ! r%   Tkeyreverse)
r   hrefr   _calculate_relevance_calculate_noveltyr   r   r   r   sort)	rJ   r[   r   scored_linksrL   	relevancenovelty	authorityscores	            r&   r   zStatisticalStrategy.rank_links  s     '' 	/DyyE... 11$>I--dE:GI ,,y8))G34++i78E u.!	/& 	nd;s   B+B-rL   c                 .   |j                   r|sydj                  t        d|j                  xs d|j                  xs d|j
                  r,|j
                  j                  di       j                  dd      nd|j
                  r,|j
                  j                  di       j                  dd      nd|j
                  r,|j
                  j                  di       j                  dd      ndg            j                         }|sy|j                  r|j                  d	kD  r|j                  S t        | j                  |j                   j                                     }t        | j                  |            }|syt        ||z        t        |      z  }|S )
z3BM25 relevance score between link preview and queryr    Nr   metatitledescriptionkeywordsr   )r   joinfiltertextr  	head_datarW   r   contextual_scorer,   r   r   )rJ   rL   r[   	link_textr   
link_termsr  s          r&   r  z(StatisticalStrategy._calculate_relevance  sR   {{$ HHVDIIOJJ"?C~~DNNvr*..w;SUEI^^DNNvr*..}bAY[BF..DNNvr*..z2>VX+
   EG 	    T%:%:Q%>((( $..):):)<=>	23
kJ./#k2BBr%   c                 h   |j                   sydj                  t        d|j                  xs d|j                  xs d|j
                  r|j
                  j                  dd      nd|j
                  r|j
                  j                  dd      nd|j
                  r|j
                  j                  dd      ndg            j                         }t        | j                  |            }|syt        |j                  j                               }||z
  }|rt        |      t        |      z  }|S d	}|S )
z9Estimate how much new information this link might provider   r  Nr   r  r   r!  r   r   )r   r"  r#  r$  r  r%  rW   r   r,   r   r(   keysr   )rJ   rL   r[   r'  r(  existing_terms	new_termsr  s           r&   r  z&StatisticalStrategy._calculate_novelty  s
   ## HHVDIIOJJ"/3~~DNNw+259^^DNN}b126..DNNz2.b+
   EG 	 	23
 U3388:;/	6@#i.3z?2 GJr%   c                 V   d}|j                   sy|j                   j                         }d|v sd|v r|dz  }d|v sd|v r|dz  }d|v sd	|v r|d
z  }|j                  d      r|d
z  }n|j                  d      r|dz  }|j                  d|z  d|j                  z  z   }t	        |d      S )zASimple authority score based on URL structure and link attributesr   r   z/docs/z/documentation/r   z/api/z/reference/z/guide/z
/tutorial/r   z.pdf)z.jpgz.pngz.gifr   r   r   )r  r   endswithintrinsic_scorer   )rJ   rL   r  rd   s       r&   _calculate_authorityz(StatisticalStrategy._calculate_authority  s    yyiioo s?/36SLEc>]c1SLE|s2SLE <<SLE\\23SLE +%K#(<(<"<<E5#r%   c                   K   |j                   j                  dd      }||j                  k\  ryt        |j                        |j
                  k\  ry|j                  sy|j                   j                  dd      |j                  k\  ryyw)r   r   r   Tr   F)r   rW   r   r   r   r   r   r   )rJ   r[   r   r   s       r&   r   zStatisticalStrategy.should_stop  s      ]]&&|S9
444 u!!"f&6&66 "" ==\3/63N3NNs   BB
r   Nc                   K   |D ]P  }t        |j                        }	 |j                  j                  }| j                  |j                               }t               }|D ]*  }|j                  |xx   dz  cc<   |j                  |       , |j                  }	|D ]I  }||j                  |   vs|j                  |xx   dz  cc<   |j                  |   j                  |	       K t        |j                        }
|
|z
  }|j                  j                  |       |xj                  dz  c_        |j                   j                  |j                         S y# t        $ r t        d|j                   d       d}Y Lw xY ww)r   zWarning: CrawlResult z has no markdown contentr   r   N)r   r(   r_   r`   AttributeErrorprintrd   r   r   r,   addr.   r-   r*   r/   r   r0   )rJ   r[   r   resultold_term_countre   termsterm_setr   doc_idnew_term_countr,  s               r&   r   z StatisticalStrategy.update_state&  ss    ! .	1F !7!78N //66  NN7==?3E uH #&&t,1,T"#
 **F  Au99$??..t494..t488@A !!7!78N&7I##**95 !!Q&! $$VZZ0].	1 " -fjj\9QRSs/   FEA:F0B*F$F>FFFr$  c                     t        j                  dd|      }|j                         }|D cg c]  }t        |      dkD  s| }}|S c c}w )z%Simple tokenization - can be enhancedz[^\w\s]r  r=   )resubsplitr   )rJ   r$  tokensts       r&   r   zStatisticalStrategy._tokenizeX  sJ     vvj#t, $2s1vz!22 3s   A Acrawl_resultc                 t    |j                   j                  xs d}| j                  |j                               S )z!Extract terms from a crawl resultr   )r_   r`   r   r   )rJ   rB  re   s      r&   r   z'StatisticalStrategy._get_document_termsc  s-    ''44:~~gmmo..r%   )rr   rs   rt   r|   ro   r   r~   r   r   r   r   r   r   r	   r   r   r  r  r0  r   r   r   r   rg   r   r   r$   r%   r&   r   r   /  s4   ;

 u &'- '- '-RJ 5 6.: .% ."j . TRWX\^cXcRdMe 6 j U @t J 5 4 % :z > d *01
 01kAR 01W[ 01d	c 	d3i 	/ /S	 /r%   r   c                      e Zd ZdZd#dedeeef   fdZdefdZ	de
e   defd	Zd
ededefdZd
ededefdZd$dededefdZd%dedefdZd&dede
e   fdZded
ede
eeef      fdZde
e   de
eeef      dede
eeef      fdZdedefdZdedede
eeef      fdZdedefdZdededefdZdedefd Zded!e
e    ddfd"Z!y)'EmbeddingStrategyz?Embedding-based adaptive crawling using semantic space coverageNr6   
llm_configc                     |xs d| _         || _        i | _        i | _        d| _        d | _        d | _        d | _        d| _        y )Nr   Fr   )	r6   rF  _embedding_cache_link_embedding_cache_validation_passed_distance_matrix_cache_kb_embeddings_hash_validation_embeddings_cache_kb_similarity_threshold)rJ   r6   rF  s      r&   ro   zEmbeddingStrategy.__init__l  sQ    .Z2Z$ "%'""' '+##' ,0)(,%r%   rR   c                     t        | d      r&| j                  r| j                  j                  }|r|S dt        j                  d      dS )z:Get embedding LLM config as dict with fallback to default.r   zopenai/text-embedding-3-smallOPENAI_API_KEYr   r   )rf   r   r   osgetenv)rJ   config_dicts     r&   _get_embedding_llm_config_dictz0EmbeddingStrategy._get_embedding_llm_config_dicty  sF    4"t{{++@@K"" 8#34
 	
r%   textsc                 r   K   ddl m} | j                         } |||| j                         d{   S 7 w)z&Get embeddings using configured methodr   get_text_embeddingsN)utilsrY  rU  r6   )rJ   rV  rY  r   s       r&   _get_embeddingsz!EmbeddingStrategy._get_embeddings  s>     .#BBD(   
 
 	
 
s   .757r2   r1   c                    |t        |      dk(  ryt        |j                        dk(  r|j                  dd      }t        |j                        dk(  r|j                  dd      }|t        j                  j                  |dd      z  }|t        j                  j                  |dd      z  }t        j                  ||j                        }d|z
  }|S )z3Compute distance matrix using vectorized operationsNr   r   r  Taxiskeepdims)r   shapereshaperX   linalgnormdotT)rJ   r2   r1   
query_normkb_normsimilarity_matrixdistance_matrixs          r&   _compute_distance_matrixz*EmbeddingStrategy._compute_distance_matrix  s      C$6!$; %%&!+/772>}""#q()11!R8M &		7GaZ^(__
"))..QQU."VV FF:wyy9 //r%   c                     |t        |      dk(  ry|t        |j                               nd}| j                  || j                  k7  r| j                  ||      | _        || _        | j                  S )z Get distance matrix with cachingNr   )r   hashtobytesrK  rL  rj  )rJ   r2   r1   kb_hashs       r&   _get_cached_distance_matrixz-EmbeddingStrategy._get_cached_distance_matrix  s|      C$6!$; 4A3L$},,./RV''/t///*.*G*GHXZg*hD''.D$***r%   r   n_syntheticc                   K   ddl m} t        |dz        }d| d| d}d}| j                  rTt	        | j                  t
              r| j                  }n-| j                  j                  | j                  j                  d}|r|j                  d	d
      nd
}|r|j                  d      nd} ||||d      }	t        j                  |	j                  d   j                  j                        }
ddl}|
d   j                         }|j!                  |       t#        dt        t%        |      dz              }|| d }|g|d|  z   }| j'                  |       d{   }|| _        ||fS 7 w)zJGenerate a point cloud representing the semantic neighborhood of the queryr   )perform_completion_with_backoffg?z	Generate z; variations of this query that explore different aspects: 'z'
        
        These should be queries a user might ask when looking for similar information.
        Include different phrasings, related concepts, and specific aspects.
        
        Return as a JSON array of strings.NrQ  r   zopenai/gpt-4o-minir   T)r   prompt_with_variablesr   json_responser   queriesr=   r   )rZ  rr  r#   rF  r   rD   r   r   rW   rH   loadschoicesmessagere   randomcopyshuffler   r   r[  _validation_queries)rJ   r   rp  rr  n_totalpromptllm_config_dictr   r   response
variationsry  other_queriesn_validationval_queriestrain_queriestrain_embeddingss                    r&   map_query_semantic_spacez*EmbeddingStrategy.map_query_semantic_space  s    : kC'( wi'bchbi j+ . ??$//40"&// !% 8 8!%!:!:#
 M\?&&z3GHau8GO''4T	2"(	
 ZZ 0 0 3 ; ; C CD
4 	 #9-224}% 1c#m"4s":;<#\MN3-,"?? "&!5!5m!DD $/ .. Es   EE.E,E.query_pointsalphac                 d   	 t        |      dk  ryt        j                  |      }t        j                  |d      t        j                  |d      |t        j
                  t        j                  j                  |t        j                  |d      z
  d            d}|S # t        $ r Y yw xY w)zEFind the minimal shape that covers all query points using alpha shaper   Nr   r^  r   )centerstdpointsradius)	r   rX   rY   meanr  r   rb  rc  	Exception)rJ   r  r  r   s       r&   compute_coverage_shapez(EmbeddingStrategy.compute_coverage_shape  s    	 < 1$
 88L1L '',Q7vvl3&&&rww|Z[?\0\cd!ef	H O 		s   B# BB# #	B/.B/	n_samplesc                     |g S g S )z*Sample points from the boundary of a shaper$   )rJ   r`  r  s      r&   _sample_boundary_pointsz)EmbeddingStrategy._sample_boundary_points(  s     =I 	r%   c                 B   g }|t        |      dk(  r|D ]  }|j                  |df        |S | j                  ||      }||D ]  }|j                  |df        |S t        j                  |d      }t        |      D ]  \  }}|j                  |||   f        |S )zLCalculate gap distances for all query variations using vectorized operationsr   r   r   r  )r   r   ro  rX   r   	enumerate)rJ   r1   r2   gapsq_embri  min_distancesr  s           r&   find_coverage_gapsz$EmbeddingStrategy.find_coverage_gaps4  s      C$6!$;) *UCL)*K ::;K][") *UCL)*K Q7 ""23 	3HAuKKa 012	3 r%   candidate_linksr  c                   $K   ddl m}m}m} ddl}g }g }	g }
i }|D ]E  }dj                  t        d|j                  xs d|j                  xs dt        |d      r(|j                  r|j                  j                  dd      nd|j                  r,|j                  j                  di       j                  dd      ndg            }|j                         s|j                  |j                   d	| j!                               j#                         }|| j$                  v r| j$                  |   ||j                  <   $|	j'                  |       |
j'                  |       H |
r| j)                         } ||
|| j*                         d{   }t-        |	|
|      D ]`  \  }}}|j                  |j                   d	| j!                               j#                         }|| j$                  |<   |||j                  <   b t        | d
      r| j.                  j0                  nd$|D ]  }|j                  |vr||j                     }|sd}nFd}d}|D ].  \  }}|$kD  s |||      }||k  s||z
  }|dz  }||z  }|dz  }0 t3        $fd|D              }|dkD  r||z  }nd}|t5        |      dkD  r|t6        j8                  j;                  |      z  }|t6        j8                  j;                  |dd      z  }t7        j<                  ||      } t7        j>                  |       }!t        | d
      r| j.                  j@                  nd}"|!|"kD  r	|!|"z
  dz  }#nd}#nd}#|d|#z
  z  }t        |d      r!|jB                  r|dz  |jB                  dz  z   }|j'                  ||f        tE        |d d      S 7 .w)z0Select links that most efficiently fill the gapsr   )cosine_distancecosine_similarityrY  r   Nr  r   r  r   :r   r   r   r=   c              3   4   K   | ]  \  }}|kD  sd   yw)r   Nr$   ).0_r\   coverage_radiuss      r&   	<genexpr>z?EmbeddingStrategy.select_links_for_expansion.<locals>.<genexpr>  s     'Rdaa/>Q'Rs   Tr]  r   r&  r   c                     | d   S r  r$   r  s    r&   r'   z>EmbeddingStrategy.select_links_for_expansion.<locals>.<lambda>  s
    !A$ r%   r  )#rZ  r  r  rY  hashlibr"  r#  r$  r  rf   r  rW   r%  stripmd5r  encode	hexdigestrI  r   rU  r6   zipr   r   r   r   rX   rb  rc  rd  r   r   r&  sorted)%rJ   r  r  r1   r  r  rY  r  r  links_to_embedtexts_to_embedlink_embeddings_maprL   r'  	cache_keyr   new_embeddingsr$  	embeddinglink_embeddingr  gaps_helpedtotal_improvement	gap_pointgap_distancenew_distanceimprovementscaled_improvementgaps_needing_helpgap_reduction_score	link_normrg  similaritiesmax_similarityoverlap_thresholdoverlap_penaltyr  s%                                       @r&   select_links_for_expansionz,EmbeddingStrategy.select_links_for_expansionR  s     	SR  # 	1D		R

 b4;D&4Idii		mR0]_IM""62.22="E]_	/ " I ??$  tyyk9+$>$E$E$GHRRTI D666151K1KI1V#DII.%%d+%%i0)	1. #'#F#F#H #6~G[]a]q]q#rrN *-^^^)\ ;%dI#KK499+Qtf(=(D(D(FGQQS	8A**951:#DII.; DK4QYCZ$++??`c $ :	/Dyy 330;N  $%!/3 
-+I|#o5'6~y'Q',6*6*EK1<q.-1CC-'1,K
- %('Rd'R$R!$q(*;>O*O'*+' !,]1Ca1G .1O OI+biinn]QR]an.bbG $&66'9#=L%'VVL%9N T[[_aiSj(O(Opt%%(99+9<M+MQR*R*+&'O ,q?/BC 4!349N9N!CK$*?*?#*EEEu.u:	/x lEEQ ss    FN?N<CN?#N?2EN?r[   c                   K   |j                   |j                  yt        |j                         dk(  st        |j                        dk(  ryt        j                  |j                  t        j
                        }t        j                  |j                   t        j
                        }|t        j                  j                  |dd      dz   z  }|t        j                  j                  |dd      dz   z  }||j                  z  j                  d	      }t        | j                  d
d      }|t        ||k\  j                               nt        |j                               }||j                  d<   t        |j                               |j                  d<   t        t        j                  |            |j                  d<   |S w)u&   Coverage-based learning score (0–1).Nr   r   )dtyper   Tr]  g:0yE>r  coverage_taucoverage_scoreavg_best_similaritymedian_best_similarity)r1   r2   r   rX   asarrayfloat32rb  rc  re  r   r   r   r~   r  r   median)rJ   r[   QDbesttaur  s          r&   r   z&EmbeddingStrategy.calculate_confidence  sf     &%*@*@*Hu""#q(C0F0F,G1,L JJu--RZZ@JJu**"**=	RYY^^AA^5<<	RYY^^AA^5<< ACC}}!}$ dkk>48/2ts{((*+E$))+DV +0&'/4TYY[/A+,27		$2H./s   GG
r   c                   K   || _         t               }g }|j                  D ]V  }|j                  |j                  vs|j                  |vs+|j                  |       |j                  |j                         X |sg S | j                  |j                  |j                        }|D cg c]  }|d   j                         |d   f c}|_        | j                  |||j                         d{   S c c}w 7 	w)z!Main entry point for link rankingr   r   N)r   r,   r   r  r   r   r5  r  r1   r2   rF   r5   r  )rJ   r[   r   	seen_urlsuncrawled_linksrL   r  gs           r&   r   zEmbeddingStrategy.rank_links+  s       E	'' 	)Dyy 2 22tyy	7Q&&t,dii(	)
 I &&""
 ?CC!qt4C 44
 
 	
 D
s)   ;C<C<AC<)C5(C<0C:1
C<c                   K   t        | d      r| j                  s|j                  j                  dd      S | j                  (| j                  | j                         d{   | _        | j                  }|j                  t        |j                        dk(  ry| j                  ||j                        }|yt        j                  |d      }d|z
  }t        j                  |      }||j                  d	<   |S 7 w)
z5Validate coverage using held-out queries with cachingr|  r   r   Nr   r   r  r   validation_confidence)rf   r|  r   rW   rM  r[  r1   r   rj  rX   r   r  )rJ   r[   val_embeddingsri  r  scoresr  s          r&   validate_coveragez#EmbeddingStrategy.validate_coverageJ  s     t234;S;S==$$\377
 ,,46:6J6J4KcKc6d0dD-:: &#e.A.A*Ba*G 77H[H[\" Q7}$ !#1F-.$$3 1es   AC?!C="BC?c                   K   |j                   j                  dd      }t        |d      r|j                  nd}||k  r7t	        |j
                        dkD  rd|j                   d<   d|j                   d	<   yt	        |j
                        |j                  k\  s|j                  syt        |d
      sg |_        |j                  j                  |       t	        |j                        dk  ryt        t        |j                  dd |j                  dd             }t        d |D              t	        |      z  }||j                   d<   t        | d      r| j                  j                  |z  nd|z  }||k  rh| j                  |       d{   }t        | d      r| j                  j                   nd}	||	kD  rd|j                   d<   d| _        yd|j                   d<   y7 Tw)z(Stop based on learning curve convergencer   r   r   r   r   !below_minimum_relevance_thresholdstopped_reasonTis_irrelevantconfidence_historyr=   FNr  r   c              3   >   K   | ]  \  }}t        ||z
          y wr!   )r   )r  abs      r&   r  z0EmbeddingStrategy.should_stop.<locals>.<genexpr>  s     GTQc!a%jGs   avg_improvementr   r   converged_validatedlow_validation)r   rW   rf   r   r   r   r   r   r  r   rA   r  r   r   r   r  r   rJ  )
rJ   r[   r   r   min_confidence_thresholdimprovement_diffsr  min_relative_improvement	val_scorevalidation_mins
             r&   r   zEmbeddingStrategy.should_stopn  s    ]]&&|S9
 QXX^  aE  QF6#L#L  LO 00S9K9K5Lq5P.QEMM*+-1EMM/* u!!"f&6&66e>Q>Q u23')E$  ''
3 u''(1, U%=%=cr%BED\D\]^]_D`!ab G5FGG#N_J``+:'(bijnpxby4;;#Q#QT^#^  @C  FP  @P 55"44U;;I LSSWYaKbT[[GGhkN >)2G./*.'2B./  <s   FG!
GAG!c                     |j                   j                  dd      }|j                   j                  dd      }t        | d      r| j                  j                  nd}t        | d      r| j                  j
                  nd}t        | d      r| j                  j                  nd}t        | d      r| j                  j                  nd}| j                  r$||kD  r|dk  r|}|S |dkD  r|}|S ||dz
  |z  z   }|S |d	z  }|S )
z4Calculate quality-based confidence score for displaylearning_scorer   r  r   r   r   r   r   r   )	r   rW   rf   r   r   r   r   r   rJ  )	rJ   r[   r  validation_scorer  quality_minquality_maxscale_factorr   s	            r&   get_quality_confidencez(EmbeddingStrategy.get_quality_confidence  s   **+;SA ==,,-DcJ HOtU]G^CCdgFMdT\F]dkkBBcfFMdT\F]dkkBBcgELTS[E\t{{AAbg""'7.'H #(
   #%(
  )NS,@L+PP

  (#-Jr%   r   c                   K   ddl m} g }g }|D ]Z  }t        |d      r"|j                  r|j                  j                  nd}|s6|j                  |dd        |j                  |       \ |sy| j                         } |||| j                         d{   }	|j                  &|	|_        t        t        t        |	                  }
ng }g }
t        |	      D ]  \  }}|t        j                  j                  |      z  }|j                  t        j                  j                  |j                  dd      z  }t        j                   ||      }t        j"                  |      | j$                  k  s|j                  |       |
j                  |        |r9t        j&                  |j                  t        j(                  |      g      |_        |
D ]*  }|j*                  j                  ||   j,                         , d| _        d| _        t        |d	      rQ|j2                  D| j5                  |j2                  t        | d
      r| j6                  j8                  nd      |_        yyy7 ڭw)z9Update embeddings and coverage metrics with deduplicationr   rX  r_   r   Ni  Tr]  r2   r   r   )rZ  rY  rf   r_   r`   r   rU  r6   r1   rA   r   r   r  rX   rb  rc  rd  r   rN  vstackrY   r0   rd   rL  rK  r2   r  r   r   r4   )rJ   r[   r   rY  	new_textsvalid_resultsr6  re   r   r  deduplicated_indicesdeduplicated_embeddingsr  new_embnew_emb_normalizedkb_normalizedr  idxs                     r&   r   zEmbeddingStrategy.update_state  sX    . 	! 	-F6=fj6QV\VeVefoo22kmG  $0$$V,		-   $BBD29>RTXThThii &"0E#'c..A(B#C  ')##% '7 	3
7%,ryy~~g/F%F" % 3 3biinnUEXEX_`kon6p p!vvm5GH 66,'$*G*GG+227;(//2	3 '&(ii1D1DbhhOfFg0h&i# ( 	=C$$]3%7%;%;<	= $( &*# 5,-%2H2H2T#'#>#>u?U?Ux  AE  GO  yPW[WbWbWtWt  VY  $ZE  3U-I js!   AI3AI3I0CI36C;I3)NN)r   )r   )r   )"rr   rs   rt   r|   rg   r   r   r   ro   rU  r   r
   r[  rj  ro  r#   r  r~   r  r  r	   r  r   r  r   r   r   r   r  r   r   r  r   r   r$   r%   r&   rE  rE  i  s   I- -iQUo@V -
 

49 
 
 S UX 4+C +PS +X[ +$Q/C Q/c Q/SV Q/f3 u 2
 
T#Y 
 s tTYZ]_dZdTeOf <sFdsF 5e$%sF 	sF
 
eD%K 	!sFj
 u H
j 
. 
TRWX\^cXcRdMe 
>"%Z "%E "%H1z 1> 1d 1fJ 5 68Z
 8ZkAR 8ZW[ 8Zr%   rE  c            	          e Zd ZdZ	 	 	 d!dee   dee   dee   fdZde	defd	Z
	 d"d
e	de	dee	   defdZde	de	dee   fdZdeeeef      de	dee   fdZedefd       Zedee	ef   fd       Zedefd       Zd#deddfdZde	fdZd$dee	ef   de	ddfdZdee	ef   fdZ d$dee	ef   de	ddfdZ!dee	ef   fdZ"d%de#deee	ef      fd Z$y)&AdaptiveCrawlerz<Main adaptive crawler that orchestrates the crawling processNcrawlerr   r   c                     || _         |xs
 t               | _        | j                  j                          |r|| _        n*| j                  | j                  j                        | _        d | _        |d u | _        y r!   )r  r   r   r   r   _create_strategyr[   _owns_crawler)rJ   r  r   r   s       r&   ro   zAdaptiveCrawler.__init__  si     0 0 $DM 11$++2F2FGDM ,0
 %_r%   strategy_namerR   c                     |dk(  r
t               S |dk(  rHt        | j                  j                  | j                  j                        }| j                  |_        |S t        d|       )z&Create strategy instance based on namer   r  )r6   rF  zUnknown strategy: )r   rE  r   r6   r   
ValueError)rJ   r  r   s      r&   r  z AdaptiveCrawler._create_strategy  sd    M)&((k)( $ ; ;;;;;H #kkHOO1-ABBr%   	start_urlr   resume_fromc           	        K   |r,t         j                  |      | _        || j                  _        nt        t	               g g |i       | _        | j
                  s1t               | _        | j
                  j                          d{    | j                  | j                  _        t        | j                  t              r|s| j                  j                  || j                  j                         d{   \  }}|| j                  _        |dd | j                  _        | j                  j                   | j                  _        	 || j                  j"                  vr| j%                  ||       d{   }|rt'        |d      r|j(                  r| j                  j*                  j-                  |       | j                  j"                  j/                  |       t'        |d      r|j0                  rt        |j0                  t2              r|j0                  j5                  dg       D cg c]  }t7        di | }}|j0                  j5                  dg       D cg c]  }t7        di | }	}| j                  j8                  j;                  ||	z          nP| j                  j8                  j;                  |j0                  j<                  |j0                  j>                  z          | j                  jA                  | j                  |g       d{    d}
|
| j                  jB                  k  r| j                  jE                  | j                         d{   }|| j                  jF                  d	<   | j                  jI                  | j                  | j                         d{   rn| j                  jK                  | j                  | j                         d{   }|sn|d   d   | j                  jL                  k  rn|d| j                  jN                   D cg c]+  \  }}|jP                  | j                  j"                  vr||f- }}}|snR| jS                  ||       d{   }|r| j                  j*                  j;                  |       tU        ||      D ]W  \  }\  }}|s| j                  j"                  j/                  |jP                         t'        |d      sI|j0                  sVg }t        |j0                  t2              rj|j0                  j5                  dg       D cg c]  }t7        di | }}|j0                  j5                  dg       D cg c]  }t7        di | }	}||	z   }n-|j0                  j<                  |j0                  j>                  z   }|D ]J  }|jP                  | j                  j"                  vs&| j                  j8                  j-                  |       L Z | j                  jA                  | j                  |       d{    |
dz  }
| j                  jV                  rE| j                  jX                  r/| j                  j[                  | j                  jX                         |
| j                  jB                  k  r| j                  jE                  | j                         d{   }t        | j                  t              r=| j                  j]                  | j                        | j                  jF                  d	<   n|| j                  jF                  d	<   t_        | j                  j"                        | j                  jF                  d
<   |
| j                  jF                  d<   | j                  jV                  rE| j                  jX                  r/| j                  j[                  | j                  jX                         | j                  | j`                  r3| j
                  r&| j
                  jc                  ddd       d{    S S S 7 7 _7 c c}w c c}w 7 H7 7 7 }c c}}w 7 c c}w c c}w 7 D7 7 ?# | j`                  r4| j
                  r'| j
                  jc                  ddd       d{  7   w w w xY ww)z&Main entry point for adaptive crawling)r   r   r   r   r   Nr   successra   internalexternalr   r   pages_crawleddepth_reachedr$   )2r   rU   r[   r   r,   r  r   
__aenter__r   r   r   rE  r  r   r2   r3   r6   r   _crawl_with_previewrf   r  r   r   r5  ra   rD   rW   r   r   extendr  r	  r   r   r   r   r   r   r   r   r  _crawl_batchr  r   r   rQ   r  r   r   	__aexit__)rJ   r  r   r  r2   r3   r6  rL   internal_linksexternal_linksdepthr   ranked_linksr  to_crawlr   r  	new_links	link_datanew_linkr  s                        r&   digestzAdaptiveCrawler.digest  s     #5DJ$DJJ# U! DJ ||*,DL,,))+++#{{ dmm%677;}}7]7]..8 2.. +;DJJ'*:12*>DJJ')-)F)FDJJ&l	?

 7 77#77	5IIgfi8V^^JJ--44V<JJ++//	:vw/FLL%fllD9GM||GWGWXbdfGg-htdlTl-hN-hGM||GWGWXbdfGg-htdlTl-hN-h JJ44;;N^<[\ !JJ44;;FLL<Q<QTZT`T`TiTi<ij --44TZZ&JJJ E$++///#'==#E#Edjj#QQ
3=

""<0 224::t{{KKK &*]]%=%=djj$++%VV#  ?1%(F(FF >JJb4;;KbKb=c HkdE"iitzz/F/FF "5M H H   %)$5$5h$FFJJ--44[A .1h-G R)	q! JJ3377		B&vw7FLL,.	#-fllD#AY_YeYeYiYijtvxYy5zId6GY6G5zN5zY_YeYeYiYijtvxYy5zId6GY6G5zN5z0>0OI 170E0EH]H]0]I 1: !RH'/}}DJJ<S<S'S(,

(@(@(G(G(Q!R!R* --44TZZMMM
 ;;))dkk.D.DJJOODKK$:$:;{ $++///@ $(==#E#Edjj#QQN $--):;37==3W3WX\XbXb3c

""<0 4B

""<025djj6M6M2NDJJ/27DJJ/ {{%%$++*@*@

 6 67:: !!dllll,,T4>>> '3!u ,2 J .i-h K R L  WH G 6{5z N R* ? !!dllll,,T4>>> '3!sP  BcaA.c5a6Ac.b 6a7B<b 3a b %a7B%b a$Ab %a'&Ab 6a*79b 0a-1Ab 60a0&b a6Ab ;b b :b a9 b =a>Ab ,Ab ?b A=b ?(b 'b(D)b 8c	b	
ccb b 'b *b -b 0b 9b b 	c9ccccrd   c                   K   t        t        dd|dddd      d      }	 | j                  j                  ||       d{   }t	        |d	      r|j
                  r|j
                  d
   }t	        |d      rH|j                  r<|j                  d   D cg c]  }|j                  d      s| c}|j                  d<   |S 7 c c}w # t        $ r}t        d| d|        Y d}~yd}~ww xY ww)z%Crawl a URL with link preview enabledTFr   2   )include_internalinclude_externalr   concurrencytimeout	max_linksverbose)link_preview_configscore_links)rd   r   N_resultsr   ra   r  r%  zError crawling z: )
r   r   r  arunrf   r$  ra   rW   r  r4  )rJ   rd   r   r   r6  rL   es          r&   r  z#AdaptiveCrawler._crawl_with_preview  s     ! 1!%!&! 
	<<,,V,DDFvz*v+ vw'FLL=C\\*=U+oTY]YaYabmYnD+oZ( M E ,p
  	OC51#./	sY   C1 C
 CAC
 C-C1C
 C1C
 C
 
	C.C)$C1)C..C1links_with_scoresc                   K   g }|D ]2  \  }}| j                  |j                  |      }|j                  |       4 t        j                  |ddi d{   }g }|D ]  }	t        |	t              rQt        |	d      r|	j                  r|j                  |	       =t        dt        |	d      r|	j                  nd        dt        |	t              sut        d|	         |S 7 w)	zCrawl multiple URLs in parallelreturn_exceptionsTNr  zSkipping failed crawl: rd   unknownzError in batch crawl: )r  r  r   asynciogatherr   r   rf   r  r4  rd   r  )
rJ   r'  r   tasksrL   r  taskresultsr  r6  s
             r&   r  zAdaptiveCrawler._crawl_batch  s     , 	KD%++DIIu=DLL	  FFF  	9F&+.69-&..!((03'&RWBXFJJ^g3hijFI..vh78	9  Gs   AC&C$A<C&C&c                 h    | j                   r&| j                   j                  j                  dd      S y)zCurrent confidence levelr   r   )r[   r   rW   r   s    r&   r   zAdaptiveCrawler.confidence  s+     ::::%%)),<<r%   c                 r   | j                   si S t        d | j                   j                  D              }t        | j                   j                        |t        | j                   j
                        t        | j                   j
                  j                               t        | j                   j                        | j                  | j                   j                  j                  dd      | j                   j                  j                  dd      | j                   j                  j                  dd      d	S )zDetailed coverage statisticsc              3   b   K   | ]'  }t        |j                  j                  xs d        ) yw)r   N)r   r_   r`   )r  r6  s     r&   r  z1AdaptiveCrawler.coverage_stats.<locals>.<genexpr>  s,      #
 ,,23#
s   -/r   r   r   r   )	r
  total_content_lengthunique_termstotal_termsr   r   r   r   r   )r[   r   r   r   r   r(   r   r   r   r   rW   )rJ   r3  s     r&   coverage_statszAdaptiveCrawler.coverage_stats  s     zzI" #
**33#
  
 !!8!89$8

 ; ;<tzz::AACD !9!9://

**..z3?::--11-E**,,00sC

 
	
r%   c                     t        | j                  t              r| j                  j                  S | j                  | j
                  j                  k\  S )z(Check if current knowledge is sufficient)r   r   rE  rJ  r   r   r   r   s    r&   is_sufficientzAdaptiveCrawler.is_sufficient  s>     dmm%67==333 ??dkk&F&FFFr%   detailedc                      j                   st        d       y	 ddlm} ddlm}  |       }d}|s(|r% d j                   j                   d	
      }|j                  ddd       |j                  dd        j                  }|j                  dt        |j                  dd                   |j                  dt        |j                  dd                   |j                  dt        |j                  dd                   |j                  d|j                  dd      dd       |j                  dt        |j                  dd                   |j                  dd       t         j                  t              r|j                  d|j                  dd      d        |j                  d! j                   j                   j                  d"d      d#       |j                  d$ j                   j                   j                  d%d      d&       |j                  d' j                   j                   j                  d(d      d        |j                  dd       |j                  d) j"                  rd*nd+       n|j                  d|j                  dd      d        |j                  d,|j                  d-d      d        |j                  d.|j                  d/d      d        |j                  d0|j                  d1d      d        |j                  dd       |j                  d) j"                  rd2nd+       j                  |       yt        d3       t        d4 j                   j                   d	       t        d5       t        d6       t        d7t%         j                   j&                                t        d8t%         j                   j(                                t        d9 j                   j*                          t-         fd: j                   j.                  D              }t-         j                   j0                  j3                               }	t%         j                   j0                        }
t        d;       t        d<|dd=       t        d>|	d       t        d?|
d       |	dkD  rt        d@|
|	z  d        t         j                  t              rt        dA       t        dB j                   j                   j                  d"d      d#       t        dC j                   j                   j                  d%d      d&       t        dD j                   j                   j                  dEd      d&       t        dF        j"                  r>|r!j                  dG j4                  d dH       nXt        dG j4                  d dI       n=|r!j                  dG j4                  d dJ       nt        dG j4                  d dK       t        dL j                   j                   j                  dMd      d        t        dN j                   j                   j                  d(d      d        ngt        dO        j                  j7                   j                   j                  j9                               }|D ]  } j                   j0                  j                  |d      } j                   j:                  j                  |d      }|dkD  r|rNj                  dP| dQ| dR j                   j*                   dS| j                   j*                  z  dTdU| dV       t        dP| dQ| dR j                   j*                   dW| j                   j*                  z  dTdX| dV       |rj                  dP| dY       t        dP| dZ        t        dF        j"                  rd[nd\}|r3 j"                  rd]nd^}j                  dG j4                  d d_|        nt        dG j4                  d d_|        t        d` j                   j                   j                  d-d      d        t        da j                   j                   j                  d/d      d        t        db j                   j                   j                  d1d      d         j                   j<                  rt-         j                   j<                        t%         j                   j<                        z  }t        dc       t        dd|d&       t        de j                   j                   j                  d1d      d        |rt        df       |rj                  dg       nt        dh       t        di       t        dj       t?         j                   j0                  jA                         dk dl      ddm }tC        |dn      D ]i  \  }\  }} j                   j:                  j                  |d      }|r j                  do|dpdq| dr| ds| dt	       Qt        do|dpdu| dv| ds| dt	       k t        dwt%         j                   j&                         dx       tC         j                   jD                  dn      D ]  \  }}|t%         j                   j<                        k  r j                   j<                  |dnz
     nd}|r.j                  do| dy| dz       |j                  d{| d|       vt        do| d}|        t        d~| d        t        d       i } j                   j:                  j3                         D ]  }|j                  |d      dnz   ||<    t?        |jG                               D ]  }||   }t        d| d| d         j                   jH                  rbt        d       t        d j                   jH                          t        dt%         j                   jJ                                 j                   jL                  -t        d j                   jL                  jN                          nt        d       t        dt%         j                   jP                                t        d j4                  d         j                   jJ                  rbt        d       tC         j                   jJ                  dd dn      D ]1  \  }}|rj                  do| d| d       !t        do| d}|        3 t        d3       y# t        $ r d}Y Vw xY w)zPrint comprehensive statistics about the knowledge base
        
        Args:
            detailed: If True, show detailed statistics including top terms
        zNo crawling state available.Nr   )Console)TableTFzAdaptive Crawl Stats - Query: '')r  Metriccyan)styleno_wrapValuemagenta)r@  zPages Crawledr
  zUnique Termsr4  zTotal Termsr5  zContent Lengthr3  ,z charszPending Linksr   r   
Confidencer   z.2%zAvg Min Distanceavg_min_distancez.3fzAvg Close Neighborsavg_close_neighborsz.1fzValidation Scorer  zIs Sufficient?z[green]Yes (Validated)[/green]z[red]No[/red]Coverager   Consistencyr   
Saturationr   z[green]Yes[/green]zQ
================================================================================z$Adaptive Crawl Statistics - Query: 'zP================================================================================z
[*] Basic Statistics:z  Pages Crawled: z  Pending Links: z  Total Documents: c              3   R   K   | ]  }t        j                  |               y wr!   )r   _get_content_from_result)r  r6  rJ   s     r&   r  z.AdaptiveCrawler.print_stats.<locals>.<genexpr>P  s)      ' D11&9:'s   $'z
[*] Content Statistics:z  Total Content: z charactersz  Total Words: z  Unique Terms: z  Vocabulary Richness: z 
[*] Semantic Coverage Analysis:z  Average Min Distance: z  Avg Close Neighbors (< 0.3): z$  Avg Very Close Neighbors (< 0.2): avg_very_close_neighborsz
[*] Confidence Metrics:z  Overall Confidence: z [green][VALIDATED][/green]z [VALIDATED]z [red][NOT VALIDATED][/red]z [NOT VALIDATED]z  Learning Score: r  z  Validation Score: z
[*] Query Coverage:z  'z': found in /z docs ([green]z.0%z[/green]), z occurrencesz docs (z), z': [red][X] not found[/red]z': [X] not foundz[OK]z[!!]z[green][OK][/green]z[red][!!][/red]r  z  Coverage Score: z  Consistency Score: z  Saturation Score: z
[*] Crawl Efficiency:z  Avg New Terms per Page: z  Information Saturation: zQ
--------------------------------------------------------------------------------z*[bold cyan]DETAILED STATISTICS[/bold cyan]zDETAILED STATISTICSzP--------------------------------------------------------------------------------z
[+] Top 20 Terms by Frequency:c                     | d   S r  r$   r  s    r&   r'   z-AdaptiveCrawler.print_stats.<locals>.<lambda>  s    VWXYVZ r%   r  r   r   z  2dz. [yellow]'z'[/yellow]: z occurrences in z docsz. 'z': z
[+] URLs Crawled (z):z. [cyan]z[/cyan]z     -> Added [green]z[/green] new termsz. z     -> Added z
 new termsz%
[+] Document Frequency Distribution:z  Terms in z docs: z termsz 
[+] Semantic Coverage Analysis:z  Embedding Model: z  Query Variations: z  Knowledge Embeddings: z  Knowledge Embeddings: Nonez  Semantic Gaps: z  Coverage Achievement: z
[+] Query Space (samples):r   z
. [yellow]z	[/yellow]))r[   r4  rich.consoler;  
rich.tabler<  ImportErrorr   
add_columnr6  add_rowrg   rW   r   r   rE  r   r8  r   r   r   r.   r   r   r(   r   r   r   r   r*   r/   r  rE   r  r0   r*  r6   r3   r1   r`  r5   )rJ   r9  r;  r<  consoleuse_richtablestatsr3  total_wordsr4  r   r   r   r   statusstatus_coloredavg_new_terms	top_termsr  freqrd   r,  	df_countscounteqs   `                         r&   print_statszAdaptiveCrawler.print_stats  sU    zz01	,(iGH H"A$**BRBRASST UVEXVTBWI6 ''EMM/3uyy!/L+MNMM.#eii.J*KLMM-UYY}a-H)IJMM*uyy9OQR/STU.VV\,]^MM/3uyy!/L+MNMM"b! $--):;luyyq/I#.NP0TZZ5G5G5K5KL^`a5bcf4gi3

8J8J8N8NOdfg8hil7mo0TZZ5G5G5K5KLcef5ghk4lnb"%.TXTfTf0Pl{| luyyq/I#.NPjUYYz1-Ec,JLm		-0KC/PRluyyq/I#.NPb"%.HZHZ0D`opMM%  - 89I9I8J!LM&M +,%c$***A*A&B%CDE%c$***B*B&C%DEF'

(B(B'CDE $' '"jj77' $  djj99@@BCKtzz::;L-/%&:1%=[IJOK?34$\!$456Q/[0H/MNO $--):;9;01C1C1G1GHZ\]1^_b0cde7

8J8J8N8NOdfg8hil7mno<TZZ=O=O=S=STnpq=rsv<wxy 13%%(>ts>SSn&op 6ts6K<XY(>ts>SSn&op 6ts6KK[\]*4::+=+=+A+ABRTU+VWZ*[\],TZZ-?-?-C-CD[]^-_`c,def -/"mm55djj6F6F6L6L6NO' @D4488qAB88<<T1EBAv##MMCv\"QtzzGaGaFbbpqstxt~t~  uO  uO  rO  PS  qT  T_  `b  _c  co  +p  q!Cv\"Qtzz?Y?Y>ZZabdeieoeoeeb  AD  bE  EH  IK  HL  LX  #Y  Z##MMCv5P*QR!Cv-=">?@ 13#'#5#56>B>P>P%:VgNMM$:4??3:OqQ_P`"ab24??32GqQR*4::+=+=+A+A*a+PQT*UVW-djj.@.@.D.D]TU.VWZ-[\],TZZ-?-?-C-CLRS-TUX,YZ[ zz++ #DJJ$@$@ AC

HdHdDe e/12=2EFG24::3E3E3I3I,XY3Z[^2_`am$MM"NO/0f 89"4::#>#>#D#D#FNdhijmkmn	'0A'> VOA|d88<<T1EB1R&D6dVScdfcggl&mn1R&D6TF:J2$eTUV ,S1H1H-I,J"MN'

(>(>B FFAsEF#djjNjNjJkEk

 < <QqS AqrI1#XcU'&BC(=i[HZ&[\1#Ruo.ykDEF >?	**99@@B =B$-MM"a$81$<IbM= !!12 BB%bMEKt75'@AB
 ::--=>/

0J0J/KLM0TZZ5P5P1Q0RSTzz//; 89Q9Q9W9W8XYZ <>-c$**2J2J.K-LMN4T__S4IJK zz22<=%.tzz/J/J2A/NPQ%R 6EAr' '1#Zt9.M N %1#Rtn 5	6 - s  	H	s   u8 8vvc                     t        |d      rQ|j                  rEt        |j                  d      r|j                  j                  xs dS t        |j                        S y)z,Helper to safely extract content from resultr_   r`   r   )rf   r_   r`   rg   )rJ   r6  s     r&   rL  z(AdaptiveCrawler._get_content_from_result  sG    6:&6??v7339r9v''r%   filepathformatc                 $   | j                   r| j                   j                  st        d       yt        |      }|j                  j                  dd       |dk(  rt        |dd      5 }| j                   j                  D ]<  }| j                  |      }|j                  t        j                  |d	
      dz          > 	 ddd       t        dt        | j                   j                         d|        yt        d|       # 1 sw Y   FxY w)zExport the knowledge base to a file
        
        Args:
            filepath: Path to save the file
            format: Export format - currently supports 'jsonl'
        zNo knowledge base to export.NTr9   jsonlr<   utf-8encodingF)ensure_ascii
z	Exported z documents to zUnsupported export format: )r[   r   r4  r   r?   r@   rG   _crawl_result_to_export_dictwriterH   dumpsr   r  )rJ   re  rf  rP   r6  result_dicts         r&   export_knowledge_basez%AdaptiveCrawler.export_knowledge_base  s     zz!:!:01>dT:Whg6 P!"jj77 PF"&"C"CF"KKGGDJJ{G$NO	PP Ic$**";";<=^H:VW:6(CDDP Ps   *ADDc                    t        |dd      t        |dd      t        |dd      | j                  r| j                  j                  ndd}t        |d      rU|j                  rIt        |j                  d	      r|j                  j
                  |d
<   nt        |j                        |d
<   nd|d
<   t        |d      r|j                  |d<   t        |d      r|j                  |d<   | j                  r|d   | j                  j                  v r+| j                  j                  j                  |d         dz   nd| j                  j                  j                  dd      | j                  j                  d|d<   |S )z.Convert CrawlResult to a dictionary for exportrd   r   	timestampNr  T)rd   rt  r  r   r_   r`   re   rb   ra   r   r   r   )r0   confidence_at_crawlr.   crawl_metadata)r   r[   r   rf   r_   r`   rg   rb   ra   r0   indexr   rW   r.   )rJ   r6  export_dicts      r&   rn  z,AdaptiveCrawler._crawl_result_to_export_dict  s^    65"- d;vy$7)-TZZ%%	
 6:&6??v7)/)E)EI&),V__)=I&%'K	" 6:&&,ooK
# 67##)<<K  ::WbchWimqmwmw  nD  nD  XDtzz55;;K<NORSS  JK'+zz'9'9'='=lA'N#'::#=#=-K() r%   c                    t        |      }|j                         st        d|       |dk(  rg }t        |dd      5 }|D ]J  }|j	                         st        j                  |      }| j                  |      }|j                  |       L 	 ddd       | j                  st               | _	        | j                  j                  j                  |       t        j                  | j                  j!                  | j                  |             t#        dt%        |       d|        yt'        d	|       # 1 sw Y   xY w)
zImport a knowledge base from a file
        
        Args:
            filepath: Path to the file to import
            format: Import format - currently supports 'jsonl'
        zFile not found: rh  rT   ri  rj  Nz	Imported z documents from zUnsupported import format: )r   existsFileNotFoundErrorrG   r  rH   rv  _import_dict_to_crawl_resultr   r[   r   r   r  r+  runr   r   r4  r   r  )rJ   re  rf  imported_resultsrP   linedatamock_results           r&   import_knowledge_basez%AdaptiveCrawler.import_knowledge_base!  s     > #&6xj$ABBW!hg6 =! =Dzz|#zz$/&*&G&G&M(//<== ::'\
 JJ%%,,-=> KK224::?OPQIc"2344DXJOP:6(CDD)= =s   D:9D::Er  c                 @     G d d       G fdd      } ||      S )z0Convert imported dict back to a mock CrawlResultc                       e Zd Zd Zy)BAdaptiveCrawler._import_dict_to_crawl_result.<locals>.MockMarkdownc                     || _         y r!   rm   rn   s     r&   ro   zKAdaptiveCrawler._import_dict_to_crawl_result.<locals>.MockMarkdown.__init__G  rp   r%   Nrq   r$   r%   r&   ru   r  F  rv   r%   ru   c                       e Zd Z fdZy)EAdaptiveCrawler._import_dict_to_crawl_result.<locals>.MockCrawlResultc                 $   |j                  dd      | _         |j                  dd            | _        |j                  di       | _        |j                  di       | _        |j                  dd      | _        |j                  d      | _        y )	Nrd   r   re   ra   rb   r  Trt  )rW   rd   r_   ra   rb   r  rt  )rJ   r  ru   s     r&   ro   zNAdaptiveCrawler._import_dict_to_crawl_result.<locals>.MockCrawlResult.__init__K  sr    88E2. ,TXXi-D E!XXgr2
 $R 8#xx	48!%+!6r%   Nrq   rz   s   r&   r{   r  J  s    7r%   r{   r$   )rJ   r  r{   ru   s      @r&   r|  z,AdaptiveCrawler._import_dict_to_crawl_resultD  s$    	, 	,	7 	7 t$$r%   top_kc                 n   | j                   r| j                   j                  sg S g }t        | j                   j                  j	                         j                               }t        | j                   j                        D ]  \  }}|j                  j                  xs dj	                         }t        |j                               }t        ||z        }|r|t        |      z  nd}	|j                  |j                  |	|j                  j                  |d        |j                  d d       |d| S )z'Get most relevant content for the queryr   r   )rd   r  re   rw  c                     | d   S )Nr  r$   r  s    r&   r'   z6AdaptiveCrawler.get_relevant_content.<locals>.<lambda>n  s
    qz r%   Tr  N)r[   r   r,   r   r   r?  r  r_   r`   r   r   rd   r  )
rJ   r  scored_docsr   r  r6  re   content_termsr  r  s
             r&   get_relevant_contentz$AdaptiveCrawler.get_relevant_contentU  s
   zz!:!:I $****00288:;"4::#<#<= 	IAv339r@@BG0M +56G2=Gc+..3Ezz!??77	  	  	14@6E""r%   )NNNr!   )F)rh  )r   )%rr   rs   rt   r|   r   r   r   r   ro   rg   r  r   r  r   r  r   r	   r   r~   r  r   r   r   r
   r6  r   r8  rc  rL  r   r   rr  rn  r  r|  r#   r  r$   r%   r&   r  r    s   F 7;4859-"?3-!.1- $M2-(Cc Cm C" =AP?*-P?&)P? -5SMP? FPP?dS  +AV @DtU{9K4L UX ]abm]n 0 E   
S#X 
 
, Gt G GI!D I!T I!V# EeCI.> E EZ^ E6#d38n #J!EeCI.> !E !EZ^ !EF%c3h %"## #d4S>6J #r%   r  )+r|   abcr   r   typingr   r   r   r   r	   r
   r   dataclassesr   r   r+  picklerR  rH   r   collectionsr   r   r=  pathlibr   crawl4ai.async_webcrawlerr   crawl4ai.async_configsr   r   r   crawl4ai.modelsr   r   numpyrX   r   r   r   r   rE  r  r$   r%   r&   <module>r     s    $ ? ? ? (   	   , 	  5 Q Q - 
{
 {
 {
| z
 z
 z
zC 0w/- w/t	K
Z K
Z\x	# x	#r%   