
    /iF                    V   d Z ddlmZmZ ddlmZmZmZmZm	Z	m
Z
mZ ddlmZmZ ddlZddlZddlZddlZddlZddlmZmZ ddlZddlmZ ddlmZ dd	lmZmZ dd
lm Z m!Z! e G d d             Z"e G d d             Z# G d de      Z$ G d de$      Z% G d de$      Z& G d d      Z'y)a  
Adaptive Web Crawler for Crawl4AI

This module implements adaptive information foraging for efficient web crawling.
It determines when sufficient information has been gathered to answer a query,
avoiding unnecessary crawls while ensuring comprehensive coverage.
    )ABCabstractmethod)DictListOptionalSetTupleAnyUnion)	dataclassfieldN)defaultdictCounter)Path)AsyncWebCrawler)CrawlerRunConfigLinkPreviewConfig)LinkCrawlResultc                      e Zd ZU dZ ee      Zee   e	d<    ee
      Zee   e	d<    ee
      Zee   e	d<   dZee	d<    ee      Zeeef   e	d<    ed	       Zeeef   e	d
<    ed       Zeeef   e	d<    ed       Zeeee   f   e	d<   dZee	d<    ee
      Zee   e	d<    ee
      Zee   e	d<   dZee   e	d<   dZee   e	d<    ee
      Z ee   e	d<   dZ!ee   e	d<    ee
      Z"ee#ee   ef      e	d<   dZ$ee	d<   de%ee&f   fdZ'e(de%ee&f   dd fd       Z)e*dedefd       Z+e*d efd!       Z,y)"
CrawlStatez-Tracks the current state of adaptive crawling)default_factorycrawled_urlsknowledge_basepending_links querymetricsc                       t        t              S Nr   int     Z/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/crawl4ai/adaptive_crawler copy.py<lambda>zCrawlState.<lambda>$   s    [QTEU r$   term_frequenciesc                       t        t              S r    r!   r#   r$   r%   r&   zCrawlState.<lambda>%   s    UXIY r$   document_frequenciesc                       t        t              S r    )r   setr#   r$   r%   r&   zCrawlState.<lambda>&   s    kZ]N^ r$   documents_with_termsr   total_documentsnew_terms_historycrawl_orderNkb_embeddingsquery_embeddingsexpanded_queriescoverage_shapesemantic_gapsembedding_modelpathc           
         t        |      }|j                  j                  dd       i dt        | j                        d| j
                  D cg c]  }| j                  |       c}d| j                  D cg c]  }|j                          c}d| j                  d| j                  dt        | j                        d	t        | j                        d
| j                  j                         D ci c]  \  }}|t        |       c}}d| j                   d| j"                  d| j$                  d| j&                  | j&                  j)                         ndd| j*                  | j*                  j)                         ndd| j,                  d| j.                  d| j0                  }t3        |d      5 }t5        j6                  ||d       ddd       yc c}w c c}w c c}}w # 1 sw Y   yxY w)z"Save state to disk for persistenceTparentsexist_okr   r   r   r   r   r'   r)   r,   r-   r.   r/   r0   Nr1   r2   r4   r5   w   )indent)r   parentmkdirlistr   r   _crawl_result_to_dictr   
model_dumpr   r   dictr'   r)   r,   itemsr-   r.   r/   r0   tolistr1   r2   r4   r5   openjsondump)selfr6   crlinkkv
state_dictfs           r%   savezCrawlState.save5   s   Dz$6
D!2!23
H[H[\"t99"=\
 D<N<NODdoo/O
 TZZ	

 t||
 T%:%: ;
 #D)B)B$C
 #D<U<U<[<[<]$^DAqQQZ$^
 t33
  !7!7
 4++
 D<N<N<ZT//668`d
 $BWBWBc 5 5 < < >im
  5 5
  T//!
" t33#

( $_ 	/IIj!A.	/ 	/% ]O
 %_	/ 	/s   G 9G%7G*
>G00G9returnc                    t        |      }t        |d      5 }t        j                  |      }ddd        |        }t	        d         |_        |d   D cg c]  }| j                  |       c}|_        |d   D cg c]  }t        di | c}|_	        |d   |_
        |d   |_        t        t        |d         |_        t        t        |d	         |_        t        t        |d
   j!                         D ci c]  \  }}|t	        |       c}}      |_        |d   |_        |d   |_        |d   |_        ddl}	|j-                  d      |	j/                  |d         nd|_        |j-                  d      |	j/                  |d         nd|_        |j-                  dg       |_        |j-                  dg       |_        |j-                  dd      |_        |S # 1 sw Y   xY wc c}w c c}w c c}}w )zLoad state from diskrNr   r   r   r   r   r'   r)   r,   r-   r.   r/   r   r0   r1   r2   r4   r5   r   r#   )r   rF   rG   loadr+   r   _dict_to_crawl_resultr   r   r   r   r   r   r"   r'   r)   rD   r,   r-   r.   r/   numpygetarrayr0   r1   r2   r4   r5   )
clsr6   rO   rN   stated	link_dictrL   rM   nps
             r%   rT   zCrawlState.loadR   s    Dz$_ 	&1J	&  N!;<FPQaFbc 9 9! <cBL_B]^Yt0i0^ )"9-!,S*=O2P!Q%0jAW6X%Y"%0ZXnMoMuMuMw6xTQq#a&y6x%y" *+< =",-@"A&}5 	GQ~~VeGfGrbhhz/'BCx|MW^^\nMoM{*5G*H!I  BF!+0BB!G(nn_bA */@" E1	& 	&
  d^
 7ys   G G->G2<G7 G*rJ   c                 @   d}t        | d      rN| j                  rBt        | j                  d      r| j                  j                  }nt        | j                        }| j                  |t        | d      r| j
                  ni t        | d      r| j                  dS i dS )z(Convert CrawlResult to serializable dictr   markdownraw_markdownlinksmetadataurlcontentra   rb   )hasattrr_   r`   strrd   ra   rb   )rJ   markdown_contents     r%   rA   z CrawlState._crawl_result_to_dictp   s     2z"r{{r{{N3#%;;#;#; #&r{{#3  66'!(W!5RXX2'.r:'>	
 	
 EG	
 	
r$   r[   c           	           G d d       G fdd      } || d   | j                  dd      | j                  di       | j                  d	i       
      S )z Convert dict back to CrawlResultc                       e Zd Zd Zy)6CrawlState._dict_to_crawl_result.<locals>.MockMarkdownc                     || _         y r    r`   rI   re   s     r%   __init__z?CrawlState._dict_to_crawl_result.<locals>.MockMarkdown.__init__   
    $+!r$   N__name__
__module____qualname__ro   r#   r$   r%   MockMarkdownrk          ,r$   ru   c                       e Zd Z fdZy)9CrawlState._dict_to_crawl_result.<locals>.MockCrawlResultc                 J    || _          |      | _        || _        || _        y r    )rd   r_   ra   rb   )rI   rd   re   ra   rb   ru   s        r%   ro   zBCrawlState._dict_to_crawl_result.<locals>.MockCrawlResult.__init__   s$     ,W 5"
 (r$   Nrq   ru   s   r%   MockCrawlResultrx      s    )r$   r{   rd   re   r   ra   rb   rc   )rW   )r[   r{   ru   s     @r%   rU   z CrawlState._dict_to_crawl_result   sW    	, 	,	) 	) %EE)R(%%$UU:r*	
 	
r$   )-rr   rs   rt   __doc__r   r+   r   r   rg   __annotations__r@   r   r   r   r   r   r   rC   r   r   floatr'   r"   r)   r,   r-   r.   r/   r0   r   r
   r1   r2   r3   r4   r	   r5   r   r   rP   classmethodrT   staticmethodrA   rU   r#   r$   r%   r   r      s   7"37L#c(7(-d(CND%C %d ;M4:;E3O %d ;GT#u*; (-=U'Vd38nV+0AY+Z$sCx.Z05F^0_$sCH}-_OS $)#>tCy>"48Kc8 $(M8C='&*hsm*"'"=d3i=$(NHSM(5:45PM4d5k5012POS/sDy) /: c4i( \  : 
+ 
$ 
 
" 
 
 
r$   r   c                      e Zd ZU dZdZeed<   dZeed<   dZ	eed<   dZ
eed	<   d
Zeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZeed <   dZee   ed!<   d"Zeed#<   d$Zeed%<   dZeed&<   dZ eed'<   d(Z!eed)<   dZ"eed*<   dZ#eed+<   d$Z$eed,<   d
Z%eed-<   dZ&eed.<   dZ'eed/<   d0Z(eed1<   d2Z)eed3<   d4 Z*y)5AdaptiveConfigz#Configuration for adaptive crawlingffffff?confidence_threshold   	max_depth   	max_pages   top_k_links皙?min_gain_thresholdstatisticalstrategy皙?saturation_thresholdconsistency_threshold皙?coverage_weight333333?consistency_weightsaturation_weight      ?relevance_weightnovelty_weight皙?authority_weightF
save_stateN
state_path&sentence-transformers/all-MiniLM-L6-v2r5   embedding_llm_config
   n_query_variations333333?coverage_thresholdalpha_shape_alphaembedding_coverage_radiusg      @embedding_k_expembedding_nearest_weightembedding_top_k_weightembedding_overlap_threshold"embedding_min_relative_improvementembedding_validation_min_score embedding_quality_min_confidenceffffff? embedding_quality_max_confidence-?embedding_quality_scale_factorc                    d| j                   cxk  rdk  sJ d        J d       | j                  dkD  sJ d       | j                  dkD  sJ d       | j                  dkD  sJ d       d| j                  cxk  rdk  sJ d        J d       | j
                  | j                  z   | j                  z   }t        |dz
        d	k  s
J d
|        | j                  | j                  z   | j                  z   }t        |dz
        d	k  s
J d|        d| j                  cxk  rdk  sJ d        J d       | j                  dkD  sJ d       d| j                  cxk  rdk  sJ d        J d       d| j                  cxk  rdk  sJ d        J d       t        | j                  | j                  z   dz
        d	k  sJ d       d| j                   cxk  rdk  sJ d        J d       d| j"                  cxk  rdk  sJ d        J d       d| j$                  cxk  rdk  sJ d        J d       d| j&                  cxk  rdk  sJ d        J d       d| j(                  cxk  rdk  sJ d        J d       | j*                  dkD  sJ d       y)z!Validate configuration parametersr      z,confidence_threshold must be between 0 and 1zmax_depth must be positivezmax_pages must be positiveztop_k_links must be positivez*min_gain_threshold must be between 0 and 1      ?gMbP?z$Coverage weights must sum to 1, got z(Link scoring weights must sum to 1, got z1embedding_coverage_radius must be between 0 and 1z embedding_k_exp must be positivez0embedding_nearest_weight must be between 0 and 1z.embedding_top_k_weight must be between 0 and 1zEmbedding weights must sum to 1z3embedding_overlap_threshold must be between 0 and 1z:embedding_min_relative_improvement must be between 0 and 1z6embedding_validation_min_score must be between 0 and 1z8embedding_quality_min_confidence must be between 0 and 1z8embedding_quality_max_confidence must be between 0 and 1z/embedding_quality_scale_factor must be positiveN)r   r   r   r   r   r   r   r   absr   r   r   r   r   r   r   r   r   r   r   r   r   )rI   
weight_sums     r%   validatezAdaptiveConfig.validate   s   D--22b4bb2b4bb2~~!?#??!~~!?#??!!#C%CC#D++0q0^2^^0^2^^0 ))D,C,CCdF\F\\
:#$u,a0TU_T`.aa,**T-@-@@4CXCXX
:#$u,e0XYcXd.ee, 4115A5j7jj5j7jj5##a'K)KK'D116Q6j8jj6j8jj6D//414f6ff4f6ff440043N3NNQTTUX]]  	A  `A  	A]D4499p;pp9p;pp94::>Q>|@||>|@||>D77<1<v>vv<v>vv<D99>Q>z@zz>z@zz>D99>Q>z@zz>z@zz>22Q6i8ii6r$   )+rr   rs   rt   r|   r   r~   r}   r   r"   r   r   r   r   rg   r   r   r   r   r   r   r   r   r   boolr   r   r5   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r$   r%   r   r      su   -"%%%IsIsK ##!Hc! #&%%#&5& OU  ##"u" "e!NE!e! J $J$ DOSC+/(4./   $$"u" (+u* !OU  '*e)$'E'
 *.-
 14&3 -0"E/
 /2$e1.2$e2,1"E1jr$   r   c            
           e Zd ZdZededefd       Zededede	e
eef      fd       Zedededefd       Zedede	e   dd	fd
       Zy	)CrawlStrategyz+Abstract base class for crawling strategiesrZ   rQ   c                    K   yw)z@Calculate overall confidence that we have sufficient informationNr#   )rI   rZ   s     r%   calculate_confidencez"CrawlStrategy.calculate_confidence         	   configc                    K   yw)z/Rank pending links by expected information gainNr#   rI   rZ   r   s      r%   
rank_linkszCrawlStrategy.rank_links  r   r   c                    K   yw)!Determine if crawling should stopNr#   r   s      r%   should_stopzCrawlStrategy.should_stop  r   r   new_resultsNc                    K   yw)#Update state with new crawl resultsNr#   )rI   rZ   r   s      r%   update_statezCrawlStrategy.update_state  r   r   )rr   rs   rt   r|   r   r   r~   r   r   r   r	   r   r   r   r   r   r   r#   r$   r%   r   r      s    5
 u   j . TRWX\^cXcRdMe   z > d   
 kAR W[  r$   r   c            	          e Zd ZdZd ZdedefdZdedefdZdedefdZ	dedefdZ
ded	edeeeef      fd
ZdededefdZdededefdZdedefdZded	edefdZdedee   ddfdZdedee   fdZdedee   fdZy)StatisticalStrategyz1Pure statistical approach - no LLM, no embeddingsc                 .    i | _         d| _        d| _        y )Ng333333?g      ?)	idf_cachebm25_k1bm25_brI   s    r%   ro   zStatisticalStrategy.__init__  s    r$   rZ   rQ   c                 
  K   |j                   sy| j                  |      }| j                  |      }| j                  |      }||j                  d<   ||j                  d<   ||j                  d<   d|z  d|z  z   d|z  z   }|S w)z@Calculate confidence using coverage, consistency, and saturation        coverageconsistency
saturationr   r   )r   _calculate_coverage_calculate_consistency_calculate_saturationr   )rI   rZ   r   r   r   
confidences         r%   r   z(StatisticalStrategy.calculate_confidence  s     ##++E211%8//6
 %-j!'2m$&0l# 8^cK&77#
:JJ
s   BBc                    |j                   r|j                  dk(  ry| j                  |j                   j                               }|syg }|j                  r#t        |j                  j                               nd}|D ]  }|j                  j                  |d      }|j                  j                  |d      }|dkD  rd||j                  z  }|dkD  r1t        j                  d|z         t        j                  d|z         z  nd}	|dd|	z  z   z  }
|j                  |
       |j                  d        t        |      t        |      z  }t        dt        j                  |            S )zCoverage scoring - measures query term presence across knowledge base
        
        Returns a score between 0 and 1, where:
        - 0 means no query terms found
        - 1 means excellent coverage of all query terms
        r   r   r   r   r   )r   r-   	_tokenizelowerr'   maxvaluesrW   r)   mathlogappendsumlenminsqrt)rI   rZ   query_termsterm_scoresmax_tftermtfdfdoc_coveragefreq_signal
term_scorer   s               r%   r   z'StatisticalStrategy._calculate_coverage-  sM    {{e33q8nnU[[%6%6%899>9O9OU++2245UV 	(D''++D!4B++//a8BAv!E$9$99 JPRSdhhq2v.!f*1EEYZ *Q{1B-BC
"":.""3'	($ {#c+&66 3		(+,,r$   c                    t        |j                        dk  ryg }t        t        |j                              D ]  }t        |dz   t        |j                              D ]  }t        | j	                  |j                  |               }t        | j	                  |j                  |               }|sT|sWt        ||z        t        ||z        z  }|j                  |         |rt        |      t        |      z  }|S d}|S )zQInformation overlap between pages - high overlap suggests coherent topic coverager<   r   r   r   )r   r   ranger+   _get_document_termsr   r   )	rI   rZ   overlapsijterms_iterms_joverlapr   s	            r%   r   z*StatisticalStrategy._calculate_consistencyV  s    u##$q( s5//01 		-A1q5#e&:&:";< -d66u7K7KA7NOPd66u7K7KA7NOPw!'G"34s7W;L7MMGOOG,-		- h-#h-7K  Kr$   c                    |j                   syt        |j                         dk  ry|j                   d   dkD  r|j                   d   nd}|j                   d   dkD  r|j                   d   nd}d||z  z
  }t        dt        |d            S )zIDiminishing returns indicator - are we still discovering new information?r   r<   r   r   r   )r.   r   r   r   )rI   rZ   recent_rateinitial_rater   s        r%   r   z)StatisticalStrategy._calculate_saturationq  s    &&u&&'!+ 6;5L5LR5PST5Te--b1Z[5:5L5LQ5ORS5Su..q1YZ +45
3J,--r$   r   c                 ^  K   g }|j                   D ]  }|j                  |j                  v r| j                  ||      }| j	                  ||      }d}|j
                  |z  |j                  |z  z   |j                  |z  z   }|j                  ||f        |j                  d d       |S w)z'Rank links by expected information gainr   c                     | d   S Nr   r#   xs    r%   r&   z0StatisticalStrategy.rank_links.<locals>.<lambda>  s
    ! r$   Tkeyreverse)
r   hrefr   _calculate_relevance_calculate_noveltyr   r   r   r   sort)	rI   rZ   r   scored_linksrK   	relevancenovelty	authorityscores	            r%   r   zStatisticalStrategy.rank_links  s     '' 	/DyyE... 11$>I--dE:GI ,,y8))G34++i78E u.!	/& 	nd;s   B+B-rK   c                 .   |j                   r|sydj                  t        d|j                  xs d|j                  xs d|j
                  r,|j
                  j                  di       j                  dd      nd|j
                  r,|j
                  j                  di       j                  dd      nd|j
                  r,|j
                  j                  di       j                  dd      ndg            j                         }|sy|j                  r|j                  d	kD  r|j                  S t        | j                  |j                   j                                     }t        | j                  |            }|syt        ||z        t        |      z  }|S )
z3BM25 relevance score between link preview and queryr    Nr   metatitledescriptionkeywordsr   )r   joinfiltertextr  	head_datarW   r   contextual_scorer+   r   r   )rI   rK   rZ   	link_textr   
link_termsr   s          r%   r  z(StatisticalStrategy._calculate_relevance  sR   {{$ HHVDIIOJJ"?C~~DNNvr*..w;SUEI^^DNNvr*..}bAY[BF..DNNvr*..z2>VX+
   EG 	    T%:%:Q%>((( $..):):)<=>	23
kJ./#k2BBr$   c                 h   |j                   sydj                  t        d|j                  xs d|j                  xs d|j
                  r|j
                  j                  dd      nd|j
                  r|j
                  j                  dd      nd|j
                  r|j
                  j                  dd      ndg            j                         }t        | j                  |            }|syt        |j                  j                               }||z
  }|rt        |      t        |      z  }|S d	}|S )
z9Estimate how much new information this link might provider   r  Nr   r  r  r  r   r   )r   r  r  r  r  r  rW   r   r+   r   r'   keysr   )rI   rK   rZ   r  r  existing_terms	new_termsr	  s           r%   r  z&StatisticalStrategy._calculate_novelty  s
   ## HHVDIIOJJ"/3~~DNNw+259^^DNN}b126..DNNz2.b+
   EG 	 	23
 U3388:;/	6@#i.3z?2 GJr$   c                 V   d}|j                   sy|j                   j                         }d|v sd|v r|dz  }d|v sd|v r|dz  }d|v sd	|v r|d
z  }|j                  d      r|d
z  }n|j                  d      r|dz  }|j                  d|z  d|j                  z  z   }t	        |d      S )zASimple authority score based on URL structure and link attributesr   r   z/docs/z/documentation/r   z/api/z/reference/z/guide/z
/tutorial/r   z.pdf)z.jpgz.pngz.gifr   r   r   )r  r   endswithintrinsic_scorer   )rI   rK   r  rd   s       r%   _calculate_authorityz(StatisticalStrategy._calculate_authority  s    yyiioo s?/36SLEc>]c1SLE|s2SLE <<SLE\\23SLE +%K#(<(<"<<E5#r$   c                   K   |j                   j                  dd      }||j                  k\  ryt        |j                        |j
                  k\  ry|j                  sy|j                   j                  dd      |j                  k\  ryyw)r   r   r   Tr   F)r   rW   r   r   r   r   r   r   )rI   rZ   r   r   s       r%   r   zStatisticalStrategy.should_stop  s      ]]&&|S9
444 u!!"f&6&66 "" ==\3/63N3NNs   BB
r   Nc                   K   |D ]P  }t        |j                        }	 |j                  j                  }| j                  |j                               }t               }|D ]*  }|j                  |xx   dz  cc<   |j                  |       , |j                  }	|D ]I  }||j                  |   vs|j                  |xx   dz  cc<   |j                  |   j                  |	       K t        |j                        }
|
|z
  }|j                  j                  |       |xj                  dz  c_        |j                   j                  |j                         S y# t        $ r t        d|j                   d       d}Y Lw xY ww)r   zWarning: CrawlResult z has no markdown contentr   r   N)r   r'   r_   r`   AttributeErrorprintrd   r   r   r+   addr-   r,   r)   r.   r   r/   )rI   rZ   r   resultold_term_countre   termsterm_setr   doc_idnew_term_countr  s               r%   r   z StatisticalStrategy.update_state	  ss    ! .	1F !7!78N //66  NN7==?3E uH #&&t,1,T"#
 **F  Au99$??..t494..t488@A !!7!78N&7I##**95 !!Q&! $$VZZ0].	1 " -fjj\9QRSs/   FEA:F0B*F$F>FFFr  c                     t        j                  dd|      }|j                         }|D cg c]  }t        |      dkD  s| }}|S c c}w )z%Simple tokenization - can be enhancedz[^\w\s]r  r<   )resubsplitr   )rI   r  tokensts       r%   r   zStatisticalStrategy._tokenize;  sJ     vvj#t, $2s1vz!22 3s   A Acrawl_resultc                 t    |j                   j                  xs d}| j                  |j                               S )z!Extract terms from a crawl resultr   )r_   r`   r   r   )rI   r2  re   s      r%   r   z'StatisticalStrategy._get_document_termsF  s-    ''44:~~gmmo..r$   )rr   rs   rt   r|   ro   r   r~   r   r   r   r   r   r   r	   r   r   r  r  r   r   r   r   r   rg   r   r   r#   r$   r%   r   r     s4   ;

 u &'- '- '-RJ 5 6.: .% ."j . TRWX\^cXcRdMe 6 j U @t J 5 4 % :z > d *01
 01kAR 01W[ 01d	c 	d3i 	/ /S	 /r$   r   c                      e Zd ZdZd"dedefdZdee   defdZ	d	ed
edefdZ
d	ed
edefdZd#dededefdZd$dedefdZd%dedee   fdZd
ed	edeeeef      fdZdee   deeeef      d
edeeeef      fdZdedefdZdededeeeef      fdZdedefdZdededefdZdedefdZded ee   ddfd!Zy)&EmbeddingStrategyz?Embedding-based adaptive crawling using semantic space coverageNr5   
llm_configc                     |xs d| _         || _        i | _        i | _        d| _        d | _        d | _        d | _        d| _        y )Nr   Fr   )	r5   r6  _embedding_cache_link_embedding_cache_validation_passed_distance_matrix_cache_kb_embeddings_hash_validation_embeddings_cache_kb_similarity_threshold)rI   r5   r6  s      r%   ro   zEmbeddingStrategy.__init__O  sQ    .Z2Z$ "%'""' '+##' ,0)(,%r$   textsrQ   c                    K   ddl m} dt        j                  d      d} |||| j                         d{   S 7 w)z&Get embeddings using configured methodr   get_text_embeddingsopenai/text-embedding-3-smallOPENAI_API_KEYprovider	api_tokenN)utilsrB  osgetenvr5   )rI   r?  rB  r   s       r%   _get_embeddingsz!EmbeddingStrategy._get_embeddings\  sI     .7#34 
 )   
 
 	
 
s   6?=?r1   r0   c                    ddl }|t        |      dk(  ryt        |j                        dk(  r|j                  dd      }t        |j                        dk(  r|j                  dd      }||j                  j                  |dd      z  }||j                  j                  |dd      z  }|j                  ||j                        }d|z
  }|S )z3Compute distance matrix using vectorized operationsr   Nr   r   Taxiskeepdims)rV   r   shapereshapelinalgnormdotT)rI   r1   r0   r]   
query_normkb_normsimilarity_matrixdistance_matrixs           r%   _compute_distance_matrixz*EmbeddingStrategy._compute_distance_matrixi  s     C$6!$; %%&!+/772>}""#q()11!R8M &		7GaZ^(__
"))..QQU."VV FF:wyy9 //r$   c                     ddl }|t        |      dk(  ry|t        |j                               nd}| j                  || j
                  k7  r| j                  ||      | _        || _        | j                  S )z Get distance matrix with cachingr   N)rV   r   hashtobytesr;  r<  rZ  )rI   r1   r0   r]   kb_hashs        r%   _get_cached_distance_matrixz-EmbeddingStrategy._get_cached_distance_matrix  s}     C$6!$; 4A3L$},,./RV''/t///*.*G*GHXZg*hD''.D$***r$   r   n_syntheticc                    K   ddl m} t        |dz        }d| d| d}| j                  r| j                  j	                  dd      nd}| j                  r| j                  j	                  d	      nd
}dg di}dg di}dd
l}	|d   j                         }
|	j                  |
       t        dt        t        |
      dz              }|
| d
 }|g|
d
|  z   }| j                  |       d
{   }|| _        ||fS 7 w)zJGenerate a point cloud representing the semantic neighborhood of the queryr   )perform_completion_with_backoffg?z	Generate z; variations of this query that explore different aspects: 'z'
        
        These should be queries a user might ask when looking for similar information.
        Include different phrasings, related concepts, and specific aspects.
        
        Return as a JSON array of strings.rF  zopenai/gpt-4o-minirG  Nqueries)z2what are the best vegetables to use in fried rice?z0how do I make vegetable fried rice from scratch?z8can you provide a quick recipe for vegetable fried rice?zMwhat cooking techniques are essential for perfect fried rice with vegetables?z*how to add flavor to vegetable fried rice?zAare there any tips for making healthy fried rice with vegetables?)z6How do async and await work with coroutines in Python?z<What is the role of event loops in asynchronous programming?zUCan you explain the differences between async/await and traditional callback methods?z:How do coroutines interact with event loops in JavaScript?zDWhat are the benefits of using async await over promises in Node.js?zECan you nest async functions and how does that affect the event loop?z4What is the performance impact of using async/await?r   r<   r   )rH  rb  r"   r6  rW   randomcopyshuffler   r   rK  _validation_queries)rI   r   r`  rb  n_totalpromptrF  rG  
variationsrd  other_queriesn_validationval_queriestrain_queriestrain_embeddingss                  r%   map_query_semantic_spacez*EmbeddingStrategy.map_query_semantic_space  s4    : kC'( wi'bchbi j+ . MQOO4??&&z3GHau8<DOO''4T	   !K  L
   "
 
& 	 #9-224}% 1c#m"4s":;<#\MN3-,"?? "&!5!5m!DD $/ .. Es   C*C>,C<-C>query_pointsalphac                 <   	 ddl }t        |      dk  ry|j                  |      }|j                  |d      |j	                  |d      ||j                  |j                  j                  ||j                  |d      z
  d            d}|S # t        $ r Y yw xY w)zEFind the minimal shape that covers all query points using alpha shaper   Nr   rN  r   )centerstdpointsradius)	rV   r   rX   meanrv  r   rR  rS  	Exception)rI   rq  rr  r]   r   s        r%   compute_coverage_shapez(EmbeddingStrategy.compute_coverage_shape  s    	< 1$
 88L1L '',Q'7vvlv3&&&rww|Z[w?\0\cd!ef	H O 		s   B A9B 	BB	n_samplesc                     ddl }|g S g S )z*Sample points from the boundary of a shaper   N)rV   )rI   rP  r|  r]   s       r%   _sample_boundary_pointsz)EmbeddingStrategy._sample_boundary_points  s     =I 	r$   c                 B   ddl }g }|t        |      dk(  r|D ]  }|j                  |df        |S | j                  ||      }||D ]  }|j                  |df        |S |j	                  |d      }t        |      D ]  \  }}|j                  |||   f        |S )zLCalculate gap distances for all query variations using vectorized operationsr   Nr   r   rt  )rV   r   r   r_  r   	enumerate)	rI   r0   r1   r]   gapsq_embrY  min_distancesr   s	            r%   find_coverage_gapsz$EmbeddingStrategy.find_coverage_gaps  s     C$6!$;) *UCL)*K ::;K][") *UCL)*K Q7 ""23 	3HAuKKa 012	3 r$   candidate_linksr  c                 z  %K   ddl m}m}m} ddl}ddl}g }	g }
g }i }|D ]E  }dj                  t        d|j                  xs d|j                  xs dt        |d      r(|j                  r|j                  j                  dd      nd|j                  r,|j                  j                  di       j                  dd      ndg            }|j                         s|j                  |j                    d	| j#                               j%                         }|| j&                  v r| j&                  |   ||j                   <   $|
j)                  |       |j)                  |       H |rd
t+        j,                  d      d} |||| j.                         d{   }t1        |
||      D ]`  \  }}}|j                  |j                    d	| j#                               j%                         }|| j&                  |<   |||j                   <   b t        | d      r| j2                  j4                  nd%|D ]p  }|j                   |vr||j                      }|sd}n6d}d}|D ].  \  }}|%kD  s |||      }||k  s||z
  }|dz  }||z  }|dz  }0 t7        %fd|D              }|dkD  r||z  }nd}|t9        |      dkD  r||j:                  j=                  |      z  }||j:                  j=                  |dd      z  } |j?                  | |      }!|jA                  |!      }"t        | d      r| j2                  jB                  nd}#|"|#kD  r	|"|#z
  dz  }$nd}$nd}$|d|$z
  z  }t        |d      r!|jD                  r|dz  |jD                  dz  z   }|	j)                  ||f       s tG        |	d d      S 7 w)z0Select links that most efficiently fill the gapsr   )cosine_distancecosine_similarityrB  r   Nr  r   r  r  :rC  rD  rE  r   r   r   r<   c              3   4   K   | ]  \  }}|kD  sd   yw)r   Nr#   ).0_r[   coverage_radiuss      r%   	<genexpr>z?EmbeddingStrategy.select_links_for_expansion.<locals>.<genexpr>s  s     'Rdaa/>Q'Rs   TrM  r   r  r   c                     | d   S r   r#   r   s    r%   r&   z>EmbeddingStrategy.select_links_for_expansion.<locals>.<lambda>  s
    !A$ r$   r   )$rH  r  r  rB  rV   hashlibr  r  r  r  rf   r  rW   r  stripmd5r  encode	hexdigestr9  r   rI  rJ  r5   zipr   r   r   r   rR  rS  rT  r   r   r  sorted)&rI   r  r  r0   r  r  rB  r]   r  r  links_to_embedtexts_to_embedlink_embeddings_maprK   r  	cache_keyr   new_embeddingsr  	embeddinglink_embeddingr  gaps_helpedtotal_improvement	gap_pointgap_distancenew_distanceimprovementscaled_improvementgaps_needing_helpgap_reduction_score	link_normrW  similaritiesmax_similarityoverlap_thresholdoverlap_penaltyr  s&                                        @r%   select_links_for_expansionz,EmbeddingStrategy.select_links_for_expansion  s     	SR  # 	1D		R

 b4;D&4Idii		mR0]_IM""62.22="E]_	/ " I ??$  tyyk9+$>$E$E$GHRRTI D666151K1KI1V#DII.%%d+%%i0)	1. ;YY'78$  $7~G[]a]q]q#rrN *-^^^)\ ;%dI#KK499+Qtf(=(D(D(FGQQS	8A**951:#DII.; DK4QYCZ$++??`c $ :	/Dyy 330;N  $%!/3 
-+I|#o5'6~y'Q',6*6*EK1<q.-1CC-'1,K
- %('Rd'R$R!$q(*;>O*O'*+' !,]1Ca1G .1O OI+biinn]QR]an.bbG $&66'9#=L%'VVL%9N T[[_aiSj(O(Opt%%(99+9<M+MQR*R*+&'O ,q?/BC 4!349N9N!CK$*?*?#*EEEu.u:	/x lEEQ ss    FN;N8CN;/N;>D;N;rZ   c                 h  K   ddl }|j                  |j                  yt        |j                        dk(  st        |j                        dk(  ry|j	                  |j                  |j
                        }|j	                  |j                  |j
                        }||j                  j                  |dd      dz   z  }||j                  j                  |dd      dz   z  }||j                  z  j                  d	      }t        | j                  d
d      }|t        ||k\  j                               nt        |j                               }||j                  d<   t        |j                               |j                  d<   t        |j                  |            |j                  d<   |S w)u&   Coverage-based learning score (0–1).r   Nr   )dtyper   TrM  g:0yE>rt  coverage_taucoverage_scoreavg_best_similaritymedian_best_similarity)rV   r0   r1   r   asarrayfloat32rR  rS  rU  r   getattrr   r~   ry  r   median)rI   rZ   r]   QDbesttaur  s           r%   r   z&EmbeddingStrategy.calculate_confidence  sm     &%*@*@*Hu""#q(C0F0F,G1,L JJu--RZZJ@JJu**"**J=	RYY^^AA^5<<	RYY^^AA^5<< ACC}}!}$ dkk>48/2ts{((*+E$))+DV +0&'/4TYY[/A+,27		$2H./s   F0F2r   c                   K   || _         t               }g }|j                  D ]V  }|j                  |j                  vs|j                  |vs+|j                  |       |j                  |j                         X |sg S | j                  |j                  |j                        }|D cg c]  }|d   j                         |d   f c}|_        | j                  |||j                         d{   S c c}w 7 	w)z!Main entry point for link rankingr   r   N)r   r+   r   r  r   r   r%  r  r0   r1   rE   r4   r  )rI   rZ   r   	seen_urlsuncrawled_linksrK   r  gs           r%   r   zEmbeddingStrategy.rank_links  s       E	'' 	)Dyy 2 22tyy	7Q&&t,dii(	)
 I &&""
 ?CC!qt4C 44
 
 	
 D
s)   ;C<C<AC<)C5(C<0C:1
C<c                 b  K   t        | d      r| j                  s|j                  j                  dd      S ddl}| j
                  (| j                  | j                         d{   | _        | j
                  }|j                  t        |j                        dk(  ry| j                  ||j                        }|y|j                  |d      }t        | d      r| j                  j                  nd	}|j                  | |z        }|j                  |      }||j                  d
<   |S 7 ǭw)z5Validate coverage using held-out queries with cachingrg  r   r   r   Nr   rt  r   r   validation_confidence)rf   rg  r   rW   rV   r=  rK  r0   r   rZ  r   r   r   expry  )	rI   rZ   r]   val_embeddingsrY  r  k_expscoresr  s	            r%   validate_coveragez#EmbeddingStrategy.validate_coverage  s    t234;S;S==$$\377 ,,46:6J6J4KcKc6d0dD-:: &#e.A.A*Ba*G 77H[H[\" Q7 07tX/F++C./ "1F-.$$1 1es   A#D/%D-&CD/c                   K   |j                   j                  dd      }t        |j                        |j                  k\  s|j
                  syt        |d      sg |_        |j                  j                  |       t        |j                        dk  ryt        t        |j                  dd |j                  d	d             }t        d
 |D              t        |      z  }||j                   d<   t        | d      r| j                  j                  |z  nd|z  }||k  rh| j                  |       d{   }t        | d      r| j                  j                  nd}||kD  rd|j                   d<   d| _        yd|j                   d<   y7 Tw)z(Stop based on learning curve convergencer   r   Tconfidence_historyr<   FNr   r   c              3   >   K   | ]  \  }}t        ||z
          y wr    )r   )r  abs      r%   r  z0EmbeddingStrategy.should_stop.<locals>.<genexpr>T  s     GTQc!a%jGs   avg_improvementr   r   r   converged_validatedstopped_reasonlow_validation)r   rW   r   r   r   r   rf   r  r   r@   r  r   r   r   r  r   r:  )	rI   rZ   r   r   improvement_diffsr  min_relative_improvement	val_scorevalidation_mins	            r%   r   zEmbeddingStrategy.should_stop?  s    ]]&&|S9
 u!!"f&6&66e>Q>Q u23')E$  ''
3 u''(1, U%=%=cr%BED\D\]^]_D`!ab G5FGG#N_J``+:'(bijnpxby4;;#Q#QT^#^  @C  FP  @P 55"44U;;I LSSWYaKbT[[GGhkN>)2G./*.'2B./  <s   D2F4F	5AFc                     |j                   j                  dd      }|j                   j                  dd      }t        | d      r| j                  j                  nd}t        | d      r| j                  j
                  nd}t        | d      r| j                  j                  nd}t        | d      r| j                  j                  nd}| j                  r$||kD  r|dk  r|}|S |dkD  r|}|S ||dz
  |z  z   }|S |d	z  }|S )
z4Calculate quality-based confidence score for displaylearning_scorer   r  r   r   r   r   r   r   )	r   rW   rf   r   r   r   r   r   r:  )	rI   rZ   r  validation_scorer  quality_minquality_maxscale_factorr   s	            r%   get_quality_confidencez(EmbeddingStrategy.get_quality_confidenceh  s   **+;SA ==,,-DcJ HOtU]G^CCdgFMdT\F]dkkBBcfFMdT\F]dkkBBcgELTS[E\t{{AAbg""'7.'H #(
   #%(
  )NS,@L+PP

  (#-Jr$   r   c                   K   ddl m} ddl}g }g }|D ]Z  }t        |d      r"|j                  r|j                  j
                  nd}|s6|j                  |dd        |j                  |       \ |sydt        j                  d	      d
}	 |||	| j                         d{   }
|j                  %|
|_
        t        t        t        |
                  }ng }g }t        |
      D ]  \  }}||j                  j!                  |      z  }|j                  |j                  j!                  |j                  dd      z  }|j#                  ||      }|j%                  |      | j&                  k  s|j                  |       |j                  |        |r1|j)                  |j                  |j+                  |      g      |_
        |D ]*  }|j,                  j                  ||   j.                         , d| _        d| _        t        |d      rQ|j4                  D| j7                  |j4                  t        | d      r| j8                  j:                  nd      |_        yyy7 w)z9Update embeddings and coverage metrics with deduplicationr   rA  r   Nr_   r   i  rC  rD  rE  TrM  r1   r   r   )rH  rB  rV   rf   r_   r`   r   rI  rJ  r5   r0   r@   r   r   r  rR  rS  rT  r   r>  vstackrX   r/   rd   r<  r;  r1   r{  r   r   r3   )rI   rZ   r   rB  r]   	new_textsvalid_resultsr&  re   r   r  deduplicated_indicesdeduplicated_embeddingsr   new_embnew_emb_normalizedkb_normalizedr  idxs                      r%   r   zEmbeddingStrategy.update_state  sf    . 	! 	-F6=fj6QV\VeVefoo22kmG  $0$$V,		-  8#34 
  39>RTXThThii &"0E#'c..A(B#C  ')##% '7 	3
7%,ryy~~g/F%F" % 3 3biinnUEXEX_`kon6p p!vvm5GH 66,'$*G*GG+227;(//2	3 '&(ii1D1DbhhOfFg0h&i# ( 	=C$$]3%7%;%;<	= $( &*# 5,-%2H2H2T#'#>#>u?U?Ux  AE  GO  yPW[WbWbWtWt  VY  $ZE  3U-I js!   AI&	AI&"I##CI&1C3I&)NN)r   )r   )r   )rr   rs   rt   r|   rg   r   ro   r   r
   rK  rZ  r_  r"   rp  r~   r{  r~  r	   r  r   r  r   r   r   r   r  r   r   r  r   r   r#   r$   r%   r5  r5  L  s   I- - -
49 
 
 S UX 4+C +PS +X[ +$E/C E/c E/SV E/N3 u 2
 
T#Y 
 s tTYZ]_dZdTeOf <vFdvF 5e$%vF 	vF
 
eD%K 	!vFp
 u L
j 
. 
TRWX\^cXcRdMe 
>!%Z !%E !%F'z '> 'd 'RJ 5 6;Z
 ;ZkAR ;ZW[ ;Zr$   r5  c            	          e Zd ZdZ	 	 	 d!dee   dee   dee   fdZde	defd	Z
	 d"d
e	de	dee	   defdZde	de	dee   fdZdeeeef      de	dee   fdZedefd       Zedee	ef   fd       Zedefd       Zd#deddfdZde	fdZd$dee	ef   de	ddfdZdee	ef   fdZ d$dee	ef   de	ddfdZ!dee	ef   fdZ"d%de#deee	ef      fd Z$y)&AdaptiveCrawlerz<Main adaptive crawler that orchestrates the crawling processNcrawlerr   r   c                     || _         |xs
 t               | _        | j                  j                          |r|| _        n*| j                  | j                  j                        | _        d | _        |d u | _        y r    )r  r   r   r   r   _create_strategyrZ   _owns_crawler)rI   r  r   r   s       r%   ro   zAdaptiveCrawler.__init__  si     0 0 $DM 11$++2F2FGDM ,0
 %_r$   strategy_namerQ   c                     |dk(  r
t               S |dk(  r5t        | j                  j                  | j                  j                        S t        d|       )z&Create strategy instance based on namer   r  )r5   r6  zUnknown strategy: )r   r5  r   r5   r   
ValueError)rI   r  s     r%   r  z AdaptiveCrawler._create_strategy  sW    M)&((k)$ $ ; ;;;;; 
 1-ABBr$   	start_urlr   resume_fromc           	        K   |r,t         j                  |      | _        || j                  _        nt        t	               g g |i       | _        | j
                  s1t               | _        | j
                  j                          d{    | j                  | j                  _        t        | j                  t              r|s| j                  j                  || j                  j                         d{   \  }}|| j                  _        |dd | j                  _        | j                  j                   | j                  _        	 || j                  j"                  vr| j%                  ||       d{   }|rt'        |d      r|j(                  r| j                  j*                  j-                  |       | j                  j"                  j/                  |       t'        |d      r|j0                  rt        |j0                  t2              r|j0                  j5                  dg       D cg c]  }t7        di | }}|j0                  j5                  dg       D cg c]  }t7        di | }	}| j                  j8                  j;                  ||	z          nP| j                  j8                  j;                  |j0                  j<                  |j0                  j>                  z          | j                  jA                  | j                  |g       d{    d}
|
| j                  jB                  k  r| j                  jE                  | j                         d{   }|| j                  jF                  d	<   | j                  jI                  | j                  | j                         d{   rn| j                  jK                  | j                  | j                         d{   }|sn|d   d   | j                  jL                  k  rn|d| j                  jN                   D cg c]+  \  }}|jP                  | j                  j"                  vr||f- }}}|snR| jS                  ||       d{   }|r| j                  j*                  j;                  |       tU        ||      D ]W  \  }\  }}|s| j                  j"                  j/                  |jP                         t'        |d      sI|j0                  sVg }t        |j0                  t2              rj|j0                  j5                  dg       D cg c]  }t7        di | }}|j0                  j5                  dg       D cg c]  }t7        di | }	}||	z   }n-|j0                  j<                  |j0                  j>                  z   }|D ]J  }|jP                  | j                  j"                  vs&| j                  j8                  j-                  |       L Z | j                  jA                  | j                  |       d{    |
dz  }
| j                  jV                  rE| j                  jX                  r/| j                  j[                  | j                  jX                         |
| j                  jB                  k  r| j                  jE                  | j                         d{   }t        | j                  t              r=| j                  j]                  | j                        | j                  jF                  d	<   n|| j                  jF                  d	<   t_        | j                  j"                        | j                  jF                  d
<   |
| j                  jF                  d<   | j                  jV                  rE| j                  jX                  r/| j                  j[                  | j                  jX                         | j                  | j`                  r3| j
                  r&| j
                  jc                  ddd       d{    S S S 7 7 _7 c c}w c c}w 7 H7 7 7 }c c}}w 7 c c}w c c}w 7 D7 7 ?# | j`                  r4| j
                  r'| j
                  jc                  ddd       d{  7   w w w xY ww)z&Main entry point for adaptive crawling)r   r   r   r   r   Nr   successra   internalexternalr   r   pages_crawleddepth_reachedr#   )2r   rT   rZ   r   r+   r  r   
__aenter__r   r   
isinstancer5  rp  r   r1   r2   r5   r   _crawl_with_previewrf   r  r   r   r%  ra   rC   rW   r   r   extendr  r  r   r   r   r   r   r   r   r   r  _crawl_batchr  r   r   rP   r  r   r  	__aexit__)rI   r  r   r  r1   r2   r&  rK   internal_linksexternal_linksdepthr   ranked_linksr  to_crawlr   r  	new_links	link_datanew_linkr  s                        r%   digestzAdaptiveCrawler.digest  s     #5DJ$DJJ# U! DJ ||*,DL,,))+++#{{ dmm%677;}}7]7]..8 2.. +;DJJ'*:12*>DJJ')-)F)FDJJ&l	?

 7 77#77	5IIgfi8V^^JJ--44V<JJ++//	:vw/FLL%fllD9GM||GWGWXbdfGg-htdlTl-hN-hGM||GWGWXbdfGg-htdlTl-hN-h JJ44;;N^<[\ !JJ44;;FLL<Q<QTZT`T`TiTi<ij --44TZZ&JJJ E$++///#'==#E#Edjj#QQ
3=

""<0 224::t{{KKK &*]]%=%=djj$++%VV#  ?1%(F(FF >JJb4;;KbKb=c HkdE"iitzz/F/FF "5M H H   %)$5$5h$FFJJ--44[A .1h-G R)	q! JJ3377		B&vw7FLL,.	#-fllD#AY_YeYeYiYijtvxYy5zId6GY6G5zN5zY_YeYeYiYijtvxYy5zId6GY6G5zN5z0>0OI 170E0EH]H]0]I 1: !RH'/}}DJJ<S<S'S(,

(@(@(G(G(Q!R!R* --44TZZMMM
 ;;))dkk.D.DJJOODKK$:$:;{ $++///@ $(==#E#Edjj#QQN $--):;37==3W3WX\XbXb3c

""<0 4B

""<025djj6M6M2NDJJ/27DJJ/ {{%%$++*@*@

 6 67:: !!dllll,,T4>>> '3!u ,2 J .i-h K R L  WH G 6{5z N R* ? !!dllll,,T4>>> '3!sP  BcaA.c5a6Ac.b 6a7B<b 3a b %a7B%b a$Ab %a'&Ab 6a*79b 0a-1Ab 60a0&b a6Ab ;b b :b a9 b =a>Ab ,Ab ?b A=b ?(b 'b(D)b 8c	b	
ccb b 'b *b -b 0b 9b b 	c9ccccrd   c                   K   t        t        dd|dddd      d      }	 | j                  j                  ||       d{   }t	        |d	      r|j
                  r|j
                  d
   }t	        |d      rH|j                  r<|j                  d   D cg c]  }|j                  d      s| c}|j                  d<   |S 7 c c}w # t        $ r}t        d| d|        Y d}~yd}~ww xY ww)z%Crawl a URL with link preview enabledTFr   2   )include_internalinclude_externalr   concurrencytimeout	max_linksverbose)link_preview_configscore_links)rd   r   N_resultsr   ra   r  r  zError crawling z: )
r   r   r  arunrf   r  ra   rW   rz  r$  )rI   rd   r   r   r&  rK   es          r%   r  z#AdaptiveCrawler._crawl_with_previewv  s     ! 1!%!&! 
	<<,,V,DDFvz*v+ vw'FLL=C\\*=U+oTY]YaYabmYnD+oZ( M E ,p
  	OC51#./	sY   C1 C
 CAC
 C-C1C
 C1C
 C
 
	C.C)$C1)C..C1links_with_scoresc                   K   g }|D ]2  \  }}| j                  |j                  |      }|j                  |       4 t        j                  |ddi d{   }g }|D ]  }	t        |	t              rQt        |	d      r|	j                  r|j                  |	       =t        dt        |	d      r|	j                  nd        dt        |	t              sut        d|	         |S 7 w)	zCrawl multiple URLs in parallelreturn_exceptionsTNr  zSkipping failed crawl: rd   unknownzError in batch crawl: )r  r  r   asynciogatherr  r   rf   r  r$  rd   rz  )
rI   r  r   tasksrK   r  taskresultsr  r&  s
             r%   r  zAdaptiveCrawler._crawl_batch  s     , 	KD%++DIIu=DLL	  FFF  	9F&+.69-&..!((03'&RWBXFJJ^g3hijFI..vh78	9  Gs   AC&C$A<C&C&c                 h    | j                   r&| j                   j                  j                  dd      S y)zCurrent confidence levelr   r   )rZ   r   rW   r   s    r%   r   zAdaptiveCrawler.confidence  s+     ::::%%)),<<r$   c                 r   | j                   si S t        d | j                   j                  D              }t        | j                   j                        |t        | j                   j
                        t        | j                   j
                  j                               t        | j                   j                        | j                  | j                   j                  j                  dd      | j                   j                  j                  dd      | j                   j                  j                  dd      d	S )zDetailed coverage statisticsc              3   b   K   | ]'  }t        |j                  j                  xs d        ) yw)r   N)r   r_   r`   )r  r&  s     r%   r  z1AdaptiveCrawler.coverage_stats.<locals>.<genexpr>  s,      #
 ,,23#
s   -/r   r   r   r   )	r  total_content_lengthunique_termstotal_termsr   r   r   r   r   )rZ   r   r   r   r   r'   r   r   r   r   rW   )rI   r  s     r%   coverage_statszAdaptiveCrawler.coverage_stats  s     zzI" #
**33#
  
 !!8!89$8

 ; ;<tzz::AACD !9!9://

**..z3?::--11-E**,,00sC

 
	
r$   c                     t        | j                  t              r| j                  j                  S | j                  | j
                  j                  k\  S )z(Check if current knowledge is sufficient)r  r   r5  r:  r   r   r   r   s    r%   is_sufficientzAdaptiveCrawler.is_sufficient  s>     dmm%67==333 ??dkk&F&FFFr$   detailedc                      j                   st        d       y	 ddlm} ddlm}  |       }d}|s(|r% d j                   j                   d	
      }|j                  ddd       |j                  dd        j                  }|j                  dt        |j                  dd                   |j                  dt        |j                  dd                   |j                  dt        |j                  dd                   |j                  d|j                  dd      dd       |j                  dt        |j                  dd                   |j                  dd       t         j                  t              r|j                  d|j                  dd      d        |j                  d! j                   j                   j                  d"d      d#       |j                  d$ j                   j                   j                  d%d      d&       |j                  d' j                   j                   j                  d(d      d        |j                  dd       |j                  d) j"                  rd*nd+       n|j                  d|j                  dd      d        |j                  d,|j                  d-d      d        |j                  d.|j                  d/d      d        |j                  d0|j                  d1d      d        |j                  dd       |j                  d) j"                  rd2nd+       j                  |       yt        d3       t        d4 j                   j                   d	       t        d5       t        d6       t        d7t%         j                   j&                                t        d8t%         j                   j(                                t        d9 j                   j*                          t-         fd: j                   j.                  D              }t-         j                   j0                  j3                               }	t%         j                   j0                        }
t        d;       t        d<|dd=       t        d>|	d       t        d?|
d       |	dkD  rt        d@|
|	z  d        t         j                  t              rt        dA       t        dB j                   j                   j                  d"d      d#       t        dC j                   j                   j                  d%d      d&       t        dD j                   j                   j                  dEd      d&       t        dF        j"                  r>|r!j                  dG j4                  d dH       nXt        dG j4                  d dI       n=|r!j                  dG j4                  d dJ       nt        dG j4                  d dK       t        dL j                   j                   j                  dMd      d        t        dN j                   j                   j                  d(d      d        ngt        dO        j                  j7                   j                   j                  j9                               }|D ]  } j                   j0                  j                  |d      } j                   j:                  j                  |d      }|dkD  r|rNj                  dP| dQ| dR j                   j*                   dS| j                   j*                  z  dTdU| dV       t        dP| dQ| dR j                   j*                   dW| j                   j*                  z  dTdX| dV       |rj                  dP| dY       t        dP| dZ        t        dF        j"                  rd[nd\}|r3 j"                  rd]nd^}j                  dG j4                  d d_|        nt        dG j4                  d d_|        t        d` j                   j                   j                  d-d      d        t        da j                   j                   j                  d/d      d        t        db j                   j                   j                  d1d      d         j                   j<                  rt-         j                   j<                        t%         j                   j<                        z  }t        dc       t        dd|d&       t        de j                   j                   j                  d1d      d        |rt        df       |rj                  dg       nt        dh       t        di       t        dj       t?         j                   j0                  jA                         dk dl      ddm }tC        |dn      D ]i  \  }\  }} j                   j:                  j                  |d      }|r j                  do|dpdq| dr| ds| dt	       Qt        do|dpdu| dv| ds| dt	       k t        dwt%         j                   j&                         dx       tC         j                   jD                  dn      D ]  \  }}|t%         j                   j<                        k  r j                   j<                  |dnz
     nd}|r.j                  do| dy| dz       |j                  d{| d|       vt        do| d}|        t        d~| d        t        d       i } j                   j:                  j3                         D ]  }|j                  |d      dnz   ||<    t?        |jG                               D ]  }||   }t        d| d| d         j                   jH                  rbt        d       t        d j                   jH                          t        dt%         j                   jJ                                 j                   jL                  -t        d j                   jL                  jN                          nt        d       t        dt%         j                   jP                                t        d j4                  d         j                   jJ                  rbt        d       tC         j                   jJ                  dd dn      D ]1  \  }}|rj                  do| d| d       !t        do| d}|        3 t        d3       y# t        $ r d}Y Vw xY w)zPrint comprehensive statistics about the knowledge base
        
        Args:
            detailed: If True, show detailed statistics including top terms
        zNo crawling state available.Nr   )Console)TableTFzAdaptive Crawl Stats - Query: '')r  Metriccyan)styleno_wrapValuemagenta)r*  zPages Crawledr  zUnique Termsr  zTotal Termsr  zContent Lengthr  ,z charszPending Linksr   r   
Confidencer   z.2%zAvg Min Distanceavg_min_distancez.3fzAvg Close Neighborsavg_close_neighborsz.1fzValidation Scorer  zIs Sufficient?z[green]Yes (Validated)[/green]z[red]No[/red]Coverager   Consistencyr   
Saturationr   z[green]Yes[/green]zQ
================================================================================z$Adaptive Crawl Statistics - Query: 'zP================================================================================z
[*] Basic Statistics:z  Pages Crawled: z  Pending Links: z  Total Documents: c              3   R   K   | ]  }t        j                  |               y wr    )r   _get_content_from_result)r  r&  rI   s     r%   r  z.AdaptiveCrawler.print_stats.<locals>.<genexpr>  s)      ' D11&9:'s   $'z
[*] Content Statistics:z  Total Content: z charactersz  Total Words: z  Unique Terms: z  Vocabulary Richness: z 
[*] Semantic Coverage Analysis:z  Average Min Distance: z  Avg Close Neighbors (< 0.3): z$  Avg Very Close Neighbors (< 0.2): avg_very_close_neighborsz
[*] Confidence Metrics:z  Overall Confidence: z [green][VALIDATED][/green]z [VALIDATED]z [red][NOT VALIDATED][/red]z [NOT VALIDATED]z  Learning Score: r  z  Validation Score: z
[*] Query Coverage:z  'z': found in /z docs ([green]z.0%z[/green]), z occurrencesz docs (z), z': [red][X] not found[/red]z': [X] not foundz[OK]z[!!]z[green][OK][/green]z[red][!!][/red]r  z  Coverage Score: z  Consistency Score: z  Saturation Score: z
[*] Crawl Efficiency:z  Avg New Terms per Page: z  Information Saturation: zQ
--------------------------------------------------------------------------------z*[bold cyan]DETAILED STATISTICS[/bold cyan]zDETAILED STATISTICSzP--------------------------------------------------------------------------------z
[+] Top 20 Terms by Frequency:c                     | d   S r   r#   r   s    r%   r&   z-AdaptiveCrawler.print_stats.<locals>.<lambda>m  s    VWXYVZ r$   r   r   r   z  2dz. [yellow]'z'[/yellow]: z occurrences in z docsz. 'z': z
[+] URLs Crawled (z):z. [cyan]z[/cyan]z     -> Added [green]z[/green] new termsz. z     -> Added z
 new termsz%
[+] Document Frequency Distribution:z  Terms in z docs: z termsz 
[+] Semantic Coverage Analysis:z  Embedding Model: z  Query Variations: z  Knowledge Embeddings: z  Knowledge Embeddings: Nonez  Semantic Gaps: z  Coverage Achievement: z
[+] Query Space (samples):r   z
. [yellow]z	[/yellow]))rZ   r$  rich.consoler%  
rich.tabler&  ImportErrorr   
add_columnr   add_rowrg   rW   r  r   r5  r   r"  r   r   r   r-   r   r   r'   r   r   r   r   r)   r.   r  rD   r  r/   r  r5   r2   r0   rP  r4   )rI   r#  r%  r&  consoleuse_richtablestatsr  total_wordsr  r   r   r   r   statusstatus_coloredavg_new_terms	top_termsr   freqrd   r  	df_countscounteqs   `                         r%   print_statszAdaptiveCrawler.print_stats  sU    zz01	,(iGH H"A$**BRBRASST UVEXVTBWI6 ''EMM/3uyy!/L+MNMM.#eii.J*KLMM-UYY}a-H)IJMM*uyy9OQR/STU.VV\,]^MM/3uyy!/L+MNMM"b! $--):;luyyq/I#.NP0TZZ5G5G5K5KL^`a5bcf4gi3

8J8J8N8NOdfg8hil7mo0TZZ5G5G5K5KLcef5ghk4lnb"%.TXTfTf0Pl{| luyyq/I#.NPjUYYz1-Ec,JLm		-0KC/PRluyyq/I#.NPb"%.HZHZ0D`opMM%  - 89I9I8J!LM&M +,%c$***A*A&B%CDE%c$***B*B&C%DEF'

(B(B'CDE $' '"jj77' $  djj99@@BCKtzz::;L-/%&:1%=[IJOK?34$\!$456Q/[0H/MNO $--):;9;01C1C1G1GHZ\]1^_b0cde7

8J8J8N8NOdfg8hil7mno<TZZ=O=O=S=STnpq=rsv<wxy 13%%(>ts>SSn&op 6ts6K<XY(>ts>SSn&op 6ts6KK[\]*4::+=+=+A+ABRTU+VWZ*[\],TZZ-?-?-C-CD[]^-_`c,def -/"mm55djj6F6F6L6L6NO' @D4488qAB88<<T1EBAv##MMCv\"QtzzGaGaFbbpqstxt~t~  uO  uO  rO  PS  qT  T_  `b  _c  co  +p  q!Cv\"Qtzz?Y?Y>ZZabdeieoeoeeb  AD  bE  EH  IK  HL  LX  #Y  Z##MMCv5P*QR!Cv-=">?@ 13#'#5#56>B>P>P%:VgNMM$:4??3:OqQ_P`"ab24??32GqQR*4::+=+=+A+A*a+PQT*UVW-djj.@.@.D.D]TU.VWZ-[\],TZZ-?-?-C-CLRS-TUX,YZ[ zz++ #DJJ$@$@ AC

HdHdDe e/12=2EFG24::3E3E3I3I,XY3Z[^2_`am$MM"NO/0f 89"4::#>#>#D#D#FNdhijmkmn	'0A'> VOA|d88<<T1EB1R&D6dVScdfcggl&mn1R&D6TF:J2$eTUV ,S1H1H-I,J"MN'

(>(>B FFAsEF#djjNjNjJkEk

 < <QqS AqrI1#XcU'&BC(=i[HZ&[\1#Ruo.ykDEF >?	**99@@B =B$-MM"a$81$<IbM= !!12 BB%bMEKt75'@AB
 ::--=>/

0J0J/KLM0TZZ5P5P1Q0RSTzz//; 89Q9Q9W9W8XYZ <>-c$**2J2J.K-LMN4T__S4IJK zz22<=%.tzz/J/J2A/NPQ%R 6EAr' '1#Zt9.M N %1#Rtn 5	6 - s  	H	s   u8 8vvc                     t        |d      rQ|j                  rEt        |j                  d      r|j                  j                  xs dS t        |j                        S y)z,Helper to safely extract content from resultr_   r`   r   )rf   r_   r`   rg   )rI   r&  s     r%   r6  z(AdaptiveCrawler._get_content_from_result  sG    6:&6??v7339r9v''r$   filepathformatc                 $   | j                   r| j                   j                  st        d       yt        |      }|j                  j                  dd       |dk(  rt        |dd      5 }| j                   j                  D ]<  }| j                  |      }|j                  t        j                  |d	
      dz          > 	 ddd       t        dt        | j                   j                         d|        yt        d|       # 1 sw Y   FxY w)zExport the knowledge base to a file
        
        Args:
            filepath: Path to save the file
            format: Export format - currently supports 'jsonl'
        zNo knowledge base to export.NTr8   jsonlr;   utf-8encodingF)ensure_ascii
z	Exported z documents to zUnsupported export format: )rZ   r   r$  r   r>   r?   rF   _crawl_result_to_export_dictwriterG   dumpsr   r  )rI   rO  rP  rO   r&  result_dicts         r%   export_knowledge_basez%AdaptiveCrawler.export_knowledge_base  s     zz!:!:01>dT:Whg6 P!"jj77 PF"&"C"CF"KKGGDJJ{G$NO	PP Ic$**";";<=^H:VW:6(CDDP Ps   *ADDc                    t        |dd      t        |dd      t        |dd      | j                  r| j                  j                  ndd}t        |d      rU|j                  rIt        |j                  d	      r|j                  j
                  |d
<   nt        |j                        |d
<   nd|d
<   t        |d      r|j                  |d<   t        |d      r|j                  |d<   | j                  r|d   | j                  j                  v r+| j                  j                  j                  |d         dz   nd| j                  j                  j                  dd      | j                  j                  d|d<   |S )z.Convert CrawlResult to a dictionary for exportrd   r   	timestampNr  T)rd   r^  r  r   r_   r`   re   rb   ra   r   r   r   )r/   confidence_at_crawlr-   crawl_metadata)r  rZ   r   rf   r_   r`   rg   rb   ra   r/   indexr   rW   r-   )rI   r&  export_dicts      r%   rX  z,AdaptiveCrawler._crawl_result_to_export_dict  s^    65"- d;vy$7)-TZZ%%	
 6:&6??v7)/)E)EI&),V__)=I&%'K	" 6:&&,ooK
# 67##)<<K  ::WbchWimqmwmw  nD  nD  XDtzz55;;K<NORSS  JK'+zz'9'9'='=lA'N#'::#=#=-K() r$   c                    t        |      }|j                         st        d|       |dk(  rg }t        |dd      5 }|D ]J  }|j	                         st        j                  |      }| j                  |      }|j                  |       L 	 ddd       | j                  st               | _	        | j                  j                  j                  |       t        j                  | j                  j!                  | j                  |             t#        dt%        |       d|        yt'        d	|       # 1 sw Y   xY w)
zImport a knowledge base from a file
        
        Args:
            filepath: Path to the file to import
            format: Import format - currently supports 'jsonl'
        zFile not found: rR  rS   rS  rT  Nz	Imported z documents from zUnsupported import format: )r   existsFileNotFoundErrorrF   r  rG   loads_import_dict_to_crawl_resultr   rZ   r   r   r  r  runr   r   r$  r   r  )rI   rO  rP  imported_resultsrO   linedatamock_results           r%   import_knowledge_basez%AdaptiveCrawler.import_knowledge_base  s     > #&6xj$ABBW!hg6 =! =Dzz|#zz$/&*&G&G&M(//<== ::'\
 JJ%%,,-=> KK224::?OPQIc"2344DXJOP:6(CDD)= =s   D:9D::Erk  c                 @     G d d       G fdd      } ||      S )z0Convert imported dict back to a mock CrawlResultc                       e Zd Zd Zy)BAdaptiveCrawler._import_dict_to_crawl_result.<locals>.MockMarkdownc                     || _         y r    rm   rn   s     r%   ro   zKAdaptiveCrawler._import_dict_to_crawl_result.<locals>.MockMarkdown.__init__  rp   r$   Nrq   r#   r$   r%   ru   rp    rv   r$   ru   c                       e Zd Z fdZy)EAdaptiveCrawler._import_dict_to_crawl_result.<locals>.MockCrawlResultc                 $   |j                  dd      | _         |j                  dd            | _        |j                  di       | _        |j                  di       | _        |j                  dd      | _        |j                  d      | _        y )	Nrd   r   re   ra   rb   r  Tr^  )rW   rd   r_   ra   rb   r  r^  )rI   rk  ru   s     r%   ro   zNAdaptiveCrawler._import_dict_to_crawl_result.<locals>.MockCrawlResult.__init__  sr    88E2. ,TXXi-D E!XXgr2
 $R 8#xx	48!%+!6r$   Nrq   rz   s   r%   r{   rs    s    7r$   r{   r#   )rI   rk  r{   ru   s      @r%   rg  z,AdaptiveCrawler._import_dict_to_crawl_result  s$    	, 	,	7 	7 t$$r$   top_kc                 n   | j                   r| j                   j                  sg S g }t        | j                   j                  j	                         j                               }t        | j                   j                        D ]  \  }}|j                  j                  xs dj	                         }t        |j                               }t        ||z        }|r|t        |      z  nd}	|j                  |j                  |	|j                  j                  |d        |j                  d d       |d| S )z'Get most relevant content for the queryr   r   )rd   r  re   ra  c                     | d   S )Nr  r#   r   s    r%   r&   z6AdaptiveCrawler.get_relevant_content.<locals>.<lambda>6  s
    qz r$   Tr   N)rZ   r   r+   r   r   r/  r  r_   r`   r   r   rd   r  )
rI   ru  scored_docsr   r   r&  re   content_termsr   r  s
             r%   get_relevant_contentz$AdaptiveCrawler.get_relevant_content  s
   zz!:!:I $****00288:;"4::#<#<= 	IAv339r@@BG0M +56G2=Gc+..3Ezz!??77	  	  	14@6E""r$   )NNNr    )F)rR  )r   )%rr   rs   rt   r|   r   r   r   r   ro   rg   r  r   r  r   r  r   r	   r   r~   r  propertyr   r   r
   r   r   r"  rM  r6  r   r   r\  rX  rm  rg  r"   rz  r#   r$   r%   r  r    s   F 7;4859-"?3-!.1- $M2-(
Cc 
Cm 
C =AP?*-P?&)P? -5SMP? FPP?dS  +AV @DtU{9K4L UX ]abm]n 0 E   
S#X 
 
, Gt G GI!D I!T I!V# EeCI.> E EZ^ E6#d38n #J!EeCI.> !E !EZ^ !EF%c3h %"## #d4S>6J #r$   r  )(r|   abcr   r   typingr   r   r   r   r	   r
   r   dataclassesr   r   r  picklerI  rG   r   collectionsr   r   r-  pathlibr   crawl4ai.async_webcrawlerr   crawl4ai.async_configsr   r   crawl4ai.modelsr   r   r   r   r   r   r5  r  r#   r$   r%   <module>r     s    $ ? ? ? (   	   , 	  5 F - {
 {
 {
| ]j ]j ]j@C 0w/- w/t	r	Z r	Zjv	# v	#r$   