
    /iZ<                         d Z ddlZddlZddlmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlmZmZ dd	lmZ  G d
 d      Zy)z
Link Extractor for Crawl4AI

Extracts head content from links discovered during crawling using URLSeeder's
efficient parallel processing and caching infrastructure.
    N)DictListOptionalAny   )AsyncLogger)AsyncUrlSeeder)SeedingConfigCrawlerRunConfig)LinksLink)calculate_total_scorec                      e Zd ZdZddee   fdZd Zd Zd Z	d Z
dd	ed
edefdZdededefdZdedeeef   dee   fdZdee   deeef   deeeef      fdZdee   dedeeef   deeeef      fdZdedeeeef      dedefdZy)LinkPreviewa  
    Extracts head content from links using URLSeeder's parallel processing infrastructure.
    
    This class provides intelligent link filtering and head content extraction with:
    - Pattern-based inclusion/exclusion filtering
    - Parallel processing with configurable concurrency
    - Caching for performance
    - BM25 relevance scoring
    - Memory-safe processing for large link sets
    Nloggerc                 .    || _         d| _        d| _        y)z
        Initialize the LinkPreview.
        
        Args:
            logger: Optional logger instance for recording events
        NF)r   seeder_owns_seeder)selfr   s     Q/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/crawl4ai/link_preview.py__init__zLinkPreview.__init__   s     04!    c                 B   K   | j                          d{    | S 7 w)zAsync context manager entry.N)startr   s    r   
__aenter__zLinkPreview.__aenter__)   s     jjl 	s   c                 @   K   | j                          d{    y7 w)zAsync context manager exit.N)close)r   exc_typeexc_valexc_tbs       r   	__aexit__zLinkPreview.__aexit__.   s     jjls   c                    K   | j                   sEt        | j                        | _         | j                   j                          d{    d| _        yy7 w)z"Initialize the URLSeeder instance.)r   NT)r   r	   r   r   r   r   s    r   r   zLinkPreview.start2   sC     {{(<DK++((*** $D *s   AAAAc                    K   | j                   rA| j                  r4| j                   j                  ddd       d{    d| _         d| _        yyy7 w)zClean up resources.NF)r   r   r"   r   s    r   r   zLinkPreview.close9   sJ     ;;4,,++''dD999DK %D -;9s   9AAAlevelmessagetagc                     | j                   r6t        | j                   |d      }|r ||||j                  di              yyy)z%Helper method to safely log messages.Nparams)r&   r'   r)   )r   getattrget)r   r%   r&   r'   kwargs
log_methods         r   _logzLinkPreview._log@   s@    ;; eT:J7FJJxQS<TU  r   linksconfigreturnc                   K   |j                   }| j                          d{    | j                  ||      }|s| j                  dd       |S | j                  dddt	        |      i       | j                  ||       d{   }| j                  |||      }| j                  dddt	        |D cg c]  }|j                  d	      d
k(  s| c}      i       |S 7 7 ]c c}w w)ab  
        Extract head content for filtered links and attach to Link objects.
        
        Args:
            links: Links object containing internal and external links
            config: CrawlerRunConfig with link_preview_config settings
            
        Returns:
            Links object with head_data attached to filtered Link objects
        Ninfoz#No links matched filtering criteriaz2Extracting head content for {count} filtered linkscountr)   z9Completed head extraction for links, {success} successfulsuccessstatusvalid)link_preview_configr   _filter_linksr.   len_extract_heads_parallel_merge_head_datar+   )r   r/   r0   link_configfiltered_urlshead_resultsupdated_linksrs           r   extract_link_headszLinkPreview.extract_link_headsG   s      00 jjl **5+>IIfCDL		&N!3}#56 	 	8 "99-UU --e\6J		&U#S\)`QUU8_X_E_!)`%ab 	 	d + 	 V *as:    C(CA C(C!.C(2C#C#C(!C(#C(r>   c                 r   g }|j                   ri|j                  |j                  D cg c]  }|j                  s|j                   c}       | j	                  dddt        |j                        i       |j                  ri|j                  |j                  D cg c]  }|j                  s|j                   c}       | j	                  dddt        |j                        i       |j                  }|rC|D cg c]  t        fd|D              r }}| j	                  dddt        |      i       |j                  }|rC|D cg c]  t        fd|D              s }}| j	                  dd	dt        |      i       |j                  }|d
kD  r)t        |      |kD  r|d| }| j	                  ddd|i       t               }	g }
|D ])  |	vs|	j                         |
j                         + | j	                  dddt        |
      i       |
S c c}w c c}w c c}w c c}w )a,  
        Filter links based on configuration parameters.
        
        Args:
            links: Links object containing internal and external links
            link_config: Configuration dictionary for link extraction
            
        Returns:
            List of filtered URL strings
        debugzAdded {count} internal linksr4   r5   zAdded {count} external linksc              3   J   K   | ]  }t        j                   |        y wNfnmatch.0patternurls     r   	<genexpr>z,LinkPreview._filter_links.<locals>.<genexpr>   s     UwsG4U    #z,After include patterns: {count} links remainc              3   J   K   | ]  }t        j                   |        y wrG   rH   rJ   s     r   rN   z,LinkPreview._filter_links.<locals>.<genexpr>   s     YW7??38YrO   z,After exclude patterns: {count} links remainr   NzLimited to {max_links} links	max_linksz)Final filtered URLs: {count} unique links)include_internalextendinternalhrefr.   r;   include_externalexternalinclude_patternsanyexclude_patternsrQ   setaddappend)r   r/   r>   r?   linkrX   rM   rZ   rQ   seenunique_urlss         `    r   r:   zLinkPreview._filter_linksp   sK     ''  !T$))$))!TUIIg=%s5>>':;  = ''  !T$))$))!TUIIg=%s5>>':;  = '77,UDTUU M  IIgM%s='9:  < '77,YHXYY M  IIgM%s='9:  <  ))	q=S/);)*95MIIg=)95  7 u  	(C$""3'	(
 			'F!3{#34 	 	6 a "U "Us"   H%H%H*0H*H/H4urlsc           	        K   |j                   }|j                  }|r | j                  ddt        |      |d       t	        d|t        |dd      |j                  |j                  |j                  rdnd|	      }|r| j                  |||       d{   }|S | j                  j                  ||||j                  
       d{   }|S 7 97 w)aV  
        Extract head content for URLs using URLSeeder's parallel processing.
        
        Args:
            urls: List of URLs to process
            link_config: Configuration dictionary for link extraction
            
        Returns:
            List of dictionaries with url, status, head_data, and optional relevance_score
        r3   zNStarting batch processing: {total} links with {concurrency} concurrent workers)totalconcurrencyr5   Thits_per_secNbm25)extract_headrd   re   queryscore_thresholdscoring_methodverbosera   r0   rd   timeout)rk   rd   r.   r;   r
   r*   rh   ri   _extract_with_progressr   extract_head_for_urlsrm   )r   ra   r>   rk   rd   seeding_configresultss          r   r<   z#LinkPreview._extract_heads_parallel   s      %%!--IIfn'*4yM  O '# ndC##'77%0%6%66D
  77nkZZG  !KK==%'#++	 >  G  [s$   BCC2CCCCrp   c           	      x  K   t        |      }|j                  }t        d|dz        }d}d}d}	| j                  dd       | j                  j                  ||||j                         d{   }
|
D ]&  }|dz  }|j                  d      d	k(  r|dz  }"|	dz  }	( | j                  dd
||||	d       |
S 7 Jw)z-Extract head content with progress reporting.r   
   r   r3   zProcessing links in batches...rl   Nr7   r8   zcBatch processing completed: {completed}/{total} processed, {successful} successful, {failed} failed)	completedrc   
successfulfailedr5   )r;   rd   maxr.   r   ro   rm   r+   )r   ra   rp   r>   
total_urlsrd   
batch_sizert   ru   rv   rq   results               r   rn   z"LinkPreview._extract_with_progress   s      Y
!--J",-
 	
 			&:; 99!#''	 : 
 
  	FNIzz(#w.a
!	 			&#,)$. &	 	 	 1
s   A+B:-B8.AB:original_linksr@   c                 J   i }|D ]^  }|j                  d      }|s|j                  di       |j                  dd      |j                  d      |j                  d      d||<   ` g }|j                  D ](  }|j                  |v r||j                     }	|	j                  d      }
t        |j                  |j                  |j
                  |j                  |	d   |	d   |	j                  d      t        |dd	      |

	      }|
$|j                  xs i |_        |
|j                  d<   t        |j                  |j                  t        |dd      t        |j                  j                              |_        |j!                  |       |j!                  |       + g }|j"                  D ](  }|j                  |v r||j                     }	|	j                  d      }
t        |j                  |j                  |j
                  |j                  |	d   |	d   |	j                  d      t        |dd	      |

	      }|
$|j                  xs i |_        |
|j                  d<   t        |j                  |j                  t        |dd      t        |j                  j                              |_        |j!                  |       |j!                  |       + t%        d ||z   D              r)d }|j'                  |d       |j'                  |d       t)        ||      S )a(  
        Merge head extraction results back into Link objects.
        
        Args:
            original_links: Original Links object
            head_results: Results from head extraction
            
        Returns:
            Links object with head_data attached to matching links
        rM   	head_datar7   unknownerrorrelevance_score)r}   r7   r   r   intrinsic_scoreN)	rU   texttitlebase_domainr}   head_extraction_statushead_extraction_errorr   contextual_scorescore_linksF)r   r   score_links_enabledquery_providedc              3   p   K   | ].  }t        |d       xr |j                  xr d|j                  v  0 yw)r}   r   Nhasattrr}   )rK   r^   s     r   rN   z/LinkPreview._merge_head_data.<locals>.<genexpr>}  s?      @ t[)ddnndARVZVdVdAdd @s   46c                 n    t        | d      r)| j                  rd| j                  v r| j                  d   S y)Nr}   r   g        r   )r^   s    r   get_relevance_scorez9LinkPreview._merge_head_data.<locals>.get_relevance_score  s4    4-$..EVZ^ZhZhEh>>*;<<r   T)keyreverse)rT   rW   )r+   rT   rU   r   r   r   r   r*   r}   r   r   r   boolr9   rh   total_scorer]   rW   rY   sortr   )r   r{   r@   r0   url_to_head_datarz   rM   updated_internalr^   	head_infor   updated_linkupdated_externalr   s                 r   r=   zLinkPreview._merge_head_data  s&   " " 	F**U#C!'K!<$jj9=#ZZ0'-zz2C'D	) %	 "++ "	.Dyy,,,TYY7	#,==1B#C #** $ 0 0'4+4X+>*3--*@$+D2CT$J%5
  $/-9-C-C-IrL*@PL**+<= ,A$0$@$@%1%B%B(/u(M#'(B(B(H(H#I	,( !''5 !''-E"	.J "++ "	.Dyy,,,TYY7	#,==1B#C #** $ 0 0'4+4X+>*3--*@$+D2CT$J%5
  $/-9-C-C-IrL*@PL**+<= ,A$0$@$@%1%B%B(/u(M#'(B(B(H(H#I	,( !''5 !''-E"	.J  @+.>>@ @
 !!&94!H!!&94!H%%
 	
r   rG   )LINK_EXTRACT)__name__
__module____qualname____doc__r   r   r   r   r"   r   r   strr.   r   r   rC   r   r   r   r:   r<   r
   rn   r=    r   r   r   r      sP   		"x4 	"
%&V# V V# V'' !' 
	'R?5 ?tCH~ ?$s) ?B-3i- #s(^- 
d38n		-^23i2 &2 #s(^	2
 
d38n	2hw
w
 4S>*w
 !	w

 
w
r   r   )r   asynciorI   typingr   r   r   r   async_loggerr   async_url_seederr	   async_configsr
   r   modelsr   r   utilsr   r   r   r   r   <module>r      s3      , , % , :  (y
 y
r   