
    /ir                    	   d dl Z d dlmZmZ d dlmZmZmZmZm	Z	 d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZddlmZ d dlmZ ddlmZmZ ddlmZmZmZmZmZ d dlZd d	lmZ d d
lm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,Z,d dl-m.Z. d dl/Z/d dl0Z0d dl1Z1d dl2Z2d dl3m4Z4 d dl5Z5d dlm6Z6mZ7 d dl8Z8d dl9Z9d dl:m;Z; d dl<Z<d dl3m=Z= d dl>m?Z? ddl@mAZA d dl!mBZB d dlCmDZD d dlEmFZF d dlGZGd dlHZId dl*m+Z+mJZJmKZKmLZLmMZMmNZNmOZO d dlPZPd dl:mQZQ d dlZeQj                  ZSd ZTeTeQ_R        	 	 dde)eU   deVdeVdeWde%e&eUge$eU   f      de'eUddf   fd ZX	 	 	 dd!eBeU   d"eVdeVd#eWd$e&de$eU   fd%ZY G d& d'      ZZ G d( d)      Z[ G d* d+e\      Z] e^g dddddddddddddddddddddddddddddddddddddddddddddddddd d d d d d d d d d dddddddd d d d d d d d d d d d d d d d d d d d d d d d d d ddddddd d d d d d d d d d d d d d d d d d d d d d d d d d dddddgd,z        Z_h d-Z`d.eUdeaeU   fd/Zb	 	 	 	 dd2eUd3eUd4eVd5ecd6ecdeUfd7Zdd8 Zed9 Zfd: ZgdeUfd;Zhd< Zid= Zjd> Zkd.eUdeUfd?Zld@ ZmddAZnedfdBZoedfdCeUdDeUdEeVdFeUde"eUe#f   f
dGZpddHZqddIZrdJ ZsdK ZtdL Zu	 	 	 	 	 ddMZv	 	 	 	 	 ddNZweddfdOZxddPZydQ Zz	 ddCeUdReadSeUdTeUdeaf
dUZ{dV Z|dW Z}dX Z~dY Zd0d0d1dd1ddZd[eUd\eUfd]Zdd^Z e=d_`      dda       Zdb ZdDeUd\eUde"eUe$e"eUeUf      f   fdcZdCeUdeUfddZdCeUdeeUdecfdfZdgeaeU   deaeU   fdhZdi ZdjeUdeUfdkZdleUdeUfdmZdneUde"eUeUf   fdoZdp ZddqeVfdrZds ZddtZ G du dv      ZddwZdx Zdy ZdzeUd{eUd|eUd\eUdef
d}Zd~eUdCeUdeUdeUdeUdedeWfdZ	 	 	 	 dde%eW   de%eW   decdecdeWf
dZ	 	 	 dde$eU   de%e"   deUdeVdeIj.                  f
dZ	 	 	 dde$eU   de%e"   deUdeVdeIj.                  f
dZdeIj.                  deIj.                  deWfdZdeIj.                  deIj.                  deWfdZdeWfdZdeWfdZde(eWeWeWf   fdZde"eUe&f   de"eUeUf   fdZy)    N)ThreadPoolExecutoras_completed)BeautifulSoupCommentelementTagNavigableString   )PROMPT_EXTRACT_BLOCKS)array)	html2textCustomHTML2Text)MIN_WORD_THRESHOLD$IMAGE_DESCRIPTION_MIN_WORD_THRESHOLDIMAGE_SCORE_THRESHOLDDEFAULT_PROVIDERPROVIDER_MODELS)gaierror)Path)DictAnyListOptionalCallable	GeneratorTupleIterableurljoin)InvalidSchemawraps)etreehtml)RobotFileParser)	lru_cache)version)__version__)Sequence)chain)deque)r   urlparse
urlunparse	parse_qsl	urlencodequoteunquote)RuleLinec                    d| j                   v sd| j                   v s| j                   dv r| j                   j                  dd      }t        j                  |      j                  dd      }d|z   }|j	                  d      r|d d d	z   }	 t        t        j                  ||            S t        | |      S # t        j                  $ r t        | |      cY S w xY w)
N*%2A)r4   r5   z\*z.*^z\$$)	pathreplacereescapeendswithboolmatcherrororiginal_applies_to)selffilenamepatterns      J/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/crawl4ai/utils.pypatched_applies_torF   ;   s    	TYY%499,		\0I""5#.w7#++E48www


5
!Sb\C'76rxx233 dH
-- hh 6%dH556s   B6 6CC	documentschunk_token_thresholdoverlapword_token_rate	tokenizerreturnc              #     K   t               }t               }d}| D ]  }|r ||      }	dgt        |	      z  }
n|j                         }	|gt        |	      z  }
|j                  |	       |j                  |
       |t	        |
      z  }||k\  ssg }g }d}|rT|d   }||z   |kD  rnF||z  }|j                  |j                                |j                  |j                                |rT|s>|j                  |j                                |j                  |j                                d}d}t        |      D ]  }||z   |kD  r n||z  }|dz  } |dkD  rE|| d }|| d }|j                  t        |             |j                  t        |             ||z  }|t	        |      z  }dj                  |r|dt        |      |z
   n|       ||k\  rI |rdj                  |       yyw)a  
    Efficiently chunks documents into token-limited sections with overlap between chunks.

    Args:
        documents: Iterable of document strings
        chunk_token_threshold: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks
        word_token_rate: Token estimate per word when not using a tokenizer
        tokenizer: Function that splits text into tokens (if available)

    Yields:
        Text chunks as strings
                  ?r   r
   N )
r+   lensplitextendsumappendpopleftreversed
extendleftjoin)rG   rH   rI   rJ   rK   token_queuecontribution_queuecurrent_token_countdoctokenscontributionschunk_tokenschunk_contribchunk_totalnext_contriboverlap_totaloverlap_idxcontriboverlap_tokensoverlap_contribs                       rE   chunk_documentsri   L   s8    ( 'K 8js^F ECK/MYY[F,-F;M 	6"!!-0s=11 "%::LMK %1!4-0EE|+$$%7%?%?%AB##K$7$7$9: % !$$%7%?%?%AB##K$7$7$9:  MK#M2 ! 7*W4(q 	! Q!-{lm!<"/">&&x'?@"--h.GH#}4#  3}#55((;<(F\):;)FG\hiiS "%::8jv hh{## s   BG1AG1*C+G1G1docstarget_sizeword_token_ratiosplitterc                    |xs t         j                  }t        d      }g }d}| D ]K  } ||      }	t        t	        |	      |z        }
|
s%|j                  |
       |j                  |	       ||
z  }M |sg S t        d||z   dz
  |z        }t        |      D cg c]  }g  }}d}d}t        j                  |      D ]b  }	||k\  rB||dz
  k  r:|dkD  r.||   | d }|dz  }||   j                  |       t	        |      }n|dz  }d}||   j                  |	       |dz  }d |D cg c]  }|sdj                  |       c}S c c}w c c}w )a`  
    Merges a sequence of documents into chunks based on a target token count, with optional overlap.
    
    Each document is split into tokens using the provided splitter function (defaults to str.split). Tokens are distributed into chunks aiming for the specified target size, with optional overlapping tokens between consecutive chunks. Returns a list of non-empty merged chunks as strings.
    
    Args:
        docs: Sequence of input document strings to be merged.
        target_size: Target number of tokens per chunk.
        overlap: Number of tokens to overlap between consecutive chunks.
        word_token_ratio: Multiplier to estimate token count from word count.
        splitter: Callable used to split each document into tokens.
    
    Returns:
        List of merged document chunks as strings, each not exceeding the target token size.
    Ir   r
   NrP   )strrR   r   intrQ   rU   maxranger*   from_iterablerS   rY   )rj   rk   rI   rl   rm   token_counts
all_tokenstotal_tokensr]   r^   count
num_chunks_chunks
curr_chunk	curr_sizerg   chunks                     rE   merge_chunksr      s   . $399H:L"$JL "#CK"223&f%E!L" 	 Q3a7KGHJ+0+<=ar=F=JI %%j1 #
Z!^(C{!'
!3WHI!>a
z")).9/	a
	z!!&)Q	 *095CHHUO99+ >* :s   	E'E
/E
c                   $    e Zd Zd Zd Zd Zd Zy)VersionManagerc                 d    t        j                         dz  | _        | j                  dz  | _        y )N	.crawl4aizversion.txt)r   homehome_dirversion_filerB   s    rE   __init__zVersionManager.__init__   s%    		k1 MMM9    c                     | j                   j                         sy	 t        j                  | j                   j	                         j                               S # t        $ r
}Y d}~yd}~ww xY w)z*Get the version recorded in home directoryN)r   existsr'   parse	read_textstrip	Exception)rB   _exs     rE   get_installed_versionz$VersionManager.get_installed_version   sV      '')	==!2!2!<!<!>!D!D!FGG 		s   :A 	A+&A+c                 V    | j                   j                  t        j                         y)z2Update the version file to current library versionN)r   
write_textr(   r   s    rE   update_versionzVersionManager.update_version   s    $$[%<%<=r   c                 ~    | j                         }t        j                  t        j                        }|du xs ||k  S )z/Check if database needs update based on versionN)r   r'   r   r(   )rB   	installedcurrents      rE   needs_updatezVersionManager.needs_update   s9    ..0	-- 7 78D 7I$77r   N)__name__
__module____qualname__r   r   r   r    r   rE   r   r      s    :>8r   r   c                   j    e Zd ZdZddZd Zdedeeef   fdZ	dedefd	Z
dd
ededefdZd Zd Zy)RobotsParseri:	 Nc                 R   |xs) t         j                  j                  t               dd      | _        |xs | j
                  | _        t        j                  | j                  d       t         j                  j                  | j                  d      | _        | j                          y )Nr   robotsTexist_okzrobots_cache.db)
osr9   rY   get_home_folder	cache_dir	CACHE_TTL	cache_ttlmakedirsdb_path_init_db)rB   r   r   s      rE   r   zRobotsParser.__init__   si    "\bggll?3DkS[&\"4dnn
DNNT2ww||DNN4EFr   c                     t        j                  | j                        5 }|j                  d       |j                  d       |j                  d       d d d        y # 1 sw Y   y xY w)NzPRAGMA journal_mode=WALa  
                CREATE TABLE IF NOT EXISTS robots_cache (
                    domain TEXT PRIMARY KEY,
                    rules TEXT NOT NULL,
                    fetch_time INTEGER NOT NULL,
                    hash TEXT NOT NULL
                )
            z=CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)sqlite3connectr   executerB   conns     rE   r   zRobotsParser._init_db  sZ    __T\\* 
	ZdLL23LL   LLXY
	Z 
	Z 
	Zs   4AA&domainrL   c                 "   t        j                  | j                        5 }|j                  d|f      }|j	                         }|s
	 ddd       y|\  }}}|t        j
                         |z
  | j                  k  fcddd       S # 1 sw Y   yxY w)z+Get cached rules. Returns (rules, is_fresh)zASELECT rules, fetch_time, hash FROM robots_cache WHERE domain = ?N)NF)r   r   r   r   fetchonetimer   )rB   r   r   cursorresultrules
fetch_timerz   s           rE   _get_cached_ruleszRobotsParser._get_cached_rules  s    __T\\* 	Fd\\S	F __&F"	F 	F $* E:q499;3t~~EE	F 	F 	Fs   'B+BBcontentc           
         t        j                  |j                               j                         }t	        j
                  | j                        5 }|j                  d|f      }|j                         }|r|d   |k7  r1|j                  d||t        t        j                               |f       ddd       y# 1 sw Y   yxY w)z7Cache robots.txt content with hash for change detectionz.SELECT hash FROM robots_cache WHERE domain = ?r   zINSERT OR REPLACE INTO robots_cache 
                       (domain, rules, fetch_time, hash) 
                       VALUES (?, ?, ?, ?)N)hashlibmd5encode	hexdigestr   r   r   r   r   rq   r   )rB   r   r   hash_valr   r   r   s          rE   _cache_ruleszRobotsParser._cache_rules%  s    ;;w~~/0::<__T\\* 	d\\@	F __&F VAY(2. Wc$))+&6A		 	 	s   AB99Curl
user_agentc                 t  K   	 t        |      }|j                  }|sy	 | j                  |      \  }}|s	 |j                  xs d}| d| d}	t        j                         4 d{   }
|
j                  |	dd      4 d{   }|j                  d	k(  r+|j                          d{   }| j                  ||       n"	 ddd      d{    ddd      d{    yddd      d{    ddd      d{    |syt               }|j                  |j                                |j                         sy|j                  ||      S # t        $ r
}Y d}~yd}~ww xY w7 7 7 7 7 7 # 1 d{  7  sw Y   xY w7 # 1 d{  7  sw Y   xY w# t        $ r
}Y d}~yd}~ww xY ww)
a2  
        Check if URL can be fetched according to robots.txt rules.
        
        Args:
            url: The URL to check
            user_agent: User agent string to check against (default: "*")
            
        Returns:
            bool: True if allowed, False if disallowed by robots.txt
        TNhttpz://z/robots.txt   F)timeoutssl   )r,   netlocr   r   schemeaiohttpClientSessiongetstatustextr   r%   r   
splitlinesmtime	can_fetch)rB   r   r   parsedr   r   r   is_freshr   
robots_urlsessionresponseparsers                rE   r   zRobotsParser.can_fetch9  s    	c]F]]F  008x 0& &xs6(+>
"002 ( (g&{{:qe{L ( (PX#??c1*2--/$9E --fe<#'( (( ( (( (( (  !"U%%'( ||~
C00G  		(($9((( ( ( (( ( ( (  s#  F8E F80F" &E)'F" *FE,F#E6*E.+E6FE0FF" E2F" #F8$F/E40F4F" ?F F" AF8	E&F8!E&&F8)F" ,F.E60F2F" 4F6F	<E?=F	FF" FFFF" "	F5+F80F55F8c                     t        j                  | j                        5 }|j                  d       ddd       y# 1 sw Y   yxY w)z#Clear all cached robots.txt entrieszDELETE FROM robots_cacheNr   r   s     rE   clear_cachezRobotsParser.clear_cacheo  s6    __T\\* 	5dLL34	5 	5 	5s	   ;Ac                     t        j                  | j                        5 }t        t	        j                               | j
                  z
  }|j                  d|f       ddd       y# 1 sw Y   yxY w)z&Remove only expired entries from cachez-DELETE FROM robots_cache WHERE fetch_time < ?N)r   r   r   rq   r   r   r   )rB   r   expire_times      rE   clear_expiredzRobotsParser.clear_expiredt  sW    __T\\* 	Zddiik*T^^;KLLH;.Y	Z 	Z 	Zs   >A''A0NN)r4   )r   r   r   r   r   r   rp   tupler>   r   r   r   r   r   r   r   rE   r   r      sj     IZF Fc4i0@ F 3  (413 41C 41$ 41l5
Zr   r   c                       e Zd Zy)InvalidCSSSelectorErrorN)r   r   r   r   r   rE   r   r   {  s    r   r      >B      –   —   •   …   ™   ⇒   ≈   ≤   ≥   ►   ▼!=&&*=+=---=/=<<=>>>?.?:??||<=>===r4   rz      ©   «   ®   »   →!"#r8   %&()+,-./:;<=>?@[\]r6   `{|}~++, ==r   c                    g }t        d      }d}t        |       }||k  r| |   }t        |      }|dk  r6t        |   r-|r|j	                  |j                                t        d      }np||dz
  k  rW|| |dz      z   }|t        v r2|r*|j	                  |j                                t        d      }|dz  }n#|j	                  |       n|j	                  |       |dz  }||k  r|r|j	                  |j                                |S )Nur      r
   )r   rQ   ordSPLITSrU   	tounicodeHTML_CODE_CHARS)r   r   worditext_lencharo	two_charss           rE   advanced_splitr0    s    F:D	A4yH
h,AwI s7vaydnn./SzAtAE{*IO+MM$.."23 :DQD!KK	Q+ h,. dnn&'Mr   TFmessagetypewidthadd_newlinesdouble_linec                 >   dddddd}|j                  |j                         |d         \  }}}dd	d
}	|rdnd}
|	|
   \  }}}}}}g }| j                  d      }|r| d|d   j                          }t	        j
                  ||dz
        }|j                  |j                  d             |dd D ]o  }|j                         rLt	        j
                  d|j                          |dz
        }|j                  |j                  d             _|j                  d       q ||dz
  z  }d| d| | | d| d	g|D cg c]%  }d| d| d| d|d|dz
   dd| d| d| d| d' c}d| d| | | d| d	}dj                  |      }|rd| d}|S c c}w )a  
    Create a styled message box with colored borders and formatted text.

    How it works:
    1. Determines box style and colors based on the message type (e.g., info, warning).
    2. Wraps text to fit within the specified width.
    3. Constructs a box using characters (single or double lines) with appropriate formatting.
    4. Adds optional newlines before and after the box.

    Args:
        message (str): The message to display inside the box.
        type (str): Type of the message (e.g., "info", "warning", "error", "success"). Defaults to "info".
        width (int): Width of the box. Defaults to 120.
        add_newlines (bool): Whether to add newlines before and after the box. Defaults to True.
        double_line (bool): Whether to use double lines for the box border. Defaults to False.

    Returns:
        str: A formatted string containing the styled message box.
    )yellowbright_yellowu   ⚠)bluebright_blueu   ℹ)
lightblackbright_blacku   ⋯)greenbright_greenu   ✓)red
bright_red   ×)warninginfodebugsuccessr@   rC  )u   ─u   │u   ┌u   ┐u   └u   ┘)u   ═u   ║u   ╔u   ╗u   ╚u   ╝)singledoublerG  rF  
rP   r      )r3  r
   N   r  r  z[/z] r  r   z][)	r   lowerrR   r   textwrapfillrS   rU   rY   )r1  r2  r3  r4  r5  stylesborder_color
text_colorprefix	box_chars
line_styleh_linev_linetltrblbrformatted_lines	raw_lines
first_linewrapped_firstlinewrappedhorizontal_lineboxr   s                             rE   create_box_messagerc    s'   : 6.63,F (.zz$**,v'O$L*f =<I )hJ%.z%:"FFBB Od#Ixq1!3!3!5 67
 j	B}22489abM 	+Dzz|"--"TZZ\N(;519M&&w}}T':;&&r*	+ 	*O
L>2$/t2l^1E (

 ~Qvha
|2d1U1WIP+5FbTVWcVddeflemmop|o}}~

 L>2$/t2l^1EC YYs^FfXRM

s   *Fc                      t        j                         } t               dz  }t        d| dz        }t	        |dz        }t        ||      S )a  
    Calculate the optimal semaphore count based on system resources.

    How it works:
    1. Determines the number of CPU cores and total system memory.
    2. Sets a base count as half of the available CPU cores.
    3. Limits the count based on memory, assuming 2GB per semaphore instance.
    4. Returns the minimum value between CPU and memory-based limits.

    Returns:
        int: The calculated semaphore count.
       @r
   r   )r   	cpu_countget_system_memoryrr   rq   min)rf  	memory_gb
base_countmemory_based_caps       rE   calculate_semaphore_countrl    sI     I!#w/IQ	Q'J9q=)z+,,r   c                    	 t        j                         } | dk(  rYt        dd      5 }|D ]=  }|j                  d      st	        |j                         d         dz  c cddd       S  	 ddd       y| dk(  r@d	dl} |j                  g d
      j                  d      }t	        |j                               S | dk(  rd	dl
		j                  j                  }	j                   G 	fdd	j                        } |       }	j                  |      |_        |j#                  	j%                  |             |j&                  S t)        d      # 1 sw Y   yxY w)af  
    Get the total system memory in bytes.

    How it works:
    1. Detects the operating system.
    2. Reads memory information from system-specific commands or files.
    3. Converts the memory to bytes for uniformity.

    Returns:
        int: The total system memory in bytes.

    Raises:
        OSError: If the operating system is unsupported.
    Linuxz/proc/meminforz	MemTotal:r
      NDarwinr   )sysctlz-nz
hw.memsizeutf-8Windowsc            
           e Zd ZdW j                  fdW j                  fdW  fdW  fdW  fdW  fdW  fdW  fd	W  fg	Zy
))get_system_memory.<locals>.MEMORYSTATUSEXdwLengthdwMemoryLoadullTotalPhysullAvailPhysullTotalPageFileullAvailPageFileullTotalVirtualullAvailVirtualullAvailExtendedVirtualN)r   r   r   c_ulong_fields_)c_ulonglongctypess   rE   MEMORYSTATUSEXrv  L  sZ    V^^,0--#[1#[1"K0"K0*K8
Hr   r  zUnsupported operating system)platformsystemopen
startswithrq   rR   
subprocesscheck_outputdecoder   r  windllkernel32r  	Structuresizeofrw  GlobalMemoryStatusExbyrefry  OSError)
r  memr_  r  outputr  r  memoryStatusr  r  s
           @@rE   rg  rg  +  s=     __F/3' 	73 7??;/tzz|A/$66	7 	77	7 	7 
8	((()GHOOPWX6<<>""	9	==))((	V-- 	 &' &n =%%fll<&@A(((455C	7 	7s   E  E*EEc            
      P   t         j                  j                  t        j                  dt        j                  dt	        j
                                     d      } t        j                  | d       t        j                  |  dd       t        j                  |  dd       | S )at  
    Get or create the home folder for Crawl4AI configuration and cache.

    How it works:
    1. Uses environment variables or defaults to the user's home directory.
    2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist.
    3. Returns the path to the home folder.

    Returns:
        str: The path to the Crawl4AI home folder.
    CRAWL4_AI_BASE_DIRECTORYr   Tr   z/cachez/models)r   r9   rY   getenvr   r   r   )home_folders    rE   r   r   a  s     '',,
		&II0$))+>	
 	K KKd+KK;-v&6KK;-w'$7r   c                   K   dddd}|j                  |       } | st        d|        t               }t        j                  j                  || j                          d      }t        j                  j                  |      r&t        |d      5 }|j                         cddd       S d	d
l
m}  |       4 d{   }|j                  |j                  |j                  d}| j                         |vr+t        ddj                  |j!                                      t               }|| j                            j"                  }|st        d|        t        t        j                  j                  || j                          d      d      5 }|j%                  |       ddd       |cddd      d{    S # 1 sw Y    xY w7 # 1 sw Y   ,xY w7 !# 1 d{  7  sw Y   yxY ww)a  Returns the browser executable path using playwright's browser management.
    
    Uses playwright's built-in browser management to get the correct browser executable
    path regardless of platform. This ensures we're using the same browser version
    that playwright is tested with.
    
    Returns:
        str: Path to browser executable
    Raises:
        RuntimeError: If browser executable cannot be found
    chromiumfirefoxwebkit)r  r  r  zUnsupported browser type: z.pathro  Nr   )async_playwrightz&Invalid browser type. Must be one of: r!  z'Browser executable not found for type: w)r   RuntimeErrorr   r   r9   rY   rL  r   r  readplaywright.async_apir  r  r  r  
ValueErrorkeysexecutable_pathwrite)	browser_typebrowser_typesr  	path_filefr  pbrowsersbrowser_paths	            rE   get_chromium_pathr  z  s     M !$$\2L7~FGG "#K[\-?-?-A,B%*HII	ww~~i )S! 	Q668	 	 6!  Q

yyhh
 x/88==?9S8TU 
 &' 2 2 45EE!HWXX"'',,{|/A/A/C.DE,JKSQ 	"UVGGL!	" +  		 	$	" 	"%   sy   BG3G  G3;G<G3?CGG%	G.G3:G;G3 G
	G3G	GG3G0$G'%G0,G3c                 h    t        j                  |       }t        |d      }|j                         }|S )z
    Beautifies an escaped HTML string.

    Parameters:
    escaped_html (str): A string containing escaped HTML.

    Returns:
    str: A beautifully formatted HTML string.
    html.parser)r$   unescaper   prettify)escaped_htmlunescaped_htmlsouppretty_htmls       rE   beautify_htmlr    s1     ]]<0N 7D--/Kr   c                    | j                  d      r$| j                  d      r| dd j                         } g }d}d}t        |       D ]?  \  }}|dk(  r|dk(  r|}|dz  }|dk(  s|dz  }|dk(  s)|j	                  | ||dz           A g }g }|D ])  }	 t        j                  |      }	|j	                  |	       + ||fS # t
        j                  $ r |j	                  |       Y Vw xY w)a  
    Splits a JSON string which is a list of objects and tries to parse each object.

    Parameters:
    json_string (str): A string representation of a list of JSON objects, e.g., '[{...}, {...}, ...]'.

    Returns:
    tuple: A tuple containing two lists:
        - First list contains all successfully parsed JSON objects.
        - Second list contains the string representations of all segments that couldn't be parsed.
    r  r  r
   r   r  r  )r  r=   r   	enumeraterU   jsonloadsJSONDecodeError)
json_stringsegmentsdepthstart_indexr+  r-  parsed_objectsunparsed_segmentssegmentobjs
             rE   split_and_parse_json_objectsr    s    c"{';';C'@!!B'--/ HEK[) B43;zQJES[QJEzK!a% @AB N .	.**W%C!!#&. ,,, ## 	.$$W-	.s   &B??$C&%C&c                 N    | }|j                  dd      j                  dd      }|S )a<  
    Sanitize an HTML string by escaping quotes.

    How it works:
    1. Replaces all unwanted and special characters with an empty string.
    2. Escapes double and single quotes for safe usage.

    Args:
        html (str): The HTML string to sanitize.

    Returns:
        str: The sanitized HTML string.
    r  \"'z\')r:   )r$   sanitized_htmls     rE   sanitize_htmlr    s0      N $++C7??UKNr   c                 .   	 	 | sy| j                  dd      j                  d      S # t        $ r:}t        d|        | j                  dd      j                  d      cY d}~S d}~ww xY w# t        $ r}t        dt        |             |d}~ww xY w)	z3Sanitize input to handle potential encoding issues.rK  rs  ignoreerrorszFWarning: Encoding issue detected. Some characters may be lost. Error: asciiNzError sanitizing input: )r   r  UnicodeEncodeErrorprintr   r  rp   )r   es     rE   sanitize_input_encoder  	  s    E
	I;;wx;8??HH! 	IXYZX[\ ;;wx;8??HH	I  E3CF8<=1DEs<   ( !( 	A+/A& A+!A. &A++A. .	B7BBc                 2   | j                  dd      } | j                  dd      } | j                  dd      } | j                  dd      } | j                  d	d
      } | j                  dd      } | j                  dd      } t        j                  dd |       } | S )z
    Escapes characters in a string to be JSON safe.

    Parameters:
    s (str): The input string to be escaped.

    Returns:
    str: The escaped string, safe for JSON encoding.
    r  z\\r  r  z\bz\frH  z\nz\r	z\tz[\x00-\x1f\x7f-\x9f]c                 R    dj                  t        | j                                     S )Nz\u{:04x})formatr&  group)xs    rE   <lambda>z$escape_json_string.<locals>.<lambda>4  s    +2D2DS^2T r   )r:   r;   sub)ss    rE   escape_json_stringr    s     	
		$A 	
		#uA 	
		$A			$A			$A			$A			$A 	&(TVWXAHr   c                    i dd dd dd dd d	d
 dd dd dd dd dd dd dd dd dd dd dd  d!d" d# d$ d% d& d'}|D cg c]  }||j                  |d(       f }}|D ]B  \  }}| j                  |      D ])  }|r|j                  n ||      }|j                  |       + D | S c c}w ))aN  
    Replace inline HTML tags with Markdown-style equivalents.

    How it works:
    1. Maps specific tags (e.g., <b>, <i>) to Markdown syntax.
    2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object.
    3. Optionally replaces tags with their text content only.

    Args:
        soup (BeautifulSoup): Parsed HTML content.
        tags (List[str]): List of tags to replace.
        only_text (bool): Whether to replace tags with plain text. Defaults to False.

    Returns:
        BeautifulSoup: Updated BeautifulSoup object with replaced tags.
    bc                 "    d| j                    dS Nz**r   tags    rE   r  z%replace_inline_tags.<locals>.<lambda>L      2chhZr* r   r+  c                 "    d| j                    dS Nr4   r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>M      1SXXJa r   r$  c                 "    d| j                    dS )N__r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>N  r  r   spanc                     | j                    S Nr  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>O      sxxj r   delc                 "    d| j                    dS Nz~~r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>P      Rz, r   insc                 "    d| j                    dS )Nr   r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>Q  r  r   r  c                 "    d| j                    dS )Nr  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>R      Qsxxj? r   supc                 "    d| j                    dS )Nz^^r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>S  r  r   strongc                 "    d| j                    dS r  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>T  s    388*B/ r   emc                 "    d| j                    dS r  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>U  s    AchhZq/ r   codec                 "    d| j                    dS Nr  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>V      azO r   kbdc                 "    d| j                    dS r  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>W  r  r   varc                 "    d| j                    dS Nrz   r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>X  r  r   r  c                 "    d| j                    dS r  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>Y  r  r   qc                 "    d| j                    dS )Nr  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>Z  r  r   abbrc                 F    | j                    d| j                  dd       dS )Nz (titlerK  r  )r   r   r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>[  s#    sxxj3777B+?*@B r   citec                 "    d| j                    dS r	  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>\  r  r   c                 "    d| j                    dS r	  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>]  r  r   c                     | j                    S r  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>^  r  r   c                 "    d| j                    dS )Nz<small>z</small>r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>_  s    wsxxj9 r   c                 "    d| j                    dS )Nr"  r  r  s    rE   r  z%replace_inline_tags.<locals>.<lambda>`  s    b
"- r   )dfnr   smallmarkc                     | j                   S r  r  )ts    rE   r  z%replace_inline_tags.<locals>.<lambda>d  s
    !&& r   )r   find_allr   replace_with)	r  tags	only_texttag_replacementsr  replacement_datatag_namereplacement_funcreplacement_texts	            rE   replace_inline_tagsr$  9  s   $*( 	* 	)	
 	, 	, 	* 	, 	/ 	) 	+ 	* 	* 	* 	(  	B!" 	+#$ +)9-+2 GK?B""3(89:  '7 /""==* 	/C+4sxx:J3:O-.	//
 Ks   B=c                 ~   	 |syt        |d      }|j                  }|rL|j                  |      }|st        d|       |j	                  d      }|D ]  }	|j                  |	        |}g g d}
|j                  dd      D ]{  }|d	   }| j                  d
      d   }|j                  d      r*||vr&|
d   j                  ||j                         d       W|
d   j                  ||j                         d       } |j                  g d      D ]  }|j                           |j                         D ]  }|j                  dk7  si |_         g g g d}|j                  d      D ]8  }|d   j                  |j                  d      |j                  d      dd       : |j                  d      D ]8  }|d   j                  |j                  d      |j                  d      dd       : |j                  d      D ]8  }|d   j                  |j                  d      |j                  d      dd       : |j                  d      D ]F  }|j                  d      }|r!|j                  |j                  |             7|j                          H d } ||      }t!        |g d|j                  dd             }fd! ||      }t"        fd"t$        d#t&        fd$} |||      }d%t$        ffd&d"t$        ffd'} ||      }fd( |      }|j                  d) *      D ]  }|j)                           t+        |      j-                  d+d,      j-                  d-d.      }t/        |      }t1        j2                         }t5               }d|_        |j9                  |      }|j-                  d/d0      }	 t;        ||      }||d||
|d2S # t<        $ r!}t?        d1t+        |             i }Y d}~.d}~ww xY w# t<        $ r)}t?        d3t+        |             t        d4|       |d}~ww xY w)5a4  
    Extract structured content, media, and links from website HTML.

    How it works:
    1. Parses the HTML content using BeautifulSoup.
    2. Extracts internal/external links and media (images, videos, audios).
    3. Cleans the content by removing unwanted tags and attributes.
    4. Converts cleaned HTML to Markdown.
    5. Collects metadata and returns the extracted information.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
        css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.

    Returns:
        Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
    Nr  z;Invalid CSS selector , No elements found for CSS selector: divinternalexternalaT)hrefr+  r  r   r   r)  r+  r   r(  scriptstylelinkmetanoscriptimgimagesvideosaudiosr5  srcaltimage)r8  r9  r2  videor6  audior7  c                 \    | j                  d      D ]  }|j                         |_         | S )Npre)r  get_textstring)nodechilds     rE   replace_pre_tags_with_textz:get_content_of_website.<locals>.replace_pre_tags_with_text  s/    u- 0$~~/0 Kr   r  r+  r$  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  F)r  c                 D   | j                   D ]  }t        |t        j                        s ||       t	        |j                  d      j                               }t	        |j                         dk(  r|j                  d      r||k  s|j                           | S )NTr   r   )contents
isinstancer   r   rQ   r?  rR   	decompose)rA  word_count_thresholdrB  
word_count(remove_empty_and_low_word_count_elementss       rE   rL  zHget_content_of_website.<locals>.remove_empty_and_low_word_count_elements  s     	*eW[[1<3 "%U^^$^%?%E%E%G!HJENN+q0d9S#&::)	* Kr   bodyrJ  c                 D   g }| j                  d      D ]r  }|j                  s|j                  j                         s+t        |j                  j                         j	                               }||k  sb|j                  |       t |D ]  }|j                           | S )NT)r  r@  r   rQ   rR   rU   rI  )rM  rJ  tags_to_remover  rK  s        rE   remove_small_text_tagsz6get_content_of_website.<locals>.remove_small_text_tags  s      N }}T* 3::#**"2"2"4!$SZZ%5%5%7%=%=%?!@J!$88&--c23 &    Kr   r  c                     t        | t              r| j                          S | j                  syt	        fd| j                  D              S )NTc              3   .   K   | ]  } |        y wr  r   ).0rB  is_empty_or_whitespaces     rE   	<genexpr>zIget_content_of_website.<locals>.is_empty_or_whitespace.<locals>.<genexpr>*  s     O-e4Os   )rH  r	   r   rG  all)r  rT  s    rE   rT  z6get_content_of_website.<locals>.is_empty_or_whitespace$  s:    #/99;&<<O#,,OOOr   c                     d}|rEd}| j                  d      D cg c]  } |      s| }}|D ]  }|j                          d} |rE| S c c}w )NTF)r  rI  )rM  changesr  
empty_tagsrT  s       rE   remove_empty_tagsz1get_content_of_website.<locals>.remove_empty_tags,  sn    G $(==#6:PQT:UC
  & #CMMO"G#  Ks
   AAc                 0   | j                   D ]  }t        |t        j                        s |       t	        |j                         dk(  s?|j                   d   j
                  |j
                  k(  sf|j                   d   }|j                  |        | S Nr
   r   )rG  rH  r   r   rQ   namer  )rA  rB  child_contentflatten_nested_elementss      rE   r_  z7get_content_of_website.<locals>.flatten_nested_elements@  s}     	:eW[[1+E2ENN+q0!NN1-22ejj@ ).q(9**=9	: Kr   c                 "    t        | t              S r  )rH  r   r  s    rE   r  z(get_content_of_website.<locals>.<lambda>Q  s    D'9R r   )r@  

rH  rJ  rP       ``````Error extracting metadata:markdowncleaned_htmlrE  medialinksmetadatazError processing HTML content:zInvalid CSS selector: ) r   rM  selectr   new_tagrU   r  rR   r  r?  rI  r]  attrsr   r  
new_stringr$  r   r   rq   extractrp   r:   r  r   	HTML2Textr   ignore_linkshandleextract_metadatar   r  ) r   r$   rJ  css_selectorkwargsr  rM  selected_elementsdiv_tagelri  r*  r+  url_baser  rh  r3  r;  r<  alt_textrC  rP  rZ  commentrg  hrf  r1  r  r_  rT  rL  s                                 @@@rE   get_content_of_websiter}  y  sc   .dVT=1 yy  $L 9$-QR^Q_`  ll5)G' #r"#DR0 s. 	OAV9Dyy~a(Hv&84+?j!(($

)MNj!(($

)MN	O ==!PQ 	CMMO	 ==? 	Cxx5 		
 r:==' 	C(O""swwu~wO	 ]]7+ 	E(O""		%(51A7S	 ]]7+ 	E(O""		%(51A7S	 ==' 	 Cwwu~H  !:;	 	 *$/ #. jje43
:	 8>RS 4F		-0	. &d,@A	P 	P	C 	" !&	 't, }},R}S 	GOO	 4y((6>>tSI %\2 !88L)##Iu5	#D$/D !(
 	
  	.A7D	  V.A7%(>|n&MNTUUVsN   P
 EP
 I8P
 O P
 	P&P=P
 PP
 
	P<$P77P<r   r$   rJ  rt  c                     |syt        |d      }|j                  }j                  dt              j                  dg       xs g D ](  }|j	                  |      D ]  }|j                           * |rJ|j	                  |      }	|	st        d|       |j                  d      }|	D ]  }|j                  |        g g dg g g dfd	fd
dt        j                  dt        f fd|j                  d       fdt              D        D 
cg c]  }
|
|
 c}
d<    |       fd |      }t        j                  d      }D ];  }	 |j                  dd      }|j!                  |      r|j#                  d|      |d<   = t'        |      j)                  dd      j)                  dd      }t+        |      }t-               }d|_        |j1                  |      }|j)                  dd      }	 t3        ||      }||d|dS c c}
w # t$        $ r
}Y d}~d}~ww xY w# t$        $ r!}t5        dt'        |             i }Y d}~Id}~ww xY w)a  
    Extracts and cleans content from website HTML, optimizing for useful media and contextual information.
    
    Parses the provided HTML to extract internal and external links, filters and scores images for usefulness, gathers contextual descriptions for media, removes unwanted or low-value elements, and converts the cleaned HTML to Markdown. Also extracts metadata and returns all structured content in a dictionary.
    
    Args:
        url: The URL of the website being processed.
        html: The raw HTML content to extract from.
        word_count_threshold: Minimum word count for elements to be retained.
        css_selector: Optional CSS selector to restrict extraction to specific elements.
    
    Returns:
        A dictionary containing Markdown content, cleaned HTML, extraction success status, media and link lists, and metadata.
    
    Raises:
        InvalidCSSSelectorError: If a provided CSS selector does not match any elements.
    Nr  $image_description_min_word_thresholdexcluded_tagsz:Invalid CSS selector, No elements found for CSS selector: r&  r'  r4  c                     | }|rB|j                   }|r1|j                  dd      }t        |j                               k\  r|S |rBy )NrP   T)	separatorr   )parentr?  rQ   rR   )r  current_tagtext_contentr  s      rE   $find_closest_parent_with_useful_textzNget_content_of_website_optimized.<locals>.find_closest_parent_with_useful_text  sU    %,,K*33c3N|))+,0TT''  r   c                 0   d }d } || | j                   | j                   j                  dg             sy || |||      }|t        k  ry| j                  dd      j                  dd      j	                         | j                  d	d       |       |d
dS )a0  
        Processes an HTML image element to determine its relevance and extract metadata.
        
        Evaluates an image's visibility, context, and usefulness based on its attributes and parent elements. If the image passes validation and exceeds a usefulness score threshold, returns a dictionary with its source, alt text, contextual description, score, and type. Otherwise, returns None.
        
        Args:
            img: The BeautifulSoup image tag to process.
            url: The base URL of the page containing the image.
            index: The index of the image in the list of images on the page.
            total_images: The total number of images on the page.
        
        Returns:
            A dictionary with image metadata if the image is considered useful, or None otherwise.
        c                     | j                  dd      }| j                  dd      }g dddg}t        d|v|t        fd|| j                  d	d      g|D               |j                  |vg      S )
Nr/  rK  r8  )buttoniconlogor  inputzdisplay:nonec              3   4   K   | ]  }D ]  }||v  
  y wr  r   )rS  r  r  classes_to_checks      rE   rU  zbget_content_of_website_optimized.<locals>.process_image.<locals>.is_valid_image.<locals>.<genexpr>  s2      !1  S s   r9  )r   rV  anyr]  )r3  r  parent_classesr/  r8  tags_to_checkr  s         @rE   is_valid_imagezOget_content_of_website_optimized.<locals>.process_image.<locals>.is_valid_image  s    GGGR(E''%$C9%w/M"%/ $');#Mn#M  
 KK}4	 r   c                 <   d }d }| j                  d      } ||      \  }}| j                  d      }	 ||	      \  }
}d}t        j                  j                  | j                  dd            d   j	                         j                  d	      d}|r|d
k(  r
|dkD  r|dz  }|dv r
|dkD  r|dz  }|
r|d
k(  r
|
dkD  r|dz  }|dv r
|
dkD  r|dz  }|dkD  r|dz  }| j                  d      dk7  r|dz  }t        fddD              r|dz  }||z  dk  r|dz  }|S )a<  
            Scores an HTML image element for usefulness based on size, format, attributes, and position.
            
            The function evaluates an image's dimensions, file format, alt text, and its position among all images on the page to assign a usefulness score. Higher scores indicate images that are likely more relevant or informative for content extraction or summarization.
            
            Args:
                img: The HTML image element to score.
                base_url: The base URL used to resolve relative image sources.
                index: The position of the image in the list of images on the page (zero-based).
                images_count: The total number of images on the page.
            
            Returns:
                An integer usefulness score for the image.
            c                     | rKt        j                  d|       }|r3t        |j                  d            }|j                  d      xs d}||fS y)Nz
(\d+)(\D*)r
   r   pxr   )r;   r?   rq   r  )	dimensionr?   numberunits       rE   parse_dimensionztget_content_of_website_optimized.<locals>.process_image.<locals>.score_image_for_usefulness.<locals>.parse_dimension  sN    HH]I>E!$U[[^!4!KKN2d   &t|+!r   c                     t        || j                  d            }	 t        j                  |      }|j                  dk(  r|j
                  j                  dd      S t        d|        y# t        $ r Y yw xY w)a  
                Fetches the file size of an image by sending a HEAD request to its URL.
                
                Args:
                    img: A BeautifulSoup tag representing the image element.
                    base_url: The base URL to resolve relative image sources.
                
                Returns:
                    The value of the "Content-Length" header as a string if available, otherwise None.
                r8  r   zContent-LengthNz!Failed to retrieve file size for )r   r   requestsheadstatus_codeheadersr  r    )r3  base_urlimg_urlr   s       rE   fetch_image_file_sizezzget_content_of_website_optimized.<locals>.process_image.<locals>.score_image_for_usefulness.<locals>.fetch_image_file_size  sx     "(CGGEN; '}}W5H++s2'//334DdKK A'KL#$   s   ?A, A, ,	A87A8heightr3  r   r8  rK  r
   r  r     )r  vhvminvmax   '  r9  c              3   (   K   | ]	  }|k(    y wr  r   )rS  r  image_formats     rE   rU  znget_content_of_website_optimized.<locals>.process_image.<locals>.score_image_for_usefulness.<locals>.<genexpr>$  s     Of<6)Os   )jpgpngwebp      ?)r   r   r9   splitextrL  r   r  )r3  r  indeximages_countr  r  image_heightheight_valueheight_unitimage_widthwidth_value
width_unit
image_sizescorer  s                 @rE   score_image_for_usefulnessz[get_content_of_website_optimized.<locals>.process_image.<locals>.score_image_for_usefulness  sV    	" . 778,L(7(E%L+'''*K&5k&B#KJ77++CGGE2,>?BHHJL'--c2LE$&<#+=QJE"==,QSBSQJE%+*;QJE!<<rAQQJEE!
wwu~#
O8NOO
|#c)
Lr   classNr8  rK  r  r  r9  r:  )r8  r9  descr  r2  )r  r   r   r:   r   )r3  r   r  total_imagesr  r  r  r  s          rE   process_imagez7get_content_of_website_optimized.<locals>.process_image  s     	&N	` c3::szz~~gr/JK*3ULI))775"%--eS9??A775"%8=
 	
r   r   rL   c           	      0   	 t        | t              r"t        | t              r | j                          y| j                  dv r | j
                          yd}| j                  dk(  r | j                  d      rn| d   }j                  d      d   }| | j                         d}|j                  d      r||vrd	   j                  |       nd
   j                  |       d}n| j                  dk(  ry| j                  dv rŉ| j                   d   j                   | j                  d       | j                  d      | j                   
|       d        | j                  d      }|D ]W  }| j                   d   j                  |j                  d       | j                  d      | j                   
|       d       Y y| j                  dk7  rj| j                  dv rFj                  dd      r" | j                   | j                                n( | j                          n| j                  dk7  ri | _        t        | j                         D ]M  }t        |t              r0t        |t              s t#        |j%                               dkD  s@d}C |      sLd}O |s/t#         | j                  d      j                               }|k\  }|s | j
                          |S # t&        $ r}	t)        dt+        |	             Y d }	~	yd }	~	ww xY w)NFr-  r*  r+  r  r   r,  r   r)  r(  Tr3  )r;  r<  r  r8  r9  )r8  r9  r2  descriptionsourcer>  rD  r  r   rF  zError processing element:)rH  r	   r   ro  r]  rI  r   rR   r?  r  rU   r  r  unwraprm  listchildrenrQ   r   r   r  rp   )r   keep_elementr+  ry  	link_datasource_tags
source_tagrB  rK  r  r  ru  ri  rh  process_elementr   rJ  s             rE   r  z9get_content_of_website_optimized.<locals>.process_element7  s   e	'?3gw/#GOO%||NN!!!# L||s"{w{{6':v99S>!,%)3C73C3C3EF	??6*xt/C*%,,Y7*%,,Y7#&!33a()00*w{{51*w{{51 ''KG'T	 /g..x8"- 
JW\\N!,-44#->>%#8#.7;;u#5$+LL+O ',		
 ||u$<< $ . zz+u5,,,-=W-=-=-?@&(\\U*$&GM g../ ,e_5j7? 5;;=)A-'+&u-'+,   !1!1!1!=!C!C!EF
)-AA!!!# 	-s1v6	s>   1K- K- B K- 6CK- 	CK- K- #A	K- -	L6LLr3  c           	   3   L   K   | ]  \  }} ||t                      y wr  rQ   )rS  r+  r3  imgsr  r   s      rE   rU  z3get_content_of_website_optimized.<locals>.<genexpr>  s)      
6<aM#sAs4y1
s   !$r5  c                 t   t        | t              r| S t        | j                        dk(  rbt        | j                  d   t        j
                        r;| j                  d   j                  | j                  k(  r | j                  d         S | j                  D cg c]
  } |       c}| _        | S c c}w r\  )rH  r	   rQ   rG  r   r   r]  )rA  rB  r_  s     rE   r_  zAget_content_of_website_optimized.<locals>.flatten_nested_elements  s    dO,K!#4==+W[[9a %%2*4==+;<<EI]]SE07S Ts   B5zdata:image/[^;]+;base64,([^"]+)r8  rK  ra  rH  rJ  rP   Trb  rc  rd  re  )r   rM  r   r   rk  rI  r   rl  rU   r   PageElementr>   r  r  r;   compiler?   r  r   rp   r:   r  r   rq  rr  rs  r  )r   r$   rJ  rt  ru  r  rM  r  rx  rv  r   base64_patternr3  r8  r   rg  r|  rf  r1  r  r  r_  r  r  ri  rh  r  r  s   ` ` `               @@@@@@@@rE    get_content_of_website_optimizedr  w  s   0 }-D99D+1::.0T,( zz/2.4" ++c" 	BLLN	  KK5 )L\N[  ||E"# 	BKKO	 ,ER26E
~
@f!4!4 f f fR ==D
@I$
 	 	E(O D
 #4(DZZ BCN 	''%$C##C(+//C8E
	 t9$$VT2::4EL .LAANxx%H	51Hd+ $ ]:  		  *CF3s0   *H%+8H*I  *	H=8H= 	I*	I%%I*c                 
   i }| s|i S | 	 t         j                  j                  |       }|j	                  d      }|s|S |d   }|j	                  d      }|r|d   nd}|s!|j                  d      }||j                  nd}|s-|j	                  d      xs |j	                  d      }|r|d   nd}|r|j                         nd|d<   |j	                  d	      }|r|d   j                         nd|d
<   |j	                  d      }|r|d   j                         nd|d<   |j	                  d      }	|	r|	d   j                         nd|d<   |j	                  d      }
|
D ]M  }|j                  dd      j                         }|j                  dd      j                         }|sF|sI|||<   O |j	                  d      }|D ]M  }|j                  dd      j                         }|j                  dd      j                         }|sF|sI|||<   O |j	                  d      }|D ]M  }|j                  dd      j                         }|j                  dd      j                         }|sF|sI|||<   O |S # t        $ r i cY S w xY w)zG
    Extract metadata from HTML using lxml for better performance.
    N//headr   z.//title/text()z.//titlez%//meta[@property='og:title']/@contentz&//meta[@name='twitter:title']/@contentr  z%.//meta[@name="description"]/@contentr  z".//meta[@name="keywords"]/@contentkeywordsz .//meta[@name="author"]/@contentauthorz&.//meta[starts-with(@property, "og:")]propertyrK  r   z'.//meta[starts-with(@name, "twitter:")]r]  z+.//meta[starts-with(@property, "article:")])	lxmlr$   document_fromstringr   xpathfindr   r   r   )r$   r]   rj  r  r  title_eltitle_candidatesr  r  r  og_tagsr  property_namer   twitter_tagsarticle_tagss                   rE   extract_metadata_using_lxmlr    s    HCK	
{	))//5C
 99XD7D JJ()EE!H4E 88J'!)!54 II=> @II>? 	 (8 #T */DHW **DEK8Ck!n224H] zz>?H2:8A;,,.HZ ZZ:;F.4*$HX jjABG .
B/557'')R(..0W&-H]#	. ::GHL .+113'')R(..0W&-H]#	. ::KLL .
B/557'')R(..0W&-H]#	. OO  	I	s   I4 4JJc                    i }| s|si S |st        | d      }|j                  }|s|S |j                  d      }|r&|j                  r|j                  j	                         nd|d<   |j                  dddi      }|r |j                  dd	      j	                         nd|d<   |j                  ddd
i      }|r |j                  dd	      j	                         nd|d
<   |j                  dddi      }|r |j                  dd	      j	                         nd|d<   |j                  ddt        j                  d      i      }|D ]M  }	|	j                  dd	      j	                         }
|	j                  dd	      j	                         }|
sF|sI|||
<   O |j                  ddt        j                  d      i      }|D ]M  }	|	j                  dd	      j	                         }
|	j                  dd	      j	                         }|
sF|sI|||
<   O |j                  ddt        j                  d      i      }|D ]M  }	|	j                  dd	      j	                         }
|	j                  dd	      j	                         }|
sF|sI|||
<   O |S )aR  
    Extract optimized content, media, and links from website HTML.

    How it works:
    1. Similar to `get_content_of_website`, but optimized for performance.
    2. Filters and scores images for usefulness.
    3. Extracts contextual descriptions for media files.
    4. Handles excluded tags and CSS selectors.
    5. Cleans HTML and converts it to Markdown.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
        css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.
        **kwargs: Additional options for customization.

    Returns:
        Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
    r  r  Nr1  r]  r  )rm  r   rK  r  r  r  z^og:z	^twitter:z	^article:)	r   r  r  r@  r   r   r  r;   r  )r$   r  rj  r  	title_tagdescription_tagkeywords_tag
author_tagr  r  r  r   r  r  s                 rE   rs  rs  /  s   , H	T6*99D 		'"I$-)2B2B	  W
 iiv}.EiFO6EIr*0024 ]
 99VFJ+?9@L3?B'--/T Z
 6&();<JBL	26<<>RVHX mmF:rzz'7J*KmLG .
B/557'')R(..0W&-H]#	. ==

<8P/Q=RL .+113'')R(..0W&-H]#	. ==
BJJ|<T/U=VL .
B/557'')R(..0W&-H]#	. Or   c                 V    t        j                  d|       }t        t        |            S )z
    Extracts XML tags from a string.

    Args:
        string (str): The input string containing XML tags.

    Returns:
        List[str]: A list of XML tags extracted from the input string.
    z<(\w+)>)r;   findallr  set)r@  r  s     rE   extract_xml_tagsr    s"     ::j&)DD	?r   c                     i }| D ]Z  }d| d| d}t        j                  ||t         j                        }|r#|j                  d      j	                         ||<   Vd||<   \ |S )a  
    Extract data for specified XML tags from a string.

    How it works:
    1. Searches the string for each tag using regex.
    2. Extracts the content within the tags.
    3. Returns a dictionary of tag-content pairs.

    Args:
        tags (List[str]): The list of XML tags to extract.
        string (str): The input string containing XML data.

    Returns:
        Dict[str, str]: A dictionary with tag names as keys and extracted content as values.
    r  >(.*?)</r  r
   rK  )r;   searchDOTALLr  r   )r  r@  datar  rD   r?   s         rE   extract_xml_data_legacyr    so    " D cU(3%q)		'62995A,,.DIDI Kr   c                     i }| D ]\  }d| d| d}t        j                  ||t         j                        }|r%t        |t              j                         }|||<   Xd||<   ^ |S )a,  
    Extract data for specified XML tags from a string, returning the longest content for each tag.

    How it works:
    1. Finds all occurrences of each tag in the string using regex.
    2. For each tag, selects the occurrence with the longest content.
    3. Returns a dictionary of tag-content pairs.

    Args:
        tags (List[str]): The list of XML tags to extract.
        string (str): The input string containing XML data.

    Returns:
        Dict[str, str]: A dictionary with tag names as keys and longest extracted content as values.
    r  r  r  keyrK  )r;   r  r  rr   rQ   r   )r  r@  r  r  rD   matcheslongest_contents          rE   extract_xml_datar    ss    " D 	cU(3%q)**Wfbii8!'s399;O'DIDI	 Kr   c                    ddl m}	 ddlm}
 ddl }d|_        d||d}|rdd	i|d
<   |j                  d      r|j                  |d          t        |      D ]  }	  |	d| d|dgd|}|c S  y# |
$ rm}t        dt        |             ||dz
  k(  r ||dz
  k  r-|||z  z  }t        d| d       t        j                  |       nddgdgdgcY d}~c S Y d}~d}~wt        $ r}|d}~ww xY w)a  
    Perform an API completion request with exponential backoff.

    How it works:
    1. Sends a completion request to the API.
    2. Retries on rate-limit errors with exponential delays.
    3. Returns the API response or an error after all retries.

    Args:
        provider (str): The name of the API provider.
        prompt_with_variables (str): The input prompt for the completion request.
        api_token (str): The API token for authentication.
        json_response (bool): Whether to request a JSON response. Defaults to False.
        base_url (Optional[str]): The base URL for the API. Defaults to None.
        base_delay (int): The base delay in seconds. Defaults to 2.
        max_attempts (int): The maximum number of attempts. Defaults to 3.
        exponential_factor (int): The exponential factor. Defaults to 2.
        **kwargs: Additional arguments for the API request.

    Returns:
        dict: The API response or an error message after all retries.
    r   )
completionRateLimitErrorNT{Gz?temperatureapi_keyr  r2  json_objectresponse_format
extra_argsuserroler   modelmessagesRate limit error:r
   Waiting for  seconds before retrying...r@   )Rate limit error. Please try again later.r  r  r   r   )litellmr  litellm.exceptionsr  drop_paramsr   updaters   r  rp   r   sleepr   )providerprompt_with_variables	api_tokenjson_responser  
base_delaymax_attemptsexponential_factorru  r  r  r  r  attemptr   r  delays                    rE   perform_completion_with_backoffr    s>   D #1G!%)RJ)/(?
$%zz,&./& 	! #)6KLM H
 O  	%s1v.,** ))"&8'&ABUG+FGH

5!
 "#!(	$O#P  "  	G	s+   A00C05ACC0C0)C++C0c                   K   ddl m}	 ddlm}
 ddl }ddl}d|_        d||d}|rdd	i|d
<   |j                  d      r|j                  |d          t        |      D ]  }	  |	d| d|dgd| d{   }|c S  y7 
# |
$ rs}t        dt        |             ||dz
  k(  r ||dz
  k  r3|||z  z  }t        d| d        |j                  |       d{  7   nddgdgdgcY d}~c S Y d}~d}~wt        $ r}|d}~ww xY ww)a  
    Async version: Perform an API completion request with exponential backoff.

    How it works:
    1. Sends an async completion request to the API.
    2. Retries on rate-limit errors with exponential delays (async).
    3. Returns the API response or an error after all retries.

    Args:
        provider (str): The name of the API provider.
        prompt_with_variables (str): The input prompt for the completion request.
        api_token (str): The API token for authentication.
        json_response (bool): Whether to request a JSON response. Defaults to False.
        base_url (Optional[str]): The base URL for the API. Defaults to None.
        base_delay (int): The base delay in seconds. Defaults to 2.
        max_attempts (int): The maximum number of attempts. Defaults to 3.
        exponential_factor (int): The exponential factor. Defaults to 2.
        **kwargs: Additional arguments for the API request.

    Returns:
        dict: The API response or an error message after all retries.
    r   )acompletionr  NTr  r  r2  r  r   r  r  r  r  r  r
   r	  r
  r@   r  r  r   )r  r  r  r  asyncior  r   r  rs   r  rp   r  r   )r  r  r  r  r  r  r  r  ru  r  r  r  r  r  r  r   r  r  s                     rE    aperform_completion_with_backoffr  )  sV    D $1G!%)RJ)/(?
$%zz,&./& 	( #)6KLM  H
 O  	%s1v.,** ))"&8'&ABUG+FGH#gmmE***
 "#!(	$O#P  +  	G	sa   AD	!B 3A>4B 9D	>B  DAC3CC3&D'D	3D?DDD	c                 2   |st        j                  |d      n|}| t        t        |            d}t        }|D ]  }|j                  d|z   dz   ||         } t        ||||      }	 t        dg|j                  d   j                  j                        d   }	t        j                  |	      }	|	D ]  }
d|
d	<   	 	 |	S # t        $ rN t        |j                  d   j                  j                        \  }}|}	|r|	j                  dd
d	g|d       Y |	S w xY w)a  
    Extract content blocks from website HTML using an AI provider.

    How it works:
    1. Prepares a prompt by sanitizing and escaping HTML.
    2. Sends the prompt to an AI provider with optional retries.
    3. Parses the response to extract structured blocks or errors.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER.
        api_token (Optional[str]): The API token for authentication. Defaults to None.
        base_url (Optional[str]): The base URL for the API. Defaults to None.

    Returns:
        List[dict]: A list of extracted content blocks.
    NURLHTMLr  r  r  blocksr   Fr@   T)r  r@   r  r   )r   r   r  r  r   r:   r  r  choicesr1  r   r  r  r   r  rU   )r   r$   r  r  r  variable_valuesr  variabler   r%  blockr   unparseds                rE   extract_blocksr+  z  sL   * <E##Hd3)I "=#67O
 2# 
 5 = =(NS /(";!


 /'XH!8*h.>.>q.A.I.I.Q.QR
 F# 	#E"E'N	# M  	7Q''//
 MMdWI(S M	s   *AB? ?ADDc                    |st        j                  dd      n|}ddlm} g }| D ]G  \  }}||d}t        }|D ]  }	|j                  d|	z   dz   ||	         } |j                  d|d	g       I  |||d
      }
g }|
D ]Z  }	 t        dg|j                  d   j                  j                        d   }t        j                  |      }|j                  |       \ t        |g       S # t        $ r ddgdgdgdg}Y 5w xY w)at  
    Extract content blocks from a batch of website HTMLs.

    How it works:
    1. Prepares prompts for each URL and HTML pair.
    2. Sends the prompts to the AI provider in a batch request.
    3. Parses the responses to extract structured blocks or errors.

    Args:
        batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs.
        provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192".
        api_token (Optional[str]): The API token for authentication. Defaults to None.

    Returns:
        List[dict]: A list of extracted content blocks from all batch items.
    GROQ_API_KEYNr   )batch_completionr!  r  r  r  r  r  )r  r  r  r%  r@   zZError extracting blocks from the HTML content. Choose another provider/model or try again.z4What went wrong during the block extraction process?)r  r  r   	questions)r   r  r  r.  r   r:   rU   r  r&  r1  r   r  r  r   rT   )
batch_datar  r  r.  r  r   _htmlr'  r  r(  	responses
all_blocksr   r%  s                 rE   extract_blocks_batchr4    sU   $ 8A		.$/iI(H  N
U

 !6' 	H$9$A$Ah$oh&?%!	
 	&5JKLMN !x(PTUIJ "	%xj(2B2B12E2M2M2U2UVF ZZ'F 	&!)", z2  	 $It  O"	F	s   AC''C>=C>c                 (   g }g }d}| D ]d  }t        |j                               dz  }||z   |k  r|j                  |       ||z  }>|r |j                  dj                  |             |g}|}f |r |j                  dj                  |             |S )a  
    Merges small chunks into larger ones based on the total token threshold.

    :param chunks: List of text chunks to be merged based on token count.
    :param token_threshold: Max number of tokens for each merged chunk.
    :return: List of merged text chunks.
    r   g?ra  )rQ   rR   rU   rY   )r{   token_thresholdmerged_sectionscurrent_chunktotal_token_so_farr~   chunk_token_counts          rE   %merge_chunks_based_on_token_thresholdr;    s     OM 3$ 	  11OC  '"33&&v{{='AB"GM!23 v{{=9:r   sectionsr  r  c                    g }|j                  d      r=|D ]6  }|j                  t        | ||||             t        j                  d       8 |S t               5 }|D cg c]  }|j                  t        | ||||       }}t        |      D ]!  }	|j                  |	j                                # 	 ddd       |S c c}w # 1 sw Y   |S xY w)a  
    Process sections of HTML content sequentially or in parallel.

    How it works:
    1. Sequentially processes sections with delays for "groq/" providers.
    2. Uses ThreadPoolExecutor for parallel processing with other providers.
    3. Extracts content blocks for each section.

    Args:
        url (str): The website URL.
        sections (List[str]): The list of HTML sections to process.
        provider (str): The AI provider for content extraction.
        api_token (str): The API token for authentication.
        base_url (Optional[str]): The base URL for the API. Defaults to None.

    Returns:
        List[dict]: The list of extracted content blocks from all sections.
    zgroq/r$  r  N)	r  rS   r+  r   r  r   submitr   r   )
r   r<  r  r  r  extracted_contentsectionexecutorfuturesfutures
             rE   process_sectionsrD    s    , 7# 	G$$sGXy8T JJsO		"   ! 	:X
  (	  "C(IPX   G  'w/ :!((9:	: 	: s   C "B>1C>CCc                 0   g }|j                         }|rrd}|rZ| j                  d||d   z   |      d   |k  r:||j                  d      dz   z  }|r!| j                  d||d   z   |      d   |k  r:|j                  |       |rrdj	                  |      S )a3  
    Wrap text to fit within a specified width for rendering.

    How it works:
    1. Splits the text into words.
    2. Constructs lines that fit within the maximum width using the provided font.
    3. Returns the wrapped text as a single string.

    Args:
        draw (ImageDraw.Draw): The drawing context for measuring text size.
        text (str): The text to wrap.
        font (ImageFont.FreeTypeFont): The font to use for measuring text size.
        max_width (int): The maximum width for each line.

    Returns:
        str: The wrapped text.
    rK  )r   r   r   )fontr   rP   rH  )rR   textbboxpoprU   rY   )drawr   rF  	max_widthlineswordsr_  s          rE   	wrap_textrM  C  s    ( EJJLE
dmmFD58O$mGJiWEIIaL3&&D dmmFD58O$mGJiW 	T  99Ur   c                 :    t        | d      }|j                         S )aO  
    Prettify an HTML string using BeautifulSoup.

    How it works:
    1. Parses the HTML string with BeautifulSoup.
    2. Formats the HTML with proper indentation.
    3. Returns the prettified HTML string.

    Args:
        html_string (str): The HTML string to format.

    Returns:
        str: The prettified HTML string.
    zlxml.parser)r   r  )html_stringr  s     rE   format_htmlrP  c  s      m4D==?r   c                 B   d}d}g }| j                  dd      j                  dd      j                  d      }|D ]  }|j                         s|j                  d      r|d	z  }|j	                  ||z  |z          B|j                  d      r)|j                  d
      r|j	                  ||z  |z          ||j                  d      r|j	                  ||z  |z          |d	z  }|j                         }|s|j	                  ||z  |z           dj                  |      S )z
    A fast HTML formatter that uses string operations instead of parsing.

    Args:
        html_string (str): The HTML string to format

    Returns:
        str: The formatted HTML string
    r   rJ  r  z>
r  z
<rH  z</r
   z/>)r:   rR   r   r  rU   r=   rY   )rO  indent
indent_str	formattedpartspartr   s          rE   fast_format_htmlrW  w  s$    FJI U+33C?EEdKE @zz| ??4 aKFZ&0478 __S!dmmD&9Z&0478 __S!Z&0478aKF jjlG  f!4w!>?-@0 99Yr   c                     ddl m}m}  ||      }|j                  r|j                  st        d|       |j                  j                         dvrt        d|       | j                         } |||      S )*Normalize URLs to ensure consistent formatr   )r   r,   Invalid base URL format: )r   https)urllib.parser   r,   r   r   r  rL  r   )r+  r  r   r,   parsed_basecleaned_hrefs         rE   normalize_urlr_    sz    . 8$K[%7%74XJ?@@  "*;;4XJ?@@::<L 8\**r   )drop_query_tracking
sort_querykeep_fragmentextra_drop_paramspreserve_httpsoriginal_schemer+  r  c                   | syt        || j                               }|ru|dk(  rpt        |      }	t        |      }
|	j                  dk(  rK|	j                  |
j                  k(  r2| j                         j                  d      s|j                  ddd      }t        |      }|j                  j                         }|j                  }|j                  d      r|dk7  r|j                  d      }|j                  }|rt        |d	
      D cg c]  \  }}|j                         |f }}}|rBh d}|r!||D ch c]  }|j                          c}z  }|D cg c]  \  }}||vs||f }}}|r|j                  d        |rt        |d	      nd}|r|j                  nd}t!        |j                  |||j"                  ||f      }|S c c}}w c c}w c c}}w )u  
    Extended URL normalizer

    Parameters
    ----------
    href : str
        The raw link extracted from a page.
    base_url : str
        The page’s canonical URL (used to resolve relative links).
    drop_query_tracking : bool (default True)
        Remove common tracking query parameters.
    sort_query : bool (default True)
        Alphabetically sort query keys for deterministic output.
    keep_fragment : bool (default False)
        Preserve the hash fragment (#section) if you need in-page links.
    extra_drop_params : Iterable[str] | None
        Additional query keys to strip (case-insensitive).

    Returns
    -------
    str | None
        A clean, canonical URL or None if href is empty/None.
    Nr[  r   //http://https://r
   r  T)keep_blank_values>	   refgclidfbclidref_srcutm_term
utm_medium
utm_sourceutm_contentutm_campaignc                     | d   S )Nr   r   )kvs    rE   r  znormalize_url.<locals>.<lambda>
	  s
    r!u r   r  doseqrK  )r   r   r,   r   r   r  r:   rL  r9   r=   rstripqueryr.   sortr/   fragmentr-   params)r+  r  r`  ra  rb  rc  rd  re  full_urlparsed_fullr]  r   r   r9   ry  kvr|  default_trackingr  r{  
normalizeds                         rE   r_  r_    s   D  x.H /W4x(x( &(+"4"44

''-''	:qAH hF ]]  "F ;;D}}Sdck{{3 LLE-6uPT-UVTQ1779a.VV  ! 8I$J1QWWY$JJ )/MA1<L3Lq!fMFMKK,K-17	&-R #0vRH  J ; W %KMs   G7GG%Gc                 d   ddl m}m}m}m}m} | sy ||| j                               }	|ro|dk(  rj ||	      }
 ||      }|
j                  dk(  rK|
j                  |j                  k(  r2| j                         j                  d      s|	j                  ddd	      }	 ||	      }|j                  j                         }d
}|j                  }|r) ||      }g d}|D ]
  }||v s||=  |r
 ||d      nd
} ||j                  ||j                  j                  d      |j                  ||f      }|S )rY  r   )r   r,   r-   parse_qsr/   Nr[  r   rg  rh  ri  r
   rK  )rq  rp  rs  rk  rm  Trv  r  )r\  r   r,   r-   r  r/   r   r   r   r  r:   rL  ry  r9   rx  r|  )r+  r  rd  re  r   r,   r-   r  r/   r}  r~  r]  r   r   r{  ry  r|  tracking_paramsparamr  s                       rE   normalize_url_for_deep_crawlr  	  sG   OO  x.H /W4x(x( &(+"4"44

''-''	:qAH hF ]]  "F H LLE% X$ 	"E5M	"
 28	&-R 3 J r   r  )maxsizec                 
   ddl m} | sy ||| j                               }|ru|dk(  rpt        |      }t        |      }|j                  dk(  rK|j
                  |j
                  k(  r2| j                         j                  d      s|j                  ddd	      }t        |      }t        |j                  |j
                  j                         |j                  j                  d
      |j                  |j                  df      }	|	S )z/Efficient URL normalization with proper parsingr   r   Nr[  r   rg  rh  ri  r
   r  rK  )r\  r   r   r,   r   r   r  r:   r-   rL  r9   rx  r|  ry  )
r+  r  rd  re  r   r}  r~  r]  r   r  s
             rE   &efficient_normalize_url_for_deep_crawlr  X	  s     % x.H /W4x(x( &(+"4"44

''-''	:qAH hF
 3
 J r   c                     	 |j                  d      }|d   }|d   }h d}t         fd|D              r j	                         S  j                  d      r|   S  j                  d      r|   S  j                  d      r	| d|   S  j                  d	      s j                  d
       | d| d  S  j	                         S # t        $ r t        d|       w xY w)rY  r  r   r   rZ     ftp:data:file:tel:mailto:javascript:c              3   \   K   | ]#  }j                         j                  |       % y wr  rL  r  )rS  protor+  s     rE   rU  z$normalize_url_tmp.<locals>.<genexpr>	  s"     
Ie4::<""5)
I   ),r  rg  )rh  ri  z./)rR   
IndexErrorr  r  r   r  lstrip)r+  r  
base_partsprotocolr   special_protocolss   `     rE   normalize_url_tmpr  	  s   A^^C(
a=A
 U

I7H
IIzz| sD6"" tD6"" s2fXdV,, ??23{{4 2fXQtf--::<5  A4XJ?@@As   C
 
C"c                    ddl m} 	  ||       }t        |      }g }g }t	               }|j                  d      D ]  }|j                  dd      j                         }	|	r|	j                  d      r7t        |	|      }
|
r|
|v rJ|j                  |
       |j                         xs dj                         dd	 }|
|d
}t        |
|      r|j                  |       |j                  |        ||dS # t        $ r g g dcY S w xY w)a8  
    Fast link extraction for prefetch mode.
    Only extracts <a href> tags - no media, no cleaning, no heavy processing.

    Args:
        html: Raw HTML string
        base_url: Base URL for resolving relative links

    Returns:
        {"internal": [{"href": "...", "text": "..."}], "external": [...]}
    r   )r  r'  z
//a[@href]r+  rK  )r  r  r  r  Nr   r,  )	lxml.htmlr  r   get_base_domainr  r  r   r   r  r  addr  is_external_urlrU   )r$   r  r  r]   base_domainr(  r)  seenr*  r+  r  r   r  s                rE   quick_extract_linksr  	  s    .0!$' "(+K%'H%'HUDYY|$ 'uuVR &&(t'NO 2$A
Z4/  &B--/5'6	:{3OOI&OOI&''* !h77;  0B//0s   C5 5DDc                 `   	 t        |       j                  j                         }|sy|j                  d      d   }t	        j
                  dd|      }|j                  d      }t        |      dkD  r|d   dv rdj                  |d	d
       S dj                  |dd
       S # t        $ r Y yw xY w)  
    Extract the base domain from a given URL, handling common edge cases.

    How it works:
    1. Parses the URL to extract the domain.
    2. Removes the port number and 'www' prefix.
    3. Handles special domains (e.g., 'co.uk') to extract the correct base.

    Args:
        url (str): The URL to extract the base domain from.

    Returns:
        str: The extracted base domain or an empty string if parsing fails.
    rK  r  r   z^www\.r  r   r7   >   acadaeafagcocomedugovrq   milnetorgN)	r,   r   rL  rR   r;   r  rQ   rY   r   )r   r   rU  s      rE   r  r  	  s    !#%%++- c"1% 	2v. S!u:>eBi ,
 
 88E"#J''xxbc
## s   %B! A$B! B! !	B-,B-r  c                 >    h d}t         fd|D              ry	 t               }|j                  sy|j                  j                         j	                  dd      }|j                         j	                  dd      }|j                  |       S # t        $ r Y yw xY w)r  r  c              3   \   K   | ]#  }j                         j                  |       % y wr  r  )rS  r  r   s     rE   rU  z"is_external_url.<locals>.<genexpr>
  s"     
6399;!!!$
6r  TFzwww.rK  )r  r,   r   rL  r:   r=   r   )r   r  specialr   
url_domainbases   `     rE   r  r  
  s     KG

6g
66#}} ]]((*2262>
  "**626 &&t,,, s   B AB 	BBr^   c                     h d}h d}| D cg c]M  }t        |      dkD  r=||vr9||vr5|j                  d      s$|j                  d      s|j                  d      s|O c}S c c}w )u  
    Clean a list of tokens by removing noise, stop words, and short tokens.

    How it works:
    1. Defines a set of noise words and stop words.
    2. Filters tokens based on length and exclusion criteria.
    3. Excludes tokens starting with certain symbols (e.g., "↑", "▲").

    Args:
        tokens (list[str]): The list of tokens to clean.

    Returns:
        list[str]: The cleaned list of tokens.
    >      ⬆️r*  anatbyinofontoupccpthe   ↑   ▲>   n'tcan'twon'tmustn'tcouldn'twouldn't	shouldn'tr*  r+  amr  asr  ber  doher  isitmemynor  r  orsor  r  uswerV  andr  arebutcandidfewforhadhasherhimhishowitsmaynornotoffouroutsher  waswhowhyyetyoubeenbothdoesdowneachfromhavehersintominemoremostmustnearnoneoursoverpastsomesuchthatthemtheythisuponwerewhatwhenwhomwillwithyouraboutaboveafteralongamongbeingbelowcoulddoingmightothershallsincetheirthesethoseunderuntilwherewhichwhosewouldyoursacrossaroundbeforebehindbesidebeyondcannotduringexcepthavinginsideitselfmyselfshouldtheirstowardunlesswithinagainstbecausebeneathbetweenherselfhimselfoutsidethroughalthoughyourself	ourselves
themselves
underneathr   r  r  u   ⬆)rQ   r  )r^   noise
STOP_WORDStokens       rE   clean_tokensrO  )
  sy    "E"tJr 	u:>#  '  '  ' 		 	 	s   AA"c                 .     t                fd       }|S )a  
    Decorator to profile a function's execution time and performance.

    How it works:
    1. Records the start time before executing the function.
    2. Profiles the function's execution using `cProfile`.
    3. Prints the elapsed time and profiling statistics.

    Args:
        func (Callable): The function to decorate.

    Returns:
        Callable: The decorated function with profiling and timing enabled.
    c                 l   t        j                         }t        j                         }|j	                           | g|i |}|j                          t        j                         |z
  }t        d|dd       t        j                  |      }|j                  d       |j                  d       |S )Nz![PROFILER] Scraping completed in z.2fz seconds
cumulative   )r   perf_countercProfileProfileenabledisabler  pstatsStats
sort_statsprint_stats)	rB   argsru  
start_timeprofilerr   elapsed_timestatsfuncs	           rE   wrapperz!profile_and_time.<locals>.wrapper  s     &&(
 ##% d,T,V, 	 ((*Z7 	1,s1C8LM X&&"r   r!   )rb  rc  s   ` rE   profile_and_timerd    s"      4[ 6 Nr   r   c                 d    t        j                  | j                               j                         S )z"Generate a unique hash for content)xxhashxxh64r   r   )r   s    rE   generate_content_hashrh  =  s!    <<()3355r   	head_htmlc                    | sy| j                         }g }t        j                  d|t        j                        }|r.|j	                  |j                  d      j                                g d}|D ]  \  }}d| dt        j                  |       dd| dt        j                  |       d	g}|D ]J  }t        j                  ||      }	|	s|j	                  |	j                  d      j                                   |syd
j                  |      }
t        j                  |
j                               j                         S )a3  
    Compute a fingerprint of <head> content for cache validation.

    Focuses on content that typically changes when page updates:
    - <title>
    - <meta name="description">
    - <meta property="og:title|og:description|og:image|og:updated_time">
    - <meta property="article:modified_time">
    - <meta name="last-modified">

    Uses xxhash for speed, combines multiple signals into a single hash.

    Args:
        head_html: The HTML content of the <head> section

    Returns:
        A hex string fingerprint, or empty string if no signals found
    rK  z<title[^>]*>(.*?)</title>r
   ))r]  r  )r]  zlast-modified)r  zog:title)r  zog:description)r  zog:image)r  zog:updated_time)r  zarticle:modified_timez
<meta[^>]*z=["\']z%["\'][^>]*content=["\']([^"\']*)["\']z*<meta[^>]*content=["\']([^"\']*)["\'][^>]*z["\']r  )rL  r;   r  r  rU   r  r   r<   rY   rf  rg  r   r   )ri  
head_lowersignalstitle_match	meta_tags	attr_type
attr_valuepatternsrD   r?   combineds              rE   compute_head_fingerprintrs  C  s3   & "JG ))8*biiPK{((+1134I "+ 
	: )FBIIj,A*BBgh9)F299U_K`Jaafg
   	GIIgz2Eu{{1~3356		
  xx H<<)*4466r   	base_pathc                     ddddddd}i }|j                         D ]A  \  }}t        j                  j                  | |      }t        j                  |d       |||<   C |S )	z.Create content directories if they don't existhtml_contentrg  markdown_contentr?  screenshots)r$   cleanedrf  	extractedrx  
screenshotTr   )itemsr   r9   rY   r   )rt  dirscontent_pathsr  dirnamer9   s         rE   ensure_content_dirsr    sq     !&($#D M

 "Www||Iw/
D4(!c"
 r   c                      t        j                         dk(  r(t        j                  t        j                                yy)a  
    Configure the Windows event loop to use ProactorEventLoop.
    This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses.

    This function should only be called on Windows systems and before any async operations.
    On non-Windows systems, this function does nothing.

    Example:
        ```python
        from crawl4ai.async_configs import configure_windows_event_loop

        # Call this before any async operations if you're on Windows
        configure_windows_event_loop()
        ```
    rt  N)r  r  r  set_event_loop_policyWindowsProactorEventLoopPolicyr   r   rE   configure_windows_event_loopr    s/      I%%%g&L&L&NO &r   context_linesc                    ddl }ddl}ddl}|j                  | d         }|d   }|j                  }|j
                  }|j                  }	t        d||z
        }
||z   dz   }g }t        |
|      D ]J  }|j                  ||      }|s|j                         }||k(  rdnd}|j                  |dd| d|        L d	j                  |      }	 |j                  j                  |      }|||	|d
S # t        $ r |}Y w xY w)a  
    Extract error context with more reliable line number tracking.

    Args:
        exc_info: The exception info from sys.exc_info()
        context_lines: Number of lines to show before and after the error

    Returns:
        dict: Error context information
    r   Nr   r  r
   r  rP   4drH  )rC   line_nofunctioncode_context)	traceback	linecacher   
extract_tbrC   linenor]  rr   rs   getlinerx  rU   rY   r9   relpathr  )exc_infor  r  r  r   tb
last_framerC   r  	func_namecontext_startcontext_endr+  r_  pointerr  rel_paths                    rE   get_error_contextr    s/     
		hqk	*B BJ""HGI 7]23MM)A-K M=+. =  1-;;=D G|eG  Ab67)1TF!;<= 99]+L77??8, $	 	  s   C0 0C>=C>c                 2    t        |       |kD  r| d | dz   S | S )N...r  )value	thresholds     rE   truncater    s%    
5zIZi 5((Lr   c                 .   t         j                  j                  |       }|j                         D ]  }t	        |j
                        D ](  }t        |j
                  |   |      |j
                  |<   * |j                  r3t        |j                        |kD  rt        |j                  |      |_        |j                  st        |j                        |kD  st        |j                  |      |_	         t         j                  j                  |dd      S )NunicodeF)encodingpretty_print)r  r$   
fromstringiterr  attribr  r   rQ   tailtostring)html_strr  root_elementattrs        rE   optimize_htmlr    s    99)DIIK ?) 	OD$,X__T-BI$NHOOD!	O ==S/);$X]]I>HM ==S/);$X]]I>HM? 99dYUKKr   c                   R    e Zd Zedd       Zedd       Zedefd       ZdefdZy)	HeadPeekrc                 (  K   dddd}	 t        j                  |      4 d {   }|j                  | |d       d {   }|j                  | k7  r0t	        |j                        } |j                  | |       d {   }d	}|j                         2 3 d {   }||z  }d
|v s |j                  d
      d   d
z   cd d d       d {    S 7 7 7 V7 =6 27 # 1 d {  7  sw Y   y xY w# t         j                  t        f$ r Y y w xY ww)Nz&Mozilla/5.0 (compatible; CrawlBot/1.0)z	text/htmlclose)z
User-AgentAccept
Connectionr   T)r  follow_redirects)r  r   s   </head>r   )	httpxAsyncClientr   r   rp   aiter_bytesrR   	HTTPErrorr   )r   r   r  clientr   r   r~   s          rE   fetch_head_sectionzHeadPeekr.fetch_head_section  s(     C!!

	((9 A AV!'CSW!XX <<3&hll+C%+ZZWZ%EEH#+#7#7#9  %u$G!W,}}Z03j@A A AX
  F#9A A A A * 		s   DC3 CC3 C C>C?C CCCC
C)C C3 CC3 DC3 CCCCC3 C0$C'%C0,C3 /D0C3 3DDDDc                 x   K   t         j                  | |       d {   }|r|j                  dd      S y 7 w)Nr  rs  r  r  )r  r  r  )r   r   head_sections      rE   	peek_htmlzHeadPeekr.peek_html  s@     &99#w9OO&&wx&@@ Ps   :8:head_contentc                 j   i }d}t        j                  ||       D ]  }|j                  d      }t        j                  d|      }t        j                  d|      }t        j                  d|      }|sY|s|s^|r|j                  d      n|j                  d      }|j                  d      ||<    |S )Nz<meta[^>]+>r   zname=["\'](.*?)["\']zproperty=["\'](.*?)["\']zcontent=["\'](.*?)["\']r
   )r;   finditerr  r  )	r  rn  meta_patternmeta_tagr  
name_matchproperty_matchcontent_matchr  s	            rE   extract_meta_tagszHeadPeekr.extract_meta_tags  s    	 &L,? 
	8H..#C #:C@JYY'BCHNII&@#FM*-7j&&q)^=Q=QRS=T!.!4!4Q!7	#
	8 r   c                     t        j                  d| t         j                  t         j                  z        }|r|j	                  d      S d S )Nz<title>(.*?)</title>r
   )r;   r  
IGNORECASEr  r  )r  rm  s     rE   	get_titlezHeadPeekr.get_title,  s=    ii 7r}}WYW`W`G`a'2{  #<<r   N)333333?)	r   r   r   staticmethodr  r  rp   r  r  r   r   rE   r  r    sO     0     &= =r   r  c                    	 t        j                  dd      }t        j                  | |      }|j	                  d      }|D ]2  }|j                         |j                         j                  |       4 g d}|D ]O  }	|j	                  d|	       }
|
D ]4  } |j
                          |j
                         j                  |       6 Q |j                         D ]i  } |j
                         h d}g }t        |j                  j                               D ]v  }||v s-|j                  d	      s|j                  j                  |       4||vs9t        |j                  |         |kD  sU|j                  |   d| d
z   |j                  |<   x |j                  rKt        |j                  j                               |kD  r%|j                  j                         d| d
z   |_        |j                   st        |j                   j                               |kD  sE|j                   j                         d| d
z   |_        l i }t        |j	                  d            D ]  }|j                         }||j#                  d      }|s*t%        j&                         }|j)                         D ]  }|j+                  |        |j,                  ||j/                         f}||v r||j                  |       d||<    t        j0                  |dd      }t        |      |kD  r|d| d
z   S |S # t2        $ r}t        |       |kD  r| d| n| cY d}~S d}~ww xY w)a  
    Preprocess HTML to reduce size while preserving structure for schema generation.
    
    Args:
        html_content (str): Raw HTML content
        text_threshold (int): Maximum length for text nodes before truncation
        attr_value_threshold (int): Maximum length for attribute values before truncation
        max_size (int): Target maximum size for output HTML
        
    Returns:
        str: Preprocessed HTML content
    T)remove_commentsremove_blank_text)r   r  N)r.  r/  r2  iframecanvassvgr;  r<  r  trackmaparearg  >   idr]  r2  r  r  zdata-r  z//*[@class]r  r  r$   )r  method)r#   
HTMLParserlhtmlr  r  	getparentremover  r  r  r  r  rH  rQ   r   r   r  r   rf  rg  itertextr  r  	intdigestr  r   )rv  text_thresholdattr_value_thresholdmax_sizer   treehead_elementsr  rO  r  elementsr   attribs_to_keepattributes_hates_truncater  r  rx  r  clsr|  txtsigr   r  s                           rE   preprocess_html_for_schemar  0  s>   BY!!$$OV< 

8,! 	.D~~+ ''-	.

 " 	8CzzBse*-H# 8$7$$&2%G%%'..w78	8 yy{ 	MG w  "* GO
 )+% w~~2245 c/1V5F5Fw5ONN&&v.#<<W^^TZE[A\_sAs-4^^F-CDYEY-Z]b-bGNN6*c ||GLL$6$6$8 9N J&||113O^DuL ||GLL$6$6$8 9N J&||113O^DuL?	MD #%tzz-01 	!B\\^F~&&/C A{{} 663.C d{v1b! S	'	!x yH v;!)8$u,, Y*-l*;h*F|IX&LXYsL   AL9 AL9 ,B+L9 L9 4BL9 =&L9 %DL9 7L9 9	M!MM!M!c                     	 ddl } ddl m} ddlm}m} ddl}ddl}ddl	}d|j                  d<    |j                  g d      } |j                  d	        |j                  d
g      } |j                  g d      }	 |j                  g d      }
 |j                  d	       |j                  d      } | || ddd             y# t
        $ r t        d      w xY w)zm
    Start virtual display server in Google Colab.
    Raises error if not running in Colab environment.
    r   N)r  )IFramedisplayz6This function must be run in Google Colab environment.:99DISPLAY)Xvfbr  z-screen01280x720x24r   fluxbox)	x11vncz-displayr  z-nopwz-foreverz-sharedz-rfbport5900z-quiet)z/opt/novnc/utils/websockify/run6080zlocalhost:5900z--webz
/opt/novncz#google.colab.kernel.proxyPort(6080)z'/vnc.html?autoconnect=true&resize=scalerp  i   )r3  r  )google.colabr  IPython.displayr  r  ImportErrorr  r   r   r  environPopenr  eval_js)googler  r  r  r   r   r  xvfbr  r  novncr   s               rE   start_colab_display_serverr    s    U'3  !BJJy :JKDDJJqM j	{+G Z < =F J 5 6E DJJqM ..>
?CFcUAB$WZ[\;  USTTUs   C Cc                  Z    ddl m}   |        }t        d       |j                  ddd       y)z8
    Alternative setup using IPython magic commands
    r   )get_ipythonu7   🚀 Setting up Crawl4AI environment in Google Colab...bashrK  ux  
set -e

echo "📦 Installing system dependencies..."
apt-get update -y
apt-get install -y xvfb x11vnc fluxbox websockify git

echo "📥 Setting up virtual display..."
git clone https://github.com/novnc/noVNC         /opt/novnc
git clone https://github.com/novnc/websockify    /opt/novnc/utils/websockify

pip install -q nest_asyncio google-colab
echo "✅ Setup complete!"
N)IPythonr   r  run_cell_magic)r   ipythons     rE   setup_colab_environmentr    s0     $mG	
CD 62 ( r   
page_titleheadlines_textmeta_descriptionc                 p   t               |xs d|xs dddd	 ddlm}  ||      }|j                  j	                         d<   t        fddD              d	<   | xs dd
z   d   z   d
z   d   z   j	                         }t        d |j                         D              d<   S # t        $ r Y S w xY w)a  
    Extract page context for link scoring - called ONCE per page for performance.
    Parser-agnostic function that takes pre-extracted data.
    
    Args:
        page_title: Title of the page
        headlines_text: Combined text from h1, h2, h3 elements
        meta_description: Meta description content
        base_url: Base URL of the page
        
    Returns:
        Dictionary containing page context data for fast link scoring
    rK  F)terms	headlinesr  r   is_docs_siter   )r,   r   c              3   ,   K   | ]  }|d    v   yw)r   Nr   )rS  	indicatorcontexts     rE   rU  z'extract_page_context.<locals>.<genexpr>#  s%      &d(1 '0783D&D &ds   )zdocs.zapi.z
developer.z
reference.r  rP   r  r  c              3   x   K   | ]2  }t        |j                  d             dkD  r|j                  d        4 ywz.,!?;:"()[]{}r   N)rQ   r   rS  r*  s     rE   rU  z'extract_page_context.<locals>.<genexpr>)  s9      F!% #DJJ$? @1 D  $zz/: Fs   8:r
  )r  r\  r,   r   rL  r  rR   r   )r  r  r  r  r,   r   all_textr  s          @rE   extract_page_contextr    s     #)r,2G)(#"MM//1 #& &d5b&d #d  %2,w{/CCcIGTfLggnnp F)1)9F F N	  N	s   BB( (	B54B5	link_text
title_attr
class_attrrel_attrpage_contextc                     d}	 |r!t        |j                               dkD  r|dz  }|xs dj                         t        fddD              r|dz  }t        fdd	D              r|dz  }|xs dj                         t        fd
dD              r|dz  }t        fddD              r|dz  }|j                         t        fddD              r|dz  }nt        fddD              r|dz  }t        fddD              r|dz  }|j	                  d      dz
  }|dk  r|dz  }n
|dkD  r|dz  }|j                  d      r|dz  } rk j                         }t        |      dkD  r|dz  }t        |j                               }	|	dk\  r|dz  }|	dk\  r|dz  }g d}
|j                         |
v r|dz  }|j                  d      rZ rXt        d  j                         D              }|r6t        ||d   z        }|dkD  r |t        t        |      d      z  }||dz  z  }|j                  d d!      r rt         fd"d#D              r|dz  }t        dt        |d$            S # t        $ r d}Y #w xY w)%a  
    Ultra-fast link quality scoring using only provided data (no DOM access needed).
    Parser-agnostic function.
    
    Args:
        link_text: Text content of the link
        url: Link URL
        title_attr: Title attribute of the link
        class_attr: Class attribute of the link
        rel_attr: Rel attribute of the link
        page_context: Pre-computed page context from extract_page_context()
        
    Returns:
        Quality score (0.0 - 10.0), higher is better
    rN      rO   rK  c              3   &   K   | ]  }|v  
 y wr  r   )rS  	nav_class	class_strs     rE   rU  z1calculate_link_intrinsic_score.<locals>.<genexpr>T  s     g)yI%g   )navmenuprimarymain	importantg      ?c              3   &   K   | ]  }|v  
 y wr  r   )rS  	bad_classr  s     rE   rU  z1calculate_link_intrinsic_score.<locals>.<genexpr>W  s     e)yI%er  )r  sponsorr  promobannerc              3   &   K   | ]  }|v  
 y wr  r   )rS  good_relrel_strs     rE   rU  z1calculate_link_intrinsic_score.<locals>.<genexpr>\  s     \xx7"\r  )	canonicalnextprevchapterc              3   &   K   | ]  }|v  
 y wr  r   )rS  bad_relr,  s     rE   rU  z1calculate_link_intrinsic_score.<locals>.<genexpr>^  s     Rgw'!Rr  )nofollow	sponsoredugcr  c              3   &   K   | ]  }|v  
 y wr  r   )rS  	good_path	url_lowers     rE   rU  z1calculate_link_intrinsic_score.<locals>.<genexpr>e  s     )yI%r  )z/docs/z/api/z/guide/z
/tutorial/z/reference/z/manual/g       @c              3   &   K   | ]  }|v  
 y wr  r   )rS  medium_pathr8  s     rE   rU  z1calculate_link_intrinsic_score.<locals>.<genexpr>g  s     ik	)ir  )z/blog/z	/article/z/post/z/news/c              3   &   K   | ]  }|v  
 y wr  r   )rS  bad_pathr8  s     rE   rU  z1calculate_link_intrinsic_score.<locals>.<genexpr>k  s     zx9$zr  )z/admin/z/login/z/cart/z
/checkout/z/track/z/click/r  r      ri  rI  )z
click herez	read morez	more infor0  herer
  c              3      K   | ]@  }t        |j                  d             dkD  r!|j                  d       j                          B ywr  )rQ   r   rL  r  s     rE   rU  z1calculate_link_intrinsic_score.<locals>.<genexpr>  sB      D#!$**_"=>B "ZZ8>>@ Ds   AAr   
   r  Fc              3   B   K   | ]  }|j                         v   y wr  )rL  )rS  doc_keywordr  s     rE   rU  z1calculate_link_intrinsic_score.<locals>.<genexpr>  s'      !h#. "-	0A!A !hs   )api	referenceguidetutorialexample      $@)rQ   r   rL  r  rx   r  rR   r   r  rh  r   rr   )r  r   r  r  r  r  r  	url_depth
text_cleanrK  generic_texts
link_wordsrI   relevance_ratior  r,  r8  s   `             @@@rE   calculate_link_intrinsic_scorerN  4  s   . ET#j..01A5SLE%2,,.	g7fggSLEe7deeSLE>r((*\3[\\SLER1QRRSLE IIK	 7~SLEi=hiiSLE z5yzzSLE IIcNQ&	>SLE]SLE >>*%SLE "*J:" Z--/0JQQ UM!]2 G$ D'0'8D DJ j<+@@AQ;&-C
OR0H&HO_s22E NE2S !h2f!h h sCt$%%  s   II- -I;:I;intrinsic_scorecontextual_scorescore_links_enabledquery_providedc                     |sy| | nd}||nd}|r|t        dt        |d            S t        |dz  d      }|dz  |dz  z   }t        dt        |d            S )a  
    Calculate combined total score from intrinsic and contextual scores with smart fallbacks.
    
    Args:
        intrinsic_score: Quality score based on URL structure, text, and context (0-10)
        contextual_score: BM25 relevance score based on query and head content (0-1 typically)
        score_links_enabled: Whether link scoring is enabled
        query_provided: Whether a query was provided for contextual scoring
        
    Returns:
        Combined total score (0-10 scale)
        
    Scoring Logic:
        - No scoring: return 5.0 (neutral score)
        - Only intrinsic: return normalized intrinsic score
        - Only contextual: return contextual score scaled to 10
        - Both: weighted combination (70% intrinsic, 30% contextual scaled)
    g      @rN   rH  gffffff?r  )rr   rh  )rO  rP  rQ  rR  	intrinsic
contextualcontextual_scaledtotals           rE   calculate_total_scorerX    s    2  $3#>CI%5%A!sJ -53It,-- J-t4 _!2S!89EsCt$%%r   texts
llm_config
model_name
batch_sizec                   K   ddl }| s |j                  g       S |ddlm} |j	                  dd      }|j	                  d|j	                  d            }|| |j	                  d|j	                  d	            d
}|r||d<   |rd|vrd| |d<    |di | d{   }	g }
|	j
                  D ]  }|
j                  |d            |j                  |
      S 	 ddlm} t        t        d      si t        _        |t        j                  vr ||      t        j                  |<   t        j                  |   }|j                  | |dd      }
|
S 7 # t        $ r t        d      w xY ww)ah  
    Compute embeddings for a list of texts using specified model.
    
    Args:
        texts: List of texts to embed
        llm_config: Optional LLM configuration for API-based embeddings
        model_name: Model name (used when llm_config is None)
        batch_size: Batch size for processing
        
    Returns:
        numpy array of embeddings
    r   N)
aembeddingr  ztext-embedding-3-smallr  api_baser  r  )r  r  r  zopenai/r  	embedding)SentenceTransformerzsentence-transformers is required for local embeddings. Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers_modelsFT)r\  show_progress_barconvert_to_numpyr   )numpyr   r  r^  r   r  rU   sentence_transformersra  r  hasattrget_text_embeddingsrb  r   )rY  rZ  r[  r\  npr^  embedding_modelr_  ru  r   
embeddingsitemra  encoders                 rE   rh  rh    s    $ rxx| & %..5MN>>*jnnZ.HI %!~~k:>>)3LM
 !)F: 	8 ''89F7O $-f-- 
MM 	1Dd;/0	1 rxx
##
	A *I6*,'08886I*6U''
3%--j9 ^^!#!	 $ 

 K .  	l 	s+   BE*E=E*E  A1E*E''E*c                 L    ddl }t        j                  t        | |||            S )z+Synchronous wrapper for get_text_embeddingsr   N)re  r  runrh  )rY  rZ  r[  r\  ri  s        rE   get_text_embeddings_syncrp  .  s"     ;;*5*j*UVVr   vec1vec2c                     ddl } |j                  | |      }|j                  j                  |       |j                  j                  |      z  }|dk7  rt	        ||z        S dS )z/Calculate cosine similarity between two vectorsr   NrN   )re  dotlinalgnormfloat)rq  rr  ri  dot_productnorm_products        rE   cosine_similarityrz  9  sW    "&&t$K99>>$'"))..*>>L0<0A5|+,JsJr   c                      dt        | |      z
  S )z>Calculate cosine distance (1 - similarity) between two vectorsr
   )rz  )rq  rr  s     rE   cosine_distancer|  A  s     t,,,r   c                  f   t        j                         } t        j                         dk(  ra	 t	        j
                  dgdd      }|j                  j                  d      }d}i }|D ]  }d|v r/t        |j                         d   j                  d	            |d
<   6d|v r/t        |j                         d   j                  d	            |d<   id|v r/t        |j                         d   j                  d	            |d<   d|v st        |j                         d   j                  d	            |d<    |j                  d
d      |j                  dd      z   |j                  dd      z   |j                  dd      z   }||z  dz  }|S | j                  dz  S #  | j                  dz  cY S xY w)zDGet truly available memory including inactive pages (cross-platform)rq  vm_statT)capture_outputr   rH  i @  zPages free:r  r  freezPages inactive:inactivezPages speculative:speculativezPages purgeable:	purgeabler   re  )psutilvirtual_memoryr  r  r  ro  stdoutrR   rq   rx  r   	available)vmr   rK  	page_sizepagesr_  total_available_pagesavailable_gbs           rE   get_true_available_memory_gbr  H  s   				 BH$	,^^YK4PFMM''-EIE K D($'

R(8(?(?(D$EE&M&$.(+DJJL,<,C,CC,H(IE*%)T1+.tzz|B/?/F/Fs/K+LE-('4/),TZZ\"-=-D-DS-I)JE+&K 		&!$		*a()		-+, 		+q)* " 2I='JL ||w''	,<<7++s   CF 	BF F0c                      t        j                         } | j                  dz  }t               }d||z
  z  |z  }t	        dt        d|            S )z
    Get memory usage percentage that accounts for platform differences.
    
    Returns:
        float: Memory usage percentage (0-100)
    re  g      Y@rN   )r  r  rW  r  rr   rh  r  total_gbr  used_percents       rE   get_true_memory_usage_percentr  r  sT     
			 Bxx7#H/1L H|34x?L sC|,--r   c                  z    t        j                         } | j                  dz  }t               }t	               }|||fS )z
    Get comprehensive memory statistics.
    
    Returns:
        Tuple[float, float, float]: (used_percent, available_gb, total_gb)
    re  )r  r  rW  r  r  r  s       rE   get_memory_statsr    s=     
			 Bxx7#H/1L02Lx//r   hooksc           	      4   i }| j                         D ]Z  \  }}t        |      st        d| dt        |             	 t	        j
                  |      }t        j                  |      }|||<   \ |S # t        t        f$ r}t        d| d|       d}~ww xY w)a  
    Convert hook function objects to string representations for Docker API.

    This utility simplifies the process of using hooks with the Docker API by converting
    Python function objects into the string format required by the API.

    Args:
        hooks: Dictionary mapping hook point names to Python function objects.
               Functions should be async and follow hook signature requirements.

    Returns:
        Dictionary mapping hook point names to string representations of the functions.

    Example:
        >>> async def my_hook(page, context, **kwargs):
        ...     await page.set_viewport_size({"width": 1920, "height": 1080})
        ...     return page
        >>>
        >>> hooks_dict = {"on_page_context_created": my_hook}
        >>> api_hooks = hooks_to_string(hooks_dict)
        >>> # api_hooks is now ready to use with Docker API

    Raises:
        ValueError: If a hook is not callable or source cannot be extracted
    zHook 'z#' must be a callable function, got z%Cannot extract source code for hook 'zK'. Make sure the function is defined in a file (not interactively). Error: N)
r|  callabler  r2  inspect	getsourcerM  dedentr  	TypeError)r  r   	hook_name	hook_funcr  r  s         rE   hooks_to_stringr    s    4 F % 	9	"vi[0STXYbTcSdeff
	&&y1F__V,F &F9  M # 	7	{ C[[\Z]_ 	s   /A22BBB)g      ?N)r   rO   N)rC  x   TF)Fr  )FNr   r  r   )zgroq/llama3-70b-8192N)FN)r=  )r   )d   r   i )NNFF)Nz&sentence-transformers/all-MiniLM-L6-v2    )r   concurrent.futuresr   r   bs4r   r   r   r   r	   r  r$   r  r;   r   r  r  promptsr   r   r   r   configr   r   r   r   r   r  socketr   pathlibr   typingr   r   r   r   r   r   r   r   r\  r   r  requests.exceptionsr    rf  rM  rU  rY  	functoolsr"   r  r#   r  r   r   urllib.robotparserr%   r   r&   	packagingr'   rK  r(   r)   	itertoolsr*   collectionsr+   r  re  ri  r,   r-   r.   r/   r0   r1   r  r2   
applies_torA   rF   rp   rq   rw  ri   r   r   r   r   r   	bytearrayr'  r)  r  r0  r>   rc  rl  rg  r   r  r  r  r  r  r  r$  r}  r  r  rs  r  r  r  r  r  r+  r4  r;  rD  rM  rP  rW  r_  r  r  r  r  r  r  rO  rd  rh  rs  r  r  r  r  r  r  r  r  r  dictr  rN  rX  ndarrayrh  rp  rz  r|  r  r  r  r  r   r   rE   <module>r     s    ? E E    	 	   *  1 G  G    R R R    -       %   .             ( 	)) . )  "6:T$}T$T$ T$ 	T$
 #S	!123T$ sD$T$r !>:
3->:>: >: 	>:
 >: 
#Y>:B8 82|Z |Z~	i 	 
 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDE 	 	 	
	 	 	 	 	 	 	 	 	 	 	  	 !"	 #$	   	
          	
       	
             !" #$ %& '( )* +, -. /0 12 34 56 78   	
      	
             !" #$ %& '( )* +, -. /0 12 34 56 78  !  !  	
!  !$ 	cCi% 
,! !c !J MM
M M 	M
 M 	M`-*36l23S 3j(*-Z2E E E$<3B %7T{VB !3	_	_
_ _ 	_ 
#s(^_DSlPf:J M~ Nb (84RV 8v>BB GK*	**(+*8;*	*Z@(+ \+0 b
bbJ8t 5$ $N!H-8c -8S -8T#tDcN?S:S5T -8`0 0 0f 3 4 Dbc btCy bJ,^63 63 6:7 :7 :7z3 4S> (P(6s 6p
L$7= 7=rOYb']V6*S *# *QT *`c *hl *Zp&p&	p& p& 	p&
 p& p& p&h (,(, % 	-&e_-&uo-& -& 	-&
 -&f "&>	S9SS S 	S
 ZZSp "&>	W9WW W 	W
 ZZWKBJJ Kbjj KU K-"** -BJJ -5 -'(e '(T.u .$0%ue 34 0 ,4X. ,4S> ,r   