
    /id                       d dl mZmZ d dlZd dlmZmZmZmZm	Z	m
Z
mZ d dlmZmZ d dlZd dlZd dlmZmZ ddlmZmZmZmZmZ ddlmZmZmZmZmZ dd	l  dd
l m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' dd	l( ddl(m)Z) dd	l* ddl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1 d dl2m3Z3 d dl4Z5d dl6Z6d dl7m8Z8 d dl9m:Z:m;Z;  G d de      Z< G d de<      Z= G d de<      Z> G d de<      Z? G d de<      Z@ G d de@      ZA G d de@      ZB G d d e@      ZC G d! d"e@      ZD	  eEeF eGd#            D  ci c]  } | d$vs| d% eH|       d& c} ZI e6j                  d'      ZK e6j                  d(      ZLd)eeMeMf   d*eeMeMf   fd+ZN G d, d-e<      ZOyc c} w ).    )ABCabstractmethodN)AnyListDictOptionalTuplePatternUnion)ThreadPoolExecutoras_completed)IntFlagauto   )PROMPT_EXTRACT_BLOCKS&PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION&PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTIONJSON_SCHEMA_BUILDER_XPATHPROMPT_EXTRACT_INFERRED_SCHEMA)DEFAULT_PROVIDERDEFAULT_PROVIDER_API_KEYCHUNK_TOKEN_THRESHOLDOVERLAP_RATEWORD_TOKEN_RATE)*)sanitize_htmlescape_json_stringperform_completion_with_backoffextract_xml_datasplit_and_parse_json_objectssanitize_input_encodemerge_chunks)
TokenUsage)
get_deviceload_HF_embedding_modelload_text_multilabel_classifiercalculate_batch_size)	LLMConfigcreate_llm_config)partialBeautifulSoup)htmletreec            
           e Zd ZdZddefdZedededeeee	f      fd       Z
dedee   deeee	f      fd	Zdedee   deeee	f      fd
Zy)ExtractionStrategyz<
    Abstract base class for all extraction strategies.
    input_formatc                     || _         d| _        | j                  j                  | _        |j                  dd      | _        y)a  
        Initialize the extraction strategy.

        Args:
            input_format: Content format to use for extraction.
                         Options: "markdown" (default), "html", "fit_markdown"
            **kwargs: Additional keyword arguments
        z<|DEL|>verboseFN)r1   DEL	__class____name__namegetr3   )selfr1   kwargss      X/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/crawl4ai/extraction_strategy.py__init__zExtractionStrategy.__init__6   s6     )NN++	zz)U3    urlr-   returnc                      y)z
        Extract meaningful blocks or chunks from the given HTML.

        :param url: The URL of the webpage.
        :param html: The HTML content of the webpage.
        :return: A list of extracted blocks or chunks.
        N r9   r>   r-   qr:   s        r;   extractzExtractionStrategy.extractD   s     	r=   sectionsc           	         g }t               5 }|D cg c]!  } |j                  | j                  ||fi |# }}t        |      D ]!  }	|j	                  |	j                                # 	 ddd       |S c c}w # 1 sw Y   |S xY w)z
        Process sections of text in parallel by default.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to process.
        :return: A list of processed JSON blocks.
        N)r   submitrD   r   extendresult)
r9   r>   rE   rC   r:   extracted_contentexecutorsectionfuturesfutures
             r;   runzExtractionStrategy.runO   s     ! 	:X  (  c7EfEG  'w/ :!((9:	: ! 	: ! s   A9&A41A94A99Bc                 j   K   ddl } |j                  | j                  ||g|i | d{   S 7 w)a{  
        Async version: Process sections of text in parallel using asyncio.

        Default implementation runs the sync version in a thread pool.
        Subclasses can override this for true async processing.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to process.
        :return: A list of processed JSON blocks.
        r   N)asyncio	to_threadrO   )r9   r>   rE   rC   r:   rQ   s         r;   arunzExtractionStrategy.aruna   s6      	&W&&txxhMMfMMMMs   *313N)markdown)r6   
__module____qualname____doc__strr<   r   r   r   r   rD   rO   rS   rA   r=   r;   r0   r0   1   s    4S 4 3 c Dc3h<P  !s !d3i !$tCQTH~BV !$Nc NT#Y NdSVX[S[nI] Nr=   r0   c            	       ^    e Zd ZdZdededeeeef      fdZdedee   deeeef      fdZ	y)	NoExtractionStrategyz
    A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
    r>   r-   r?   c                     d|dgS )zJ
        Extract meaningful blocks or chunks from the given HTML.
        r   )indexcontentrA   rB   s        r;   rD   zNoExtractionStrategy.extractu   s     -..r=   rE   c                 R    t        |      D cg c]  \  }}|g |d c}}S c c}}w )Nr\   tagsr]   )	enumerate)r9   r>   rE   rC   r:   irL   s          r;   rO   zNoExtractionStrategy.run{   s4     (1
7 8
 	
 
s   #N)
r6   rU   rV   rW   rX   r   r   r   rD   rO   rA   r=   r;   rZ   rZ   p   sZ    /3 /c /Dc3h<P /
s 
d3i 
$tCQTH~BV 
r=   rZ   c            
           e Zd ZdZ	 	 	 	 	 	 	 d fd	Z	 ddee   dededee   fdZ	 ddee   fd	Z	ddee   fd
Z
deeee   f   deeee   f   fdZdededeeeef      fdZdedee   deeeef      fdZ xZS )CosineStrategya  
    Extract meaningful blocks or chunks from the given HTML using cosine similarity.

    How it works:
    1. Pre-filter documents using embeddings and semantic_filter.
    2. Perform clustering using cosine similarity.
    3. Organize texts by their cluster labels, retaining order.
    4. Filter clusters by word count.
    5. Extract meaningful blocks or chunks from the filtered clusters.

    Attributes:
        semantic_filter (str): A keyword filter for document filtering.
        word_count_threshold (int): Minimum number of words per cluster.
        max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
        linkage_method (str): The linkage method for hierarchical clustering.
        top_k (int): Number of top categories to extract.
        model_name (str): The name of the sentence-transformers model.
        sim_threshold (float): The similarity threshold for clustering.
    c                    t        |   di | ddl}	|| _        || _        || _        || _        || _        || _        t        j                         | _
        |j                  dd      | _         |	j                  g       | _        d| _        t!               | _        t%        | j"                        | _        | j                  r#t)        d| j"                  j*                   d       t-        |      \  | _        | _        | j0                  j3                  | j"                         | j0                  j5                          d| _         |	j                  g       | _        | j                  r#t)        d	| j"                  j*                   d       t7               \  | _        }
| j                  r>t)        d
| dt;        t        j                         | j                  z
        z   dz          yy)a  
        Initialize the strategy with clustering parameters.

        Args:
            semantic_filter (str): A keyword filter for document filtering.
            word_count_threshold (int): Minimum number of words per cluster.
            max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
            linkage_method (str): The linkage method for hierarchical clustering.
            top_k (int): Number of top categories to extract.
        r   Nr3   Fdirectz#[LOG] Loading Extraction Model for z device.batchz([LOG] Loading Multilabel Classifier for z[LOG] Model loaded z, models/reuters, took  secondsrA   )superr<   numpysemantic_filterword_count_thresholdmax_distlinkage_methodtop_ksim_thresholdtimetimerr8   r3   arraybuffer_embeddingsget_embedding_methodr$   devicer'   default_batch_sizeprinttyper%   	tokenizermodeltoevalr&   nlprX   )r9   rk   rl   rm   rn   ro   
model_namerp   r:   np_r5   s              r;   r<   zCosineStrategy.__init__   s   * 	"6".$8! ,
*YY[
zz)U3!)"$,! l #7t{{"C<<78H8H7IRS &=Z%H"


dkk"

$+!!)" <<<T[[=M=M<NhWX57! <<%j\1HIdiikDJJ./0 r=   	documentsrk   
at_least_kr?   c                 `   |s|S t        |      |k  rt        |      dz  }ddlm} | j                  |g      d   }| j                  |      } ||g|      j	                         }t        ||      D 	cg c]  \  }}	|	| j                  k\  r||	f }
}}	t        |
      |k  rct        ||      D 	cg c]  \  }}	|	| j                  k  r||	f }}}	|j                  d d       |
j                  |d|t        |
      z
          |
D cg c]  \  }}|	 }
}}|
d| S c c}	}w c c}	}w c c}}w )a  
        Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.

        Args:
            documents (List[str]): A list of document texts.
            semantic_filter (str): A keyword filter for document filtering.
            at_least_k (int): The minimum number of documents to return.

        Returns:
            List[str]: A list of filtered and sorted document texts.
           r   )cosine_similarityc                     | d   S )Nr   rA   )xs    r;   <lambda>z<CosineStrategy.filter_documents_embeddings.<locals>.<lambda>"  s
    ad r=   T)keyreverseN)	lensklearn.metrics.pairwiser   get_embeddingsflattenziprp   sortrH   )r9   r   rk   r   r   query_embeddingdocument_embeddingssimilaritiesdocsimfiltered_docsremaining_docsr   s                r;   filter_documents_embeddingsz*CosineStrategy.filter_documents_embeddings   sg    y>J&Y1,J> --.?@C #11)< )2

') 	  	<8
Sd((( #J
 
 }
* !$I| <C+++ c
N 
 NDA  0Q*s=?Q2Q!RS ,99a99[j))'
 :s   4D1D$
D*	sentencesc           	         | j                   j                  dv r"ddl}|| j                  }g }t	        dt        |      |      D ]  }||||z    }| j                  |ddd      }|j                         D 	
ci c]!  \  }	}
|	|
j                  | j                         # }}	}
|j                         5   | j                  d
i |}ddd       j                  j                  d      j                         j                         }|j                  |        t!        j"                  |      | _        | j$                  S | j                   j                  d	k(  ro|| j                  }g }t	        dt        |      |      D ],  }||||z    }| j                  |      }|j                  |       . t!        j"                  |      | _        | j$                  S c c}
}	w # 1 sw Y   xY w)z
        Get BERT embeddings for a list of sentences.

        Args:
            sentences (List[str]): A list of text chunks (sentences).

        Returns:
            NumPy array of embeddings.
        )cpugpucudampsr   NTpt)padding
truncationreturn_tensorsr   )dimr   rA   )rv   ry   torchrw   ranger   rz   itemsr|   no_gradr{   last_hidden_statemeanr   rj   appendr   vstackrt   )r9   r   
batch_sizebypass_bufferr   all_embeddingsrb   batch_sentencesencoded_inputr   tensormodel_output
embeddingss                r;   r   zCosineStrategy.get_embeddings*  s    ;;<< !!44
N1c)nj9 2"+AJ"? $#TdSW !/ ! DQCVCVCX!4?CC4;;//! !
 ]]_ ?#-4::#>#>L? *;;@@Q@GKKMSSU
%%j12" &(YY~%>D" %%% [[&!!44
N1c)nj9 2"+AJ"?!ZZ8
%%j12
 &(YY~%>D"%%%3!
? ?s   7&G0GG 	c                     ddl m}m} ddlm} t        j
                         | _        | j                  |d      } ||d      } ||| j                        } ||| j                  d	      }|S )
z
        Perform hierarchical clustering on sentences and return cluster labels.

        Args:
            sentences (List[str]): A list of text chunks (sentences).

        Returns:
            NumPy array of cluster labels.
        r   )linkagefcluster)pdistT)r   cosine)methoddistance)	criterion)
scipy.cluster.hierarchyr   r   scipy.spatial.distancer   rq   rr   r   rn   rm   )	r9   r   r   r   r   r   distance_matrixlinkedlabelss	            r;   hierarchical_clusteringz&CosineStrategy.hierarchical_clusteringa  sc     	>0YY[
(($(G
  
H51D1DE&$--:Fr=   clustersc                     i }|j                         D ]D  \  }}dj                  |      }t        |j                               }|| j                  k\  s@|||<   F |S )a  
        Filter clusters to remove those with a word count below the threshold.

        Args:
            clusters (Dict[int, List[str]]): Dictionary of clusters.

        Returns:
            Dict[int, List[str]]: Filtered dictionary of clusters.
         )r   joinr   splitrl   )r9   r   filtered_clusters
cluster_idtexts	full_text
word_counts          r;   filter_clusters_by_word_countz,CosineStrategy.filter_clusters_by_word_countz  sf     !)!1 	6JIY__./J T66605!*-	6 ! r=   r>   r-   c           	      V   t        j                          } |j                  | j                        }| j                  || j                        }|sg S | j                  |      }t        j                          }i }t        |      D ])  \  }	}
|j                  |
g       j                  ||	          + | j                  |      }t        |      D cg c]#  }t        |      g dj                  ||         d% }}| j                  rt        d| j                          | j                  j                   dv r;| j#                  |D cg c]  }|d   	 c}      }t%        ||      D ]
  \  }}
|
|d<    | j                  r%t        dt        j                          |z
  dd	       |S c c}w c c}w )
a  
        Extract clusters from HTML content using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            html (str): The HTML content of the webpage.

        Returns:
            List[Dict[str, Any]]: A list of processed JSON blocks.
        r   r_   u   [LOG] 🚀 Assign tags using )r   r   r   r   r]   r`   u"   [LOG] 🚀 Categorization done in z.2frh   )rq   r   r4   r   rk   r   ra   
setdefaultr   r   sortedintr   r3   rx   rv   ry   r~   r   )r9   r>   r-   rC   r:   ttext_chunksr   r   r\   labelr   idxcluster_listclusters                  r;   rD   zCosineStrategy.extract  s    IIK djj* 66--
 I --k: IIK%f- 	FLE5r*11+e2DE	F !>>xH
 /0
 #hsxx@QRU@V7WX
 

 <<1$++?@;;<<XXNgwy1NOF"%lF"; ("'(& <<6tyy{Qs6K8TUC
 Os   (F!F&rE   c                 \     | j                   || j                  j                  |      fi |S )z
        Process sections using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): List of sections (strings) to process.

        Returns:
        )rD   r4   r   )r9   r>   rE   rC   r:   s        r;   rO   zCosineStrategy.run  s)     t||Cx!8CFCCr=   )N
   g?ward   z&sentence-transformers/all-MiniLM-L6-v2g333333?)   )NFN)r6   rU   rV   rW   r<   r   rX   r   r   r   r   r   r   r   rD   rO   __classcell__r5   s   @r;   rd   rd      s   , ;Tn MO6*c6*586*FI6*	c6*r DI5&c5&nc 2!S$s)^,!	c49n	!2F3 Fc FDc3h<P FPDs Dd3i D$tCQTH~BV Dr=   rd   c                   @    e Zd ZdZdddddZddddeeedd	d
d
edddfddde	de
de	de	dee	   de	de	f fdZ fdZde	dede	dee
e	ef      fdZdee	   fdZde	dee	   dee
e	ef      fdZde	dede	dee
e	ef      fdZde	dee	   dee
e	ef      fdZd!d Z xZS )"LLMExtractionStrategya  
    A strategy that uses an LLM to extract meaningful content from the HTML.

    Attributes:
        llm_config: The LLM configuration object.
        instruction: The instruction to use for the LLM model.
        schema: Pydantic model schema for structured data.
        extraction_type: "block" or "schema".
        chunk_token_threshold: Maximum tokens per chunk.
        overlap_rate: Overlap between chunks.
        word_token_rate: Word to token conversion rate.
        apply_chunking: Whether to apply chunking.
        verbose: Whether to print verbose output.
        usages: List of individual token usages.
        total_usage: Accumulated token usage.
    1Instead, use llm_config=LLMConfig(provider="...")2Instead, use llm_config=LlMConfig(api_token="...")z1Instead, use llm_config=LLMConfig(base_url="..."))provider	api_tokenbase_urlapi_baseNblockTrT   F
llm_configr(   instructionschemar1   r   r   r   r   c                    t        |   dd|	i| || _        | j                  s7t        t        t
        j                  j                  t                    | _        || _	        || _
        || _        |rd| _
        |
| _        |xs t        | _        || _        || _        || _        |j                  di       | _        | j"                  sd| _        || _        g | _        t+               | _        || _        || _        || _        || _        y)a  
        Initialize the strategy with clustering parameters.

        Args:
            llm_config: The LLM configuration object.
            instruction: The instruction to use for the LLM model.
            schema: Pydantic model schema for structured data.
            extraction_type: "block" or "schema".
            chunk_token_threshold: Maximum tokens per chunk.
            overlap_rate: Overlap between chunks.
            word_token_rate: Word to token conversion rate.
            apply_chunking: Whether to apply chunking.
            input_format: Content format to use for extraction.
                            Options: "markdown" (default), "html", "fit_markdown"
            force_json_response: Whether to force a JSON response from the LLM.
            verbose: Whether to print verbose output.

            # Deprecated arguments, will be removed very soon
            provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
            api_token: The API token for the provider.
            base_url: The base URL for the API request.
            api_base: The base URL for the API request.
            extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
        r1   r   r   r   
extra_argsg    eANrA   )ri   r<   r   r)   r   osenvironr8   r   r   extract_typer   force_json_responser   chunk_token_thresholdoverlap_rateword_token_rateapply_chunkingr   r3   usagesr#   total_usager   r   r   r   )r9   r   r   r   extraction_typer   r   r   r   r1   r   r3   r   r   r   r   r:   r5   s                    r;   r<   zLLMExtractionStrategy.__init__  s    X 	>|>v>$/)**..)ABDO '+ (D#6 %:%S>S"(., **\26""),D&%< "  r=   c                     t        j                  | j                        }|j                  }|| j                  v r/|||   j
                  urt        d| d| j                  |          t        | !  ||       y)zHandle attribute setting.	Setting '' is deprecated. N)	inspect	signaturer<   
parameters_UNWANTED_PROPSdefaultAttributeErrorri   __setattr__)r9   r7   valuesig
all_paramsr5   s        r;   r  z!LLMExtractionStrategy.__setattr__N  sw     .^^
4'''ED9I9Q9Q,Q 9TF2CDDXDXY]D^C_!`aaD%(r=   r>   ixr-   r?   c                 	   | j                   rt        d| d|        |t        t        |            d}t        }| j
                  r| j
                  |d<   t        }| j                  dk(  r6| j                  r*t        j                  | j                  d      |d<   t        }| j                  dk(  r| j                  st        }|D ]  }|j                  d	|z   d
z   ||         } 	 t        | j                  j                   || j                  j"                  | j                  j$                  | j&                  | j(                  | j                  j*                  | j                  j,                  | j                  j.                  	      }t1        |j2                  j4                  |j2                  j6                  |j2                  j8                  |j2                  j:                  r |j2                  j:                  j<                  ni |j2                  j>                  r |j2                  j>                  j<                  ni       }| j@                  jC                  |       | jD                  xj4                  |j4                  z  c_        | jD                  xj6                  |j6                  z  c_        | jD                  xj8                  |j8                  z  c_        	 |jF                  d   jH                  jJ                  }	d}
| j&                  rt        jL                  |	      }
tO        |
tP              rYtS        |
      dk(  rGtO        tU        |
jW                               d   tT              rtU        |
jW                               d   }
n<|
g}
n8tO        |
tT              r(|
}
n%tY        dg|	      d   }
t        jL                  |
      }
|
D ]  }d|d<   	 	 | j                   rt        dtS        |
      d|d|       |
S # tZ        $ rM t]        |jF                  d   jH                  jJ                        \  }}|}
|r|
jC                  dddg|d       Y |w xY w# tZ        $ r6}| j                   rt        d|        |ddgt_        |      dgcY d}~S d}~ww xY w)a  
        Extract meaningful blocks or chunks from the given HTML using an LLM.

        How it works:
        1. Construct a prompt with variables.
        2. Make a request to the LLM using the prompt.
        3. Parse the response and extract blocks or chunks.

        Args:
            url: The URL of the webpage.
            ix: Index of the block.
            html: The HTML content of the webpage.

        Returns:
            A list of extracted blocks or chunks.
        [LOG] Call LLM for  - block index: URLHTMLREQUESTr   r   indentSCHEMA{}r   json_responser   
base_delaymax_attemptsexponential_factorcompletion_tokensprompt_tokenstotal_tokenscompletion_tokens_detailsprompt_tokens_detailsr   Nr   blocksFerrorTr\   r   r`   r]   [LOG] Extractedblocks from URL:block index:[LOG] Error in LLM extraction: )0r3   rx   r   r   r   r   r   r   r   jsondumpsr   r   replacer   r   r   r   r   r   r   backoff_base_delaybackoff_max_attemptsbackoff_exponential_factorr#   usager  r  r  r  __dict__r  r   r   r   choicesmessager]   loads
isinstancedictr   listvaluesr   	Exceptionr    rX   )r9   r>   r  r-   variable_valuesprompt_with_variablesvariableresponser,  r]   r  r   parsedunparsedes                  r;   rD   zLLMExtractionStrategy.extractY  s   " <<'u,<RDAB &}T':;

 !6)-)9)9OI&$J!(T[[(,

4;;q(IOH%$J!($B!' 	H$9$A$Ah$oh&?%!	
U	6((%))11"66????==!__AA#'??#M#M
H "*.."B"B&nn::%^^88>>;; +3..*R*R*[*[>>77 '/nn&J&J&S&S
E KKu% ..%2I2II.**e.A.AA*))U-?-??)"**1-55==++!ZZ0F!&$/v;!+
4;PQR;SUY0Z%)&--/%:1%=F '-XF#FD1!' .xj'B8LF!ZZ/F# +E%*E'N+ ||%K&" M'  #?$$Q'//77$   MM"#dWIRZ[(  	||7s;<  !$I"1v	 		sE   %G.Q C3O/ 'Q /AQQ QQ 	R+R<RRc                 8    t        |||| j                        }|S )z[
        Merge documents into sections based on chunk_token_threshold and overlap.
        )docstarget_sizeoverlapword_token_ratio)r"   r   )r9   r   r   r@  rE   s        r;   _mergezLLMExtractionStrategy._merge  s'     !.!11	
 r=   rE   c                 ~   | j                  || j                  t        | j                  | j                  z              }g }| j                  j
                  j                  d      rat        |      D ]Q  \  }}t        | j                  |      }|j                   ||t        |                   t        j                  d       S |S t        d      5 }t        | j                  |      }t        |      D cg c]!  \  }}|j                  ||t        |            # }	}}t!        |	      D ]"  }
	 |j                  |
j#                                $ 	 ddd       |S c c}}w # t$        $ rD}| j&                  rt)        d|        |j+                  ddd	gt-        |      d
       Y d}~~d}~ww xY w# 1 sw Y   |S xY w)a7  
        Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.

        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.

        Returns:
            A list of extracted blocks or chunks.
        r@  zgroq/g      ?   )max_workerszError in thread execution: r   Tr   r!  N)rB  r   r   r   r   r   
startswithra   r*   rD   rH   r!   rq   sleepr   rG   r   rI   r5  r3   rx   r   rX   )r9   r>   rE   merged_sectionsrJ   r  rL   extract_funcrK   rM   rN   r<  s               r;   rO   zLLMExtractionStrategy.run  s    ++&&22T5F5FFG & 

 ??##..w7(9  G&t||S9!(( %:7%CD 

3 F ! / $2 h&t||S9 (1'A#G OOL"6KG6TU 
 +73 F)00A. ! + % <<!$?s"CD)00)*)-)0	+.q6	 	. ! sH   %F28&EF20E"F2F2"	F/+:F*%F2*F//F22F<c                 8	  K   ddl m} | j                  rt        d| d|        |t	        t        |            d}t        }| j                  r| j                  |d<   t        }| j                  dk(  r6| j                  r*t        j                  | j                  d	      |d
<   t        }| j                  dk(  r| j                  st        }|D ]  }|j                  d|z   dz   ||         } 	  || j                   j"                  || j                   j$                  | j                   j&                  | j(                  | j*                  | j                   j,                  | j                   j.                  | j                   j0                  	       d{   }t3        |j4                  j6                  |j4                  j8                  |j4                  j:                  |j4                  j<                  r |j4                  j<                  j>                  ni |j4                  j@                  r |j4                  j@                  j>                  ni       }	| jB                  jE                  |	       | jF                  xj6                  |	j6                  z  c_        | jF                  xj8                  |	j8                  z  c_        | jF                  xj:                  |	j:                  z  c_        	 |jH                  d   jJ                  jL                  }
d}| j(                  rt        jN                  |
      }tQ        |tR              rYtU        |      dk(  rGtQ        tW        |jY                               d   tV              rtW        |jY                               d   }n<|g}n8tQ        |tV              r(|}n%t[        dg|
      d   }t        jN                  |      }|D ]  }d|d<   	 	 | j                  rt        dtU        |      d|d|       |S 7 o# t\        $ rM t_        |jH                  d   jJ                  jL                        \  }}|}|r|jE                  dddg|d       Y w xY w# t\        $ r6}| j                  rt        d|        |ddgta        |      dgcY d}~S d}~ww xY ww)a  
        Async version: Extract meaningful blocks or chunks from the given HTML using an LLM.

        How it works:
        1. Construct a prompt with variables.
        2. Make an async request to the LLM using the prompt.
        3. Parse the response and extract blocks or chunks.

        Args:
            url: The URL of the webpage.
            ix: Index of the block.
            html: The HTML content of the webpage.

        Returns:
            A list of extracted blocks or chunks.
        r    aperform_completion_with_backoffr	  r
  r  r  r   r   r  r  r  r  r  Nr  r   r  Fr   Tr!  r"  r#  r$  r%  )1utilsrM  r3   rx   r   r   r   r   r   r   r   r&  r'  r   r   r(  r   r   r   r   r   r   r)  r*  r+  r#   r,  r  r  r  r  r-  r  r   r   r   r.  r/  r]   r0  r1  r2  r   r3  r4  r   r5  r    rX   )r9   r>   r  r-   rM  r6  r7  r8  r9  r,  r]   r  r   r:  r;  r<  s                   r;   aextractzLLMExtractionStrategy.aextract   s    " 	<<<'u,<RDAB &}T':;

 !6)-)9)9OI&$J!(T[[(,

4;;q(IOH%$J!($B!' 	H$9$A$Ah$oh&?%!	
P	=((%))11"66????==!__AA#'??#M#M
 
H "*.."B"B&nn::%^^88>>;; +3..*R*R*[*[>>77 '/nn&J&J&S&S
E KKu% ..%2I2II.**e.A.AA*))U-?-??)"**1-55==++!ZZ0F!&$/v;!+
4;PQR;SUY0Z%)&--/%:1%=F&,XF#FD1!'-xj'B8LF!ZZ/F# +E%*E'N+ ||%K&" MI
b  #?$$Q'//77$   MM"#dWIRZ[(  
	||7s;<  !$I"1v	 
	sp   C*R-B Q O<EQ !C3O? 'Q ;R<Q ?AQQ QQ 	R!+RRRRRc                   K   ddl }| j                  || j                  t        | j                  | j                  z              }g }t        |      D cg c]!  \  }}| j                  ||t        |            # }}} |j                  |ddi d{   }	|	D ]^  }
t        |
t              r;| j                  rt        d|
        |j                  dddgt        |
      d       N|j                  |
       ` |S c c}}w 7 ow)	a  
        Async version: Process sections with true parallelism using asyncio.gather.

        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.

        Returns:
            A list of extracted blocks or chunks.
        r   NrD  return_exceptionsTzError in async extraction: r   r!  )rQ   rB  r   r   r   ra   rO  r!   gatherr1  r5  r3   rx   r   rX   rH   )r9   r>   rE   rQ   rI  rJ   r  rL   tasksresultsrI   s              r;   rS   zLLMExtractionStrategy.arun  s     	++&&22T5F5FFG & 
 
  )9
G MM#r#8#AB
 
 'FFF  	1F&),<<7x@A!((!"!%!(	#&v;	 "((0	1 ! 1
 Gs   AD&C=;DDA0Dc                 D   t        d       t        ddddd       t        d       t        ddd| j                  j                  d	       t        d
dd| j                  j                  d	       t        ddd| j                  j                  d	       t        d       t        ddddddd
dddd       t        d       t        | j                  d      D ]=  \  }}t        |dd|j                  d	d|j                  d	d|j                  d	       ? y)zHPrint a detailed token usage report showing total and per-request usage.z
=== Token Usage Summary ===Typez<15r   Countz>12z------------------------------
Completionz>12,PromptTotalz
=== Usage History ===z	Request #z<10z0------------------------------------------------r   N)rx   r   r  r  r  ra   r   )r9   rb   r,  s      r;   
show_usagez LLMExtractionStrategy.show_usage  s/   -.Agc]+,hc"!D$4$4$F$Ft#LMN#a 0 0 > >tDEFQt//<<TBCD'(S!<"4Ahs^1WSMRSh!$++q1 	HAuS'52248%:M:Md9SSTUZUgUghlTmn	r=   )r?   N)r6   rU   rV   rW   r   r   r   r   r   rX   r   r   r<   r  r   r   r   rD   rB  rO   rO  rS   r[  r   r   s   @r;   r   r     s   " MNLL	
O #'3!'&!(#'#G!G! G! 	G! G! G! C=G!  !G!" #G!T	)@3 @C @s @tDcN7K @D
49 
7!s 7!d3i 7!Dc3h4H 7!r|# |3 |c |d4S>>R ||.!c .!T#Y .!4S#X;O .!`r=   r   c                       e Zd ZdZdZdeeef   f fdZdedede	eeef      fdZ
edefd	       Zed
efd       Zed
efd       Zd Zd Zd Zd Zd Zd Zdede	e   de	eeef      fdZedefd       Zedefd       Zedefd       ZdddZed)dededed edef
d!       Zed"dd e       ddfdededed ed#d$d%ed&edefd'       Ze	 	 	 	 d*dededed ed#d$defd(       Z xZ S )+JsonElementExtractionStrategya  
    Abstract base class for extracting structured JSON from HTML content.

    How it works:
    1. Parses HTML content using the `_parse_html` method.
    2. Uses a schema to define base selectors, fields, and transformations.
    3. Extracts data hierarchically, supporting nested fields and lists.
    4. Handles computed fields with expressions or functions.

    Attributes:
        DEL (str): Delimiter used to combine HTML sections. Defaults to '
'.
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
        _extract_item(element, fields): Extracts fields from a single element.
        _extract_single_field(element, field): Extracts a single field based on its type.
        _apply_transform(value, transform): Applies a transformation to a value.
        _compute_field(item, field): Computes a field value using an expression or function.
        run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.

    Abstract Methods:
        _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
        _get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
        _get_elements(element, selector): Retrieves child elements using a selector.
        _get_element_text(element): Extracts text content from an element.
        _get_element_html(element): Extracts raw HTML from an element.
        _get_element_attribute(element, attribute): Extracts an attribute's value from an element.
    
r   c                 `    t        |   di | || _        |j                  dd      | _        y)z
        Initialize the JSON element extraction strategy with a schema.

        Args:
            schema (Dict[str, Any]): The schema defining the extraction rules.
        r3   FNrA   )ri   r<   r   r8   r3   r9   r   r:   r5   s      r;   r<   z&JsonElementExtractionStrategy.__init__  s.     	"6"zz)U3r=   r>   html_contentr?   c                    | j                  |      }| j                  || j                  d         }g }|D ]  }i }	d| j                  v r1| j                  d   D ]  }
| j                  ||
      }|||	|
d   <   ! | j	                  || j                  d         }|	j                  |       |	sw|j                  |	        |S )a  
        Extract structured data from HTML content.

        How it works:
        1. Parses the HTML content using the `_parse_html` method.
        2. Identifies base elements using the schema's base selector.
        3. Extracts fields from each base element using `_extract_item`.

        Args:
            url (str): The URL of the page being processed.
            html_content (str): The raw HTML content to parse and extract.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
        baseSelector
baseFieldsr7   fields)_parse_html_get_base_elementsr   _extract_single_field_extract_itemupdater   )r9   r>   ra  rC   r:   parsed_htmlbase_elementsrT  elementitemfieldr  
field_datas                r;   rD   z%JsonElementExtractionStrategy.extract  s    * &&|4//^4
 $ 	%GDt{{*![[6 4E 66wFE(.3U6]+4 ++GT[[5JKJKK
#t$	%  r=   c                      y)z*Parse HTML content into appropriate formatNrA   r9   ra  s     r;   rf  z)JsonElementExtractionStrategy._parse_html=       	r=   selectorc                      y)(Get all base elements using the selectorNrA   r9   rk  rt  s      r;   rg  z0JsonElementExtractionStrategy._get_base_elementsB  rs  r=   c                      y)z%Get child elements using the selectorNrA   r9   rm  rt  s      r;   _get_elementsz+JsonElementExtractionStrategy._get_elementsG  rs  r=   c           	      Z   	 |d   dk(  r7| j                  ||d         }|r|d   nd }|r| j                  ||d         S i S |d   dk(  r8| j                  ||d         }|D cg c]  }| j                  ||d          c}S |d   dk(  r8| j                  ||d         }|D cg c]  }| j                  ||d          c}S | j                  ||      S c c}w c c}w # t        $ rD}| j
                  rt        d|d	    d
t        |              |j                  d      cY d }~S d }~ww xY w)Nry   nestedrt  r   re  r3  nested_listzError extracting field r7   : r  )	rz  ri  _extract_list_itemrh  r5  r3   rx   rX   r8   )r9   rm  ro  nested_elementsnested_elementelementselr<  s           r;   _extract_fieldz,JsonElementExtractionStrategy._extract_fieldL  sS   	(V}("&"4"4WeJ>O"P7F!3D & &&~uXG  V}&--guZ7HIOWX//E(ODXXV}---guZ7HIJRSB**2uX?SS--gu== Y T  	(||/fbQIJ99Y''	(sR   <C C !C "C>C !C "C>C C 
C 	D*&9D%D*%D*c                    d|v r.| j                  ||d         }|s|j                  d      S |d   }n|}d}|d   dk(  r| j                  |      }n|d   dk(  r| j                  ||d         }na|d   dk(  r| j	                  |      }nG|d   d	k(  r?| j                  |      }t        j                  |d
   |      }|r|j                  d      nd}d|v r| j                  ||d         }||S |j                  d      S )a  
        Extract a single field based on its type.

        How it works:
        1. Selects the target element using the field's selector.
        2. Extracts the field value based on its type (e.g., text, attribute, regex).
        3. Applies transformations if defined in the schema.

        Args:
            element: The base element to extract the field from.
            field (Dict[str, Any]): The field definition in the schema.

        Returns:
            Any: The extracted field value.
        rt  r  r   Nry   text	attributer-   regexpatternr   	transform)	rz  r8   _get_element_text_get_element_attribute_get_element_htmlresearchgroup_apply_transform)r9   rm  ro  selectedr  r  matchs          r;   rh  z3JsonElementExtractionStrategy._extract_single_fielde  s   " ))'53DEHyy++{HH=F"**84E6]k)//%:LME6]f$**84E6]g%))(3DIIeI.5E&+EKKNE%))%{1CDE)uCuyy/CCr=   c                 R    i }|D ]  }| j                  ||      }||||d   <   ! |S )Nr7   )rh  r9   rm  re  rn  ro  r  s         r;   r  z0JsonElementExtractionStrategy._extract_list_item  sC     	,E..w>E &+U6]#	, r=   c                     i }|D ]:  }|d   dk(  r| j                  ||      }n| j                  ||      }|3|||d   <   < |S )a   
        Extracts fields from a given element.

        How it works:
        1. Iterates through the fields defined in the schema.
        2. Handles computed, single, and nested field types.
        3. Updates the item dictionary with extracted field values.

        Args:
            element: The base element to extract fields from.
            fields (List[Dict[str, Any]]): The list of fields to extract.

        Returns:
            Dict[str, Any]: A dictionary representing the extracted item.
        ry   computedr7   )_compute_fieldr  r  s         r;   ri  z+JsonElementExtractionStrategy._extract_item  sc    "  	,EV}
*++D%8++GU; &+U6]#	, r=   c                     |dk(  r|j                         S |dk(  r|j                         S |dk(  r|j                         S |S )a  
        Apply a transformation to a value.

        How it works:
        1. Checks the transformation type (e.g., `lowercase`, `strip`).
        2. Applies the transformation to the value.
        3. Returns the transformed value.

        Args:
            value (str): The value to transform.
            transform (str): The type of transformation to apply.

        Returns:
            str: The transformed value.
        	lowercase	uppercasestrip)lowerupperr  )r9   r  r  s      r;   r  z.JsonElementExtractionStrategy._apply_transform  sE    " #;;= +%;;= '!;;= r=   c           	          	 d|v rt        |d   i |      S d|v r |d   |      S y # t        $ rD}| j                  rt        d|d    dt	        |              |j                  d      cY d }~S d }~ww xY w)N
expressionfunctionzError computing field r7   r~  r  )r}   r5  r3   rx   rX   r8   )r9   rn  ro  r<  s       r;   r  z,JsonElementExtractionStrategy._compute_field  s    	(u$E,/T::u$(uZ(.. % 	(||.uV}oRAxHI99Y''	(s   & & 	A39A.(A3.A3rE   c                 `    | j                   j                  |      } | j                  ||fi |S )a4  
        Run the extraction strategy on a combined HTML content.

        How it works:
        1. Combines multiple HTML sections using the `DEL` delimiter.
        2. Calls the `extract` method with the combined HTML.

        Args:
            url (str): The URL of the page being processed.
            sections (List[str]): A list of HTML sections.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items.
        )r4   r   rD   )r9   r>   rE   rC   r:   combined_htmls         r;   rO   z!JsonElementExtractionStrategy.run  s.    $ h/t||C9&99r=   c                      y)zGet text content from elementNrA   r9   rm  s     r;   r  z/JsonElementExtractionStrategy._get_element_text  rs  r=   c                      y)zGet HTML content from elementNrA   r  s     r;   r  z/JsonElementExtractionStrategy._get_element_html  rs  r=   r  c                      y)z Get attribute value from elementNrA   r9   rm  r  s      r;   r  z4JsonElementExtractionStrategy._get_element_attribute  rs  r=   r   r   r   Nr-   schema_typequerytarget_json_examplec                     ddl m} |dk(  r|nt        }d| }d|  d}|r|d| z  }|r	|d| d	z  }|r|s|d
z  }n|s|r|dz  }n	|s|s|dz  }|dz  }dj                  ||g      S )z
        Build the prompt for schema generation. Shared by sync and async methods.

        Returns:
            str: Combined system and user prompt
        r   )JSON_SCHEMA_BUILDERCSSa#  You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.

Generating this HTML manually is not feasible, so you need to generate the JSON schema using the HTML content. The HTML copied from the crawled website is provided below, which we believe contains the repetitive pattern.

# Schema main keys:
- name: This is the name of the schema.
- baseSelector: This is the CSS or XPATH selector that identifies the base element that contains all the repetitive patterns.
- baseFields: This is a list of fields that you extract from the base element itself.
- fields: This is a list of fields that you extract from the children of the base element. {name, selector, type} based on the type, you may have extra keys such as "attribute" when the type is "attribute".

# Extra Context:
In this context, the following items may or may not be present:
- Example of target JSON object: This is a sample of the final JSON object that we hope to extract from the HTML using the schema you are generating.
- Extra Instructions: This is optional instructions to consider when generating the schema provided by the user.
- Query or explanation of target/goal data item: This is a description of what data we are trying to extract from the HTML. This explanation means we're not sure about the rigid schema of the structures we want, so we leave it to you to use your expertise to create the best and most comprehensive structures aimed at maximizing data extraction from this page. You must ensure that you do not pick up nuances that may exist on a particular page. The focus should be on the data we are extracting, and it must be valid, safe, and robust based on the given HTML.

# What if there is no example of target JSON object and also no extra instructions or even no explanation of target/goal data item?
In this scenario, use your best judgment to generate the schema. You need to examine the content of the page and understand the data it provides. If the page contains repetitive data, such as lists of items, products, jobs, places, books, or movies, focus on one single item that repeats. If the page is a detailed page about one product or item, create a schema to extract the entire structured data. At this stage, you must think and decide for yourself. Try to maximize the number of fields that you can extract from the HTML.

# What are the instructions and details for this schema generation?
zJ
                HTML to analyze:
                ```html
                z%
                ```
                z4

## Query or explanation of target/goal data item:
z,

## Example of target JSON object:
```json
z
```a  IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema..a  IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority.zIMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content.a  IMPORTANT:
        0/ Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.
        1/ DO NOT USE use base64 kind of classes, they are temporary and not reliable.
        2/ Every selector must refer to only one unique element. You should ensure your selector points to a single element and is unique to the place that contains the information. You have to use available techniques based on CSS or XPATH requested schema to make sure your selector is unique and also not fragile, meaning if we reload the page now or in the future, the selector should remain reliable.
        3/ Do not use Regex as much as possible.

        Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
        

)promptsr  r   r   )r-   r  r  r  r  prompt_templatesystem_contentuser_contents           r;   _build_schema_promptz2JsonElementExtractionStrategy._build_schema_prompt  s     	11<1E-Kd(  ),   UV[U\]]LNObNcchiiL,  {  {L.  }  }L2  @  @L  	 {{NL9::r=   r  r   r(   r   r   c                    ddl m} t        j                  j	                         D ]#  \  }	}
t               |	   t        d|	 d|
        t        j                  | |||      }	  ||j                  |d|j                  |j                  |      }t        j                  |j                  d   j                  j                        S # t         $ r}t!        d	t#        |             d}~ww xY w)
ak  
        Generate extraction schema from HTML content and optional query (sync version).

        Args:
            html (str): The HTML content to analyze
            query (str, optional): Natural language description of what data to extract
            provider (str): Legacy Parameter. LLM provider to use
            api_token (str): Legacy Parameter. API token for LLM provider
            llm_config (LLMConfig): LLM configuration object
            **kwargs: Additional args passed to LLM processor

        Returns:
            dict: Generated schema following the JsonElementExtractionStrategy format
        r   )r   Nr   r   Tr   r7  r  r   r   r   r   Failed to generate schema: )rN  r   r]  _GENERATE_SCHEMA_UNWANTED_PROPSr   localsr  r  r   r   r   r&  r0  r.  r/  r]   r5  rX   )r-   r  r  r  r   r   r   r:   r   r7   r/  promptr9  r<  s                 r;   generate_schemaz-JsonElementExtractionStrategy.generate_schema?  s    2 	;:ZZ``b 	SMD'x~)$y6Gy%QRR	S /CCD+W\^qr	D6#,,&,"$..#,,!H ::h..q199AABB 	D9#a&BCC	Ds   $A!C 	C+C&&C+c                   K   ddl m} |
t               }t        j	                  | |||      }	  ||j
                  |d|j                  |j                  |       d{   }t        j                  |j                  d   j                  j                        S 7 :# t        $ r}	t        dt        |	             d}	~	ww xY ww)a	  
        Generate extraction schema from HTML content (async version).

        Use this method when calling from async contexts (e.g., FastAPI) to avoid
        issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
        async execution.

        Args:
            html (str): The HTML content to analyze
            schema_type (str): "CSS" or "XPATH"
            query (str, optional): Natural language description of what data to extract
            target_json_example (str, optional): Example of desired JSON output
            llm_config (LLMConfig): LLM configuration object
            **kwargs: Additional args passed to LLM processor

        Returns:
            dict: Generated schema following the JsonElementExtractionStrategy format
        r   rL  NTr  r   r  )rN  rM  r)   r]  r  r   r   r   r&  r0  r.  r/  r]   r5  rX   )
r-   r  r  r  r   r:   rM  r  r9  r<  s
             r;   agenerate_schemaz.JsonElementExtractionStrategy.agenerate_schemam  s     6 	<*,J.CCD+W\^qr	D=#,,&,"$..#,,! H ::h..q199AABB  	D9#a&BCC	Ds:   +C/B B9B CB 	B?#B::B??C)NN)r  NNN)!r6   rU   rV   rW   r4   r   rX   r   r<   r   rD   r   rf  rg  rz  r  rh  r  ri  r  r  rO   r  r  r  r  staticmethodr  r)   r2  r  r  r   r   s   @r;   r]  r]    sk   > C	4tCH~ 	4++&)+	d38n	+Z       s  (2(DT62	(:s :d3i :$tCQTH~BV :* C   C     
 HI'#
 =;3 =;S =; =;be =;qt =; =;~  !#'"3"5+D+D+D +D !	+D
  +D +D +D 
+D +DZ  !#'"&,D,D,D ,D !	,D
  ,D 
,D ,Dr=   r]  c                   z     e Zd ZdZdeeef   f fdZdefdZdefdZ	defdZ
d	efd
Zd	efdZdefdZ xZS )JsonCssExtractionStrategya  
    Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.

    How it works:
    1. Parses HTML content with BeautifulSoup.
    2. Selects elements using CSS selectors defined in the schema.
    3. Extracts field data and applies transformations as defined.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        _parse_html(html_content): Parses HTML content into a BeautifulSoup object.
        _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
        _get_elements(element, selector): Selects child elements using a CSS selector.
        _get_element_text(element): Extracts text content from a BeautifulSoup element.
        _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
        _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
    r   c                 0    d|d<   t        |   |fi | y Nr-   r1   ri   r<   r`  s      r;   r<   z"JsonCssExtractionStrategy.__init__      !'~*6*r=   ra  c                     t        |d      S )Nlxmlr+   rr  s     r;   rf  z%JsonCssExtractionStrategy._parse_html  s    \622r=   rt  c                 $    |j                  |      S r   selectrw  s      r;   rg  z,JsonCssExtractionStrategy._get_base_elements  s    !!(++r=   c                 $    |j                  |      S r   r  ry  s      r;   rz  z'JsonCssExtractionStrategy._get_elements  s     ~~h''r=   r?   c                 &    |j                  d      S )NTr  )get_textr  s     r;   r  z+JsonCssExtractionStrategy._get_element_text  s    d++r=   c                     t        |      S r   )rX   r  s     r;   r  z+JsonCssExtractionStrategy._get_element_html  s    7|r=   r  c                 $    |j                  |      S r   r8   r  s      r;   r  z0JsonCssExtractionStrategy._get_element_attribute      {{9%%r=   )r6   rU   rV   rW   r   rX   r   r<   rf  rg  rz  r  r  r  r   r   s   @r;   r  r    sd    *+tCH~ +3 3, ,(s (
,C ,C & &r=   r  c                        e Zd Zdeeef   f fdZdefdZd Zd Z	d Z
d Zd	 Zd
 ZdefdZdefdZdefdZdefdZdefdZd Z xZS )JsonLxmlExtractionStrategyr   c                     d|d<   t        |   |fi | i | _        i | _        i | _        |j                  dd      | _        |j                  dd      | _        ddlm	}m
} ddlm} || _	        || _        || _        y )	Nr-   r1   use_cachingToptimize_common_patternsr   )r.   r-   CSSSelector)ri   r<   _selector_cache_xpath_cache_result_cacher8   r  r  r  r.   r-   lxml.cssselectr  html_parser)r9   r   r:   r.   r-   r  r5   s         r;   r<   z#JsonLxmlExtractionStrategy.__init__  s    !'~*6*! "::mT:(.

3Mt(T% 	%.
&r=   ra  c                    	 | j                   j                  dd      }| j                   j                  ||      S # t        $ r}| j                  rt        d|        	 | j                  j                  |      cY d}~S # t        $ rD}| j                  rt        d|        | j                   j                  d      cY d}~cY d}~S d}~ww xY wd}~ww xY w)z&Parse HTML content with error recoveryT)recoverremove_blank_textz8Error parsing HTML, falling back to alternative method: NzCritical error parsing HTML: r-   )r.   
HTMLParser
fromstringr5  r3   rx   r  Element)r9   ra  parserr<  e2s        r;   rf  z&JsonLxmlExtractionStrategy._parse_html  s    	2ZZ**44*PF::((v>> 		2||PQRPSTU2''22<@@ 2<<9">?zz))&111	2		2sK   8; 	CCA?9C?	C4C<C=CCCCCc                 P   | j                   s|S ddl} |j                  d|      r|S |j                         }t	        |      dk  r|S t	        |      dkD  rKt        d |D              r9|D cg c]'  }|j                  d      s|j                  d      s&|) }}|r|d	   S |S c c}w )
z8Optimize common selector patterns for better performancer   Nztd:nth-child\(\d+\)r   r   c              3   d   K   | ](  }|j                  d       xs |j                  d       * yw).#N)rG  ).0ps     r;   	<genexpr>z@JsonLxmlExtractionStrategy._optimize_selector.<locals>.<genexpr>  s*     !XQ!,,s"3"Hq||C7H"H!Xs   .0r  r  )r  r  r  r   r   anyrG  )r9   selector_strr  partsr  specific_partss         r;   _optimize_selectorz-JsonLxmlExtractionStrategy._optimize_selector  s    ,, 	299+\: ""$u:? u:>c!XRW!XX).YA!,,s2Cq||TWGXaYNY%b))	 Zs   ,'B#B#c                 *     j                   r j                        	  j                        j                   j                  <   d fd	}|S # t
        $ r+} j                  rt        d d|        ddcY d}~S d}~ww xY w)z6Create a selector function that handles all edge casesc                    d }j                   rL| j                  dd      xs t        t        |             }| d }|j                  v rj                  |   S g }	  |       }|s|rj                  |       }|r| j                  |      }|sdv rj                  |       }|sj                  |       }|sTj                         }|rB|d   }t        j                  d|      }|r%|j                  d      }	| j                  d|	       }j                   r|r|j                  |<   |S # t        $ r(}
j                  rt        d	 d
|
        Y d }
~
|S d }
~
ww xY w)Nid z::	nth-childr  z^(\w+)r   .//Error applying selector '': )r  r8   rX   hashr  _make_context_sensitive_xpathxpath_handle_nth_child_selector_fallback_class_id_searchr   r  r  r  r5  r3   rx   )rm  context_sensitive	cache_key
element_idrT  context_xpathr  	last_part	tag_matchtag_namer<  compiledoriginal_selectorr  r9   r  s              r;   selector_funczKJsonLxmlExtractionStrategy._create_selector_function.<locals>.selector_func  s    	 ##!(T2!6!L#d7m:LJ#-,b ?I D$6$66#11)<<$P&w/G #'8(,(J(J5RY(Z(&-mmM&BG  ';:K+K&*&E&EgO`&aG  '&*&D&DWN_&`G  '$5$;$;$=E$,1"I	,.HHY	,J	#,/8q/AH.5mmc(<L.MG ''I8?**95 	 ! P|| 9,s1#NO	Ps   CD1 1	E":EE"Error compiling selector 'r  c                     g S r   rA   )rm  r   s     r;   r   zFJsonLxmlExtractionStrategy._create_selector_function.<locals>.<lambda>R  s    2 r=   N)T)r  r  r  pathr  r5  r3   rx   )r9   r  r	  r<  r  r  r  s   ``  @@@r;   _create_selector_functionz4JsonLxmlExtractionStrategy._create_selector_function  s    ( ((22<@LE	>''5HMME /4Dl+3 3j !  	>||2<.A3GH >=	>s   7A 	B' BBBc                     	 |j                  d      r|S |j                  d      rd| }nd| }	 |j                  |       |S #  d|j                  d      d    cY S xY w#  Y yxY w)z1Convert absolute XPath to context-sensitive XPathdescendant-or-self::/r  r  r  N)rG  r  r   )r9   r  rm  r  s       r;   r  z8JsonLxmlExtractionStrategy._make_context_sensitive_xpathT  s    	 67 $"#E7"%eW4m,$$4U[[-b1233	s'   A# A# A A A#  A# #A'c                    ddl }g }	  |j                  d|      }|r|j                  d      }|j                  d| dd      d   j	                         }|rA |j                  d|      }|r|j                  d      nd	}	|j                  d
| d|	       }|S |j                  d
| d      }|S # t        $ r%}
| j                  rt        d|
        Y d}
~
|S d}
~
ww xY w)z2Special handling for nth-child selectors in tablesr   Ntd:nth-child\((\d+)\)r   ztd:nth-child()r  z(\w+)r   .//td[]//]z#Error handling nth-child selector: )	r  r  r  r   r  r  r5  r3   rx   )r9   rm  r  r  rT  r  col_numremaining_selectorr  r  r<  s              r;   r  z5JsonLxmlExtractionStrategy._handle_nth_child_selectork  s    	ABII6EE++a. &2%7%7-yPQ8RTU%VWY%Z%`%`%b"% !*		(4F GI5>yq1CH%mmfWIS
,KLG  &mmfWIQ,?@G
 	  	A||;A3?@		As   BB/ B/ /	C8CCc                 x   g }	 ddl } |j                  d|      } |j                  d|      }|D ](  }|j                  d| d      }|j                  |       * |D ](  }	|j                  d|	 d      }
|j                  |
       * 	 |S # t        $ r%}| j
                  rt        d	|        Y d}~|S d}~ww xY w)
z!Fallback to search by class or IDr   Nz\.([a-zA-Z0-9_-]+)z#([a-zA-Z0-9_-]+)z.//*[contains(@class, 'z')]z
.//*[@id='z']z#Error in fallback class/id search: )r  findallr  rH   r5  r3   rx   )r9   rm  r  rT  r  class_matches
id_matches
class_nameclass_resultsid_name
id_resultsr<  s               r;   r  z4JsonLxmlExtractionStrategy._fallback_class_id_search  s    	A&BJJ'<lKM $$8,GJ , .
 '0G
|SV.W X}-.
 & +$]]Zy+CD
z*+ 	  	A||;A3?@		As   BB 	B9B44B9c                 x    || j                   vr| j                  |      | j                   |<   | j                   |   S )z.Get or create a selector function with caching)r  r  )r9   r  s     r;   _get_selectorz(JsonLxmlExtractionStrategy._get_selector  s=    t333151O1OP\1]D  .##L11r=   rt  c                 8    | j                  |      } ||d      S )rv  Fr   r"  r9   rk  rt  r	  s       r;   rg  z-JsonLxmlExtractionStrategy._get_base_elements  s    **84[EBBr=   c                 8    | j                  |      } ||d      S )z>Get child elements using the selector with context sensitivityTr$  r%  r9   rm  rt  r	  s       r;   rz  z(JsonLxmlExtractionStrategy._get_elements  s    **84W==r=   r?   c                 
   	 dj                  d |j                  d      D              }|S # t        $ rN}| j                  rt	        d|        	 |j                         j                         cY d}~S #  Y Y d}~yxY wd}~ww xY w)z$Extract normalized text from elementr   c              3   ^   K   | ]%  }|j                         s|j                          ' y wr   r  )r  r   s     r;   r  z?JsonLxmlExtractionStrategy._get_element_text.<locals>.<genexpr>  s     W!QWWYAGGIWs   --	.//text()zError extracting text: Nr  )r   r  r5  r3   rx   text_contentr  )r9   rm  r  r<  s       r;   r  z,JsonLxmlExtractionStrategy._get_element_text  sv    	88Ww}}[/IWWDK 	||/s34++-3355	s2   (+ 	BA=A2,B2A:4A=:A==Bc                     	 | j                   j                  |dd      S # t        $ r$}| j                  rt	        d|        Y d}~yd}~ww xY w)z)Get HTML string representation of elementunicoder-   )encodingr   zError serializing HTML: Nr  )r.   tostringr5  r3   rx   )r9   rm  r<  s      r;   r  z,JsonLxmlExtractionStrategy._get_element_html  sM    	::&&w6&RR 	||045	s     	AAAr  c                     	 |j                  |      S # t        $ r'}| j                  rt        d| d|        Y d}~yd}~ww xY w)zGet attribute value safelyzError getting attribute 'r  N)r8   r5  r3   rx   )r9   rm  r  r<  s       r;   r  z1JsonLxmlExtractionStrategy._get_element_attribute  sF    	;;y)) 	||1)CsCD	s    	A>Ac                 R    | j                   r| j                  j                          yy)zClear caches to free memoryN)r  r  clear)r9   s    r;   _clear_cachesz(JsonLxmlExtractionStrategy._clear_caches  s"    $$& r=   )r6   rU   rV   r   rX   r   r<   rf  r  r  r  r  r  r"  rg  rz  r  r  r  r4  r   r   s   @r;   r  r    s    'tCH~ '$2 2 .M>^.:62C C>s >
C C  'r=   r  c                   |     e Zd Zdeeef   f fdZdefdZd ZdefdZ	defdZ
d	efd
Zd	efdZdefdZ xZS ) JsonLxmlExtractionStrategy_naiver   c                 >    d|d<   t        |   |fi | i | _        y r  )ri   r<   r  r`  s      r;   r<   z)JsonLxmlExtractionStrategy_naive.__init__  s'    !'~*6*!r=   ra  c                 Z    ddl m}  |j                  d      } |j                  ||      S )Nr   r.   T)r  )r  r.   r  r  )r9   ra  r.   r  s       r;   rf  z,JsonLxmlExtractionStrategy_naive._parse_html  s,    !!!$/uf55r=   c                      j                   vr%ddlm} 	  |       fd}| j                   <    j                      S # t        $ r9} j                  rt        d d|        d }| j                   <   Y d}~Ld}~ww xY w)zCGet a selector function that works within the context of an elementr   r  c                    	  
|       }|r|S 
j                   }|j                  d      r|}nd|j                  d       }| j                  |      }|r|S dv r{dd l} |j
                  d      }|rb|j                  d      }j                  dd      d	   j                         }|r| j                  d
| d|       S | j                  d
| d      S j                         }t        |      dkD  r|d	   r| j                  d|d	          S g S # t        $ r)}	j                  rt        d d|	        g cY d }	~	S d }	~	ww xY w)Nr  z./r  r  r   r  r   r  r  r  r  r  r  r  r  )r  rG  lstripr  r  r  r  r   r  r   r5  r3   rx   )rm  rT  r  r  r  r  r  sub_selectorr  r<  r  r  r9   s             r;   select_funczCJsonLxmlExtractionStrategy_naive._get_selector.<locals>.select_func  sb   +""*7"3"#*N !) !++,BC,1M /1c1B0C,DM")--">"#*N ',6%$-BII.F$UE$*/++a./;/A/A#q/I"/M/S/S/U#/+2==6'#l^9\+]$]+2==6'!9L+M$M !- 2 2 4u:>eBi#*==3uRyk1B#CC!	$ "<<!$=l^3qc"RS!	"s<   D AD A)D D 9D D 	EE :E Er
  r  c                     g S r   rA   )rm  s    r;   fallback_funczEJsonLxmlExtractionStrategy_naive._get_selector.<locals>.fallback_func"  s    Ir=   N)r  r  r  r5  r3   rx   )r9   r  r  r>  r<  r@  r  s   ``    @r;   r"  z.JsonLxmlExtractionStrategy_naive._get_selector  s    t3332<C&|4,"\ 6A$$\2 ##L11  C<<6|nCsKL 6C$$\2Cs   A 	B/BBrt  c                 4    | j                  |      } ||      S r   r%  r&  s       r;   rg  z3JsonLxmlExtractionStrategy_naive._get_base_elements)  s    **84[))r=   c                 4    | j                  |      } ||      S r   r%  r(  s       r;   rz  z.JsonLxmlExtractionStrategy_naive._get_elements-  s    **84W%%r=   r?   c                 ^    dj                  |j                  d            j                         S Nr  r+  r   r  r  r  s     r;   r  z2JsonLxmlExtractionStrategy_naive._get_element_text1  #    www}}[1288::r=   c                 6    ddl m}  |j                  |d      S )Nr   r9  r.  r/  )r  r.   r0  )r9   rm  r.   s      r;   r  z2JsonLxmlExtractionStrategy_naive._get_element_html4  s    u~~g	::r=   r  c                 $    |j                  |      S r   r  r  s      r;   r  z7JsonLxmlExtractionStrategy_naive._get_element_attribute8  r  r=   )r6   rU   rV   r   rX   r   r<   rf  r"  rg  rz  r  r  r  r   r   s   @r;   r6  r6    sf    "tCH~ "
6 6
B2H* *&s &;C ;;C ;& &r=   r6  c                        e Zd ZdZdeeef   f fdZdefdZdefdZ	ded	efd
Z
ded	efdZdefdZd	efdZd	efdZdefdZ xZS )JsonXPathExtractionStrategya  
    Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.

    How it works:
    1. Parses HTML content into an lxml tree.
    2. Selects elements using XPath expressions.
    3. Converts CSS selectors to XPath when needed.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        _parse_html(html_content): Parses HTML content into an lxml tree.
        _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
        _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
        _get_elements(element, selector): Selects child elements using an XPath selector.
        _get_element_text(element): Extracts text content from an lxml element.
        _get_element_html(element): Extracts the raw HTML content of an lxml element.
        _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
    r   c                 0    d|d<   t        |   |fi | y r  r  r`  s      r;   r<   z$JsonXPathExtractionStrategy.__init__R  r  r=   ra  c                 ,    t        j                  |      S r   )r-   r  rr  s     r;   rf  z'JsonXPathExtractionStrategy._parse_htmlV  s    |,,r=   rt  c                 $    |j                  |      S r   )r  rw  s      r;   rg  z.JsonXPathExtractionStrategy._get_base_elementsY  s      **r=   css_selectorr?   c                 0    d|v r|S | j                  |      S )z'Convert CSS selector to XPath if neededr  )_basic_css_to_xpath)r9   rO  s     r;   _css_to_xpathz)JsonXPathExtractionStrategy._css_to_xpath\  s!    ,''55r=   c                     d|v r%|j                  d      }ddj                  |      z   S d|v r%|j                  d      }ddj                  |      z   S d|z   S )z.Basic CSS to XPath conversion for common casesz > z//r  r   )r   r   )r9   rO  r  s      r;   rQ  z/JsonXPathExtractionStrategy._basic_css_to_xpathb  sd    L  &&u-E#((5/)), &&s+E$))E***l""r=   c                 r    | j                  |      }|j                  d      sd|z   }|j                  |      S )Nr  )rR  rG  r  )r9   rm  rt  r  s       r;   rz  z)JsonXPathExtractionStrategy._get_elementsl  s8    ""8,$%KE}}U##r=   c                 ^    dj                  |j                  d            j                         S rD  rE  r  s     r;   r  z-JsonXPathExtractionStrategy._get_element_textr  rF  r=   c                 0    t        j                  |d      S )Nr.  rH  )r.   r0  r  s     r;   r  z-JsonXPathExtractionStrategy._get_element_htmlu  s    ~~g	::r=   r  c                 $    |j                  |      S r   r  r  s      r;   r  z2JsonXPathExtractionStrategy._get_element_attributex  r  r=   )r6   rU   rV   rW   r   rX   r   r<   rf  rg  rR  rQ  rz  r  r  r  r   r   s   @r;   rK  rK  ;  s    ,+tCH~ +- -+ +6# 6# 6# # #$s $;C ;;C ;& &r=   rK      z	
z\x02xz\x08z(?<!\\)\\(?![\\u])r   r?   c                 H   i }| j                         D ]_  \  }}t        j                  d|      j                  t              }t
        j                  d|      }	 t        j                  |       |||<   a |S # t        j                  $ r}t        d| d|       dd}~ww xY w)z>Fix common JSON-escape goofs coming from LLMs or manual edits.z\\b\\\\zRegex for 'u   ' won’t compile after fix: N)
r   _WB_FIXsub	translate_CTRL_NEEDS_ESCAPEr  compiler   
ValueError)r   safer   patr<  s        r;   _sanitize_schemare    s    Dlln 
skk&#&007 -	^JJsO U K	 xx 	^{5'1NqcRSY]]	^s   A77B!
BB!c                       e Zd ZU dZ G d de      Zej                  Zej                  Z	ej                  Zej                  Zej                  Zej                   Zej$                  Zej(                  Zej,                  Zej0                  Zej4                  Zej8                  Zej<                  Zej@                  Z!ejD                  Z#ejH                  Z%ejL                  Z'ejP                  Z)ejT                  Z+ejX                  Z-ej\                  Z/ej`                  Z1 ed      Z2i ddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+Z3e4e5e5f   e6d,<   e7jp                  e7jr                  z  Z:d-d-d.Z;ejx                  fd/d0d1d2dd3e=e>e4e5e5f   e?e@e5e5f      f      d4e5d5d/f fd6ZAde5d7e5d5e?e4e5eBf      fd8ZCd7e5d5e5fd9ZDeEd/d/d/d:d;e5d<e5d=e=e5   d>e=e?e5      d?e=eF   d5e4e5e5f   fd@       ZG xZHS )ARegexExtractionStrategya  
    A lean strategy that finds e-mails, phones, URLs, dates, money, etc.,
    using nothing but pre-compiled regular expressions.

    Extraction returns::

        {
            "url":   "<page-url>",
            "label": "<pattern-label>",
            "value": "<matched-string>",
            "span":  [start, end]
        }

    Only `generate_schema()` touches an LLM, extraction itself is pure Python.
    c                      e Zd Z e       Z e       Z e       Z e       Z e       Z e       Z	 e       Z
 e       Z e       Z e       Z e       Z e       Z e       Z e       Z e       Z e       Z e       Z e       Z e       Z e       Z e       Z e       Zeez  ez  ez  ez  e	z  e
z  ez  ez  ez  ez  ez  ez  ez  ez  ez  ez  ez  ez  ez  ez  Zy)RegexExtractionStrategy._BN)r6   rU   rV   r   EMAIL
PHONE_INTLPHONE_USr  IPV4IPV6UUIDCURRENCY
PERCENTAGENUMBERDATE_ISODATE_USTIME_24H	POSTAL_US	POSTAL_UKHTML_COLOR_HEXTWITTER_HANDLEHASHTAGMAC_ADDRIBANCREDIT_CARDNOTHINGALLrA   r=   r;   _Bri    s=   &&
&&&&&&&
&&&&&	&	&&&&&&&J)C/$6=D#$&,-/78:ABDLM#$&457EF  !! $(( +66 	r=   r  r   emailz[\w.+-]+@[\w-]+\.[\w.-]+
phone_intlz\+?\d[\d .()-]{7,}\dphone_usz#\(?\d{3}\)?[ -. ]?\d{3}[ -. ]?\d{4}r>   zhttps?://[^\s\"'<>]+ipv4z(?:\d{1,3}\.){3}\d{1,3}ipv6z"[A-F0-9]{1,4}(?::[A-F0-9]{1,4}){7}uuidzG[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}currencyu,   (?:USD|EUR|RM|\$|€|£)\s?\d+(?:[.,]\d{2})?
percentagez\d+(?:\.\d+)?%numberz%\b\d{1,3}(?:[,.\s]\d{3})*(?:\.\d+)?\bdate_isoz\d{4}-\d{2}-\d{2}date_usz\d{1,2}/\d{1,2}/\d{2,4}time_24hz.\b(?:[01]?\d|2[0-3]):[0-5]\d(?:[:.][0-5]\d)?\b	postal_usz\b\d{5}(?:-\d{4})?\b	postal_ukz$\b[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}\bhtml_color_hexz#[0-9A-Fa-f]{6}\btwitter_handlez@[\w]{1,15}z#[\w-]+z$(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}z[A-Z]{2}\d{2}[A-Z0-9]{11,30}zH\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|6(?:011|5\d{2})\d{12})\b)hashtagmac_addribancredit_cardDEFAULT_PATTERNSzUse llm_config insteadr   Nfit_html)customr1   r  r  r1   r?   c          	      (   t        	|   dd|i| | j                  j                         D ci c]9  \  }}t	        | j
                  |j                               j                  |z  r||; }}}|rGt        |t              r|j                  |       n%|j                  |D ci c]  \  }}||
 c}}       |j                         D ci c]&  \  }}|t        j                  || j                        ( c}}| _        yc c}}w c c}}w c c}}w )a  
        Args:
            patterns: Custom patterns overriding or extending defaults.
                      Dict[label, regex] or list[tuple(label, regex)].
            input_format: "html", "markdown" or "text".
            **kwargs: Forwarded to ExtractionStrategy.
        r1   NrA   )ri   r<   r  r   getattrr  r  r  r1  r2  rj  r  ra  _FLAGS	_compiled)
r9   r  r  r1   r:   r   rxmergedlblr5   s
            r;   r<   z RegexExtractionStrategy.__init__  s     	=l=f=
  00668"
Rtww		,22W< G"
 "
 &$'f%f=73sBw=> =CLLN.
18bCB,,.
"
 >.
s   >D%D
+Dr]   c           
          g }| j                   j                         D ]^  \  }}|j                  |      D ]E  }|j                  |||j	                  d      |j                         |j                         gd       G ` |S )Nr   )r>   r   r  span)r  r   finditerr   r  startend)	r9   r>   r]   rC   kwoutr   crems	            r;   rD   zRegexExtractionStrategy.extract4  s|    $&....0 		JE3\\'* 

"!&!"!"AEEG 4			 
r=   c                 ^    | j                   dk(  r|S t        |d      j                  dd      S )Nr  r  r   Tr  )r1   r,   r  )r9   r]   s     r;   _plain_textz#RegexExtractionStrategy._plain_textG  s2    &NWf-66s$6GGr=   )r  examplesr   r   r-   r  r  r   c                   t         j                  D ]'  }||v st        | dt         j                  |           |
t               }d|  d|  d}d|dd dg}|r"|j	                  d	|j                                 |r&|j	                  d
dj                  |dd       z          dj                  |      }	t        |j                  dj                  ||	g      d|j                  |j                  |      }
|
j                  d   j                  j                  }|j                  dd      }t        j                   dd|      }	 t#        j$                  |      }|j+                         D ]  \  }}	 t        j,                  |        |S # t&        $ r}t)        d|       |d}~ww xY w# t        j.                  $ r}t)        d| d|       dd}~ww xY w)u   
        Ask an LLM for a single page-specific regex and return
            {label: pattern}   ── ready for RegexExtractionStrategy(custom=…)
        z is deprecated, NzaYou are an expert Python-regex engineer.
Return **one** JSON object whose single key is exactly "u  ", and whose value is a raw-string regex pattern that works with the standard `re` module in Python.

Strict rules (obey every bullet):
• If a *user query* is supplied, treat it as the precise semantic target and optimise the   pattern to capture ONLY text that answers that query. If the query conflicts with the   sample HTML, the HTML wins.
• Tailor the pattern to the *sample HTML* – reproduce its exact punctuation, spacing,   symbols, capitalisation, etc. Do **NOT** invent a generic form.
• Keep it minimal and fast: avoid unnecessary capturing, prefer non-capturing `(?: … )`,   and guard against catastrophic backtracking.
• Anchor with `^`, `$`, or `\b` only when it genuinely improves precision.
• Use inline flags like `(?i)` when needed; no verbose flag comments.
• Output must be valid JSON – no markdown, code fences, comments, or extra keys.
• The regex value must be a Python string literal: **double every backslash** (e.g. `\\b`, `\\d`, `\\\\`).

Example valid output:
{"z5": "(?:RM|rm)\\s?\\d{1,3}(?:,\\d{3})*(?:\\.\\d{2})?"}z```htmli  z```z

## Query
z## Examples
r^  r   r  Tr  r   z\bz(?<!\\)\\(?![\\u"])r[  zLLM did not return valid JSON: zInvalid regex for 'r  )rg  r   r  r)   r   r  r   r   r   r   r   r.  r/  r]   r(  r  r]  r&  r0  r5  rb  r   ra  r   )r   r-   r  r  r   r:   k
system_msg
user_partsuser_msgresprawpattern_dictexcr  rd  r<  s                    r;   generate_patternz(RegexExtractionStrategy.generate_patternR  s     )88 	AF{$c)*A*Q*QRS*T)UV 	 *,JHHMw O" 'b%d 	.  ete4
u{{}o>?o		(3B-0HHI;;z* /(("(++z8.D"E **((
 ll1o%%--kk&%(ff+Wc:	O::c?L
 %**, 	NHCN

3	N   	O>seDE3N	O 88 N #6se3qc!BCMNs0   F .F'	F$FF$'G:GG)Ir6   rU   rV   rW   r   r  rj  Emailrk  	PhoneIntlrl  PhoneUSr  Urlrm  IPv4rn  IPv6ro  Uuidrp  Currencyrq  
Percentagerr  Numberrs  DateIsort  DateUSru  Time24hrv  PostalUSrw  PostalUKrx  HexColorry  TwitterHandlerz  Hashtagr{  MacAddrr|  Ibanr}  
CreditCardr  AllNothingr  r   rX   __annotations__r  
IGNORECASE	MULTILINEr  r   r~  r   r   r   r	   r<   r   rD   r  r  r(   r  r   r   s   @r;   rg  rg    s,   &
W 
> XXE]]I[[GVVCWWDWWDWWD[[H]]JYYF[[GZZF[[G\\H\\H&&H&&MZZG[[GWWD^^JVVCUG
(6( 	2( 	A	( 	2( 	5( 	@( 	e( 	J( 	,( 	C(  	/!(" 	5#($ 	L%(( 	2)(* 	B+(, 	/-(. 	>/(0 &B:f7(d38n < ]]R\\)F,-O 

!
 JN&!
!
 tCH~tE#s(O/DDEF	!

 !
 
!
L3  4S#X;O &H3 H3 H 
  $(,*.SSS }	S
 49%S Y'S 
c3hS Sr=   rg  )Pabcr   r   r   typingr   r   r   r   r	   r
   r   concurrent.futuresr   r   r&  rq   enumr   r   r  r   r   r   r   r   configr   r   r   r   r   rN  r   r   r   r   r    r!   r"   modelsr#   model_loaderr$   r%   r&   r'   typesr(   r)   	functoolsr*   rj   r   r  bs4r,   r  r-   r.   r0   rZ   rd   r   r]  r  r  r6  rK  mapchrr   ordr_  ra  r\  r`  rX   re  rg  )cs   0r;   <module>r     s   #  C C C ?    v  v         0   	  <N <N~
- 
.`D' `DLp. plwD$6 wDr-& = -&^M'!> M'^_&'D _&B>&"? >&@
 ),Cr(;Q1q?Pr#a&	Q
"**W


01T#s(^ S#X (M0 M3 	Rs   	E=)E=