
    /i                         d Z ddlmZmZ ddlmZmZmZmZm	Z	m
Z
 ddlmZ ddlZddlZddlmZmZ ddlmZmZ ddlZdd	lmZmZ ddlZddlZ G d
 de      Z G d de      Z G d de      Z G d de      Zy)z
Table extraction strategies for Crawl4AI.

This module provides various strategies for detecting and extracting tables from HTML content.
The strategy pattern allows for flexible table extraction methods while maintaining a consistent interface.
    )ABCabstractmethod)DictListOptionalAnyUnionTuple)etreeN   )	LLMConfigcreate_llm_config)perform_completion_with_backoffsanitize_html)ThreadPoolExecutoras_completedc                   j    e Zd ZdZd Zedej                  dee	e
ef      fd       Zdde
de
de
fd	Zy
)TableExtractionStrategyz
    Abstract base class for all table extraction strategies.
    
    This class defines the interface that all table extraction strategies must implement.
    It provides a consistent way to detect and extract tables from HTML content.
    c                 `    |j                  dd      | _        |j                  dd      | _        y)z
        Initialize the table extraction strategy.
        
        Args:
            **kwargs: Additional keyword arguments for specific strategies
        verboseFloggerN)getr   r   )selfkwargss     U/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/crawl4ai/table_extraction.py__init__z TableExtractionStrategy.__init__   s(     zz)U3jj40    elementreturnc                      y)ad  
        Extract tables from the given HTML element.
        
        Args:
            element: The HTML element (typically the body or a container element)
            **kwargs: Additional parameters for extraction
            
        Returns:
            List of dictionaries containing table data, each with:
                - headers: List of column headers
                - rows: List of row data (each row is a list)
                - caption: Table caption if present
                - summary: Table summary attribute if present
                - metadata: Additional metadata about the table
        N r   r   r   s      r   extract_tablesz&TableExtractionStrategy.extract_tables'   s    " 	r   levelmessagetagc                 h    | j                   r&t        | j                   |d      }|r |d||d| yyy)z#Helper method to safely use logger.N)r%   r&   r!   )r   getattr)r   r$   r%   r&   r   
log_methods         r   _logzTableExtractionStrategy._log:   s:    ;; eT:J>7>v>  r   N)TABLE)__name__
__module____qualname____doc__r   r   r   Elementr   r   strr   r#   r*   r!   r   r   r   r      sZ    1 emm $tCQTH~BV  $?# ? ?# ?r   r   c                        e Zd ZdZ fdZdej                  deee	e
f      fdZdej                  defdZdej                  dee	e
f   fdZ xZS )	DefaultTableExtractiona^  
    Default table extraction strategy that implements the current Crawl4AI table extraction logic.
    
    This strategy uses a scoring system to identify data tables (vs layout tables) and
    extracts structured data including headers, rows, captions, and summaries.
    It handles colspan and rowspan attributes to preserve table structure.
    c                     t        |   di | |j                  dd      | _        |j                  dd      | _        |j                  dd      | _        y)a  
        Initialize the default table extraction strategy.
        
        Args:
            table_score_threshold (int): Minimum score for a table to be considered a data table (default: 7)
            min_rows (int): Minimum number of rows for a valid table (default: 0)
            min_cols (int): Minimum number of columns for a valid table (default: 0)
            **kwargs: Additional parameters passed to parent class
        table_score_threshold   min_rowsr   min_colsNr!   )superr   r   r5   r7   r8   )r   r   	__class__s     r   r   zDefaultTableExtraction.__init__K   sM     	"6"%+ZZ0G%K"

:q1

:q1r   r   r   c           	         g }|j                  d| j                        }|j                  d      }|D ]  }| j                  ||      s	 | j	                  |      }| j
                  dkD  r)t        |j                  dg             | j
                  k  ra| j                  dkD  rbt        |j                  dg             xs5 |j                  d      r"t        d |j                  dg       D              nd}|| j                  k  r|j                  |        |S # t        $ r*}	| j                  dd	t        |	       d
       Y d}	~	d}	~	ww xY w)a>  
        Extract all data tables from the HTML element.
        
        Args:
            element: The HTML element to search for tables
            **kwargs: Additional parameters (can override instance settings)
            
        Returns:
            List of dictionaries containing extracted table data
        r5   .//table)r5   r   rowsheadersc              3   2   K   | ]  }t        |        y wNlen.0rows     r   	<genexpr>z8DefaultTableExtraction.extract_tables.<locals>.<genexpr>x   s     OSCO   errorzError extracting table data: TABLE_EXTRACTN)r   r5   xpathis_data_tableextract_table_datar7   rB   r8   maxappend	Exceptionr*   r1   )
r   r   r   tables_datascore_thresholdtablestable
table_data	col_countes
             r   r#   z%DefaultTableExtraction.extract_tablesZ   sA     !**%<d>X>XY z* 	E!!%!O!%!8!8!?J }}q(S1K-Lt}}-\ }}q($'
y"(E$F %S]SaSabhSiCOJNN624NOOop " %t}}4$&&z2!	* 	 ! IIg)Fs1vh'OQ`as&   
ADA0DD	E"EErS   c                    d}t        |j                  d            dkD  }t        |j                  d            dkD  }|r|dz  }|r|dz  }t        |j                  d            }|dkD  r|dz  }|s|j                  d      r|dz  }t        |j                  d            dkD  r|d	z  }|j                  d
d      j                         }|dv r|d	z  }|j                  d      }|sy|D 	cg c]  }	t        |	j                  d             }
}	|
rAt	        |
      t        |
      z  t	        fd|
D              t        |
      z  }|dk  r|dz  }|j                  d      r|dz  }|j                  d      r|dz  }t	        d |D              }t	        d |j                         D              }||dz   z  }|dkD  r|d	z  }n
|dkD  r|dz  }t	        d |j                  D              }||dz  z  }|
r/t        |      dk\  r!t	        |
      t        |
      z  dk\  r|dz  }|j                  d| j                        }||k\  S c c}	w )a[  
        Determine if a table is a data table (vs. layout table) using a scoring system.
        
        Args:
            table: The table element to evaluate
            **kwargs: Additional parameters (e.g., table_score_threshold)
            
        Returns:
            True if the table scores above the threshold, False otherwise
        r   .//thead.//tbody   r   .//th.//tr[1]/thr<      role >   nonepresentation.//trF.//td|.//thc              3   .   K   | ]  }|z
  d z    yw)rZ   Nr!   )rD   cavg_colss     r   rF   z7DefaultTableExtraction.is_data_table.<locals>.<genexpr>   s     AAL1,As   z
.//captionsummaryc              3      K   | ]P  }|j                  d       D ]:  }t        dj                  |j                               j	                                < R yw)rc   r_   N)rJ   rB   joinitertextstrip)rD   rE   cells      r   rF   z7DefaultTableExtraction.is_data_table.<locals>.<genexpr>   sS      
		-0
  (..01
1
s   AAc              3       K   | ]  }d   yw)r   Nr!   )rD   _s     r   rF   z7DefaultTableExtraction.is_data_table.<locals>.<genexpr>   s     <q<s   gh㈵>   
   c              3   D   K   | ]  }|j                  d       sd  yw)zdata-r   N)
startswith)rD   attrs     r   rF   z7DefaultTableExtraction.is_data_table.<locals>.<genexpr>   s     Pttw7OPs     g      ?r5   )rB   rJ   r   lowersumiterdescendantsattribr5   )r   rS   r   score	has_thead	has_tbodyth_countr^   r=   rE   
col_countsvariance
total_text
total_tags
text_ratio
data_attrs	thresholdrf   s                    @r   rK   z$DefaultTableExtraction.is_data_table   sa     J/014	J/014	QJEQJE u{{7+,a<QJEEKK6
 u{{:&'!+QJE yy$**,++QJE {{7#?CDc#))M23D
D:Z8HAjAAC
OSH!|
 ;;|$QJE99YQJE  

 


 <E$9$9$;<<
:#45
?QJE"_QJE Pu||PP
c!! #d)q.:Z8H1}
JJ68R8RS		!!K Es   +!Ic                    |j                  d      }|r|d   j                         nd}|j                  dd      j                         }g }|j                  d      }|rj|d   j                  d      }|D ]P  }|j                         j                         }t	        |j                  dd            }	|j                  |g|	z         R nz|j                  d	      }
|
rg|
d   j                  d
      D ]P  }|j                         j                         }t	        |j                  dd            }	|j                  |g|	z         R g }|j                  d      D ]|  }g }|j                  d      D ]P  }|j                         j                         }t	        |j                  dd            }	|j                  |g|	z         R |sl|j                  |       ~ |rt        |      n|rt        d |D              nd}g }|D ]+  }|d| dg|t        |      z
  z  z   }|j                  |       - |s#|dkD  rt        |      D cg c]
  }d|dz     }}t        |      |t        |      xs t        |j                  d            t        |      t        |      d}|j                  d      r|j                  d      |d<   |j                  d      r|j                  d      |d<   |||||dS c c}w )a  
        Extract structured data from a table element.
        
        Args:
            table: The table element to extract data from
            
        Returns:
            Dictionary containing:
                - headers: List of column headers
                - rows: List of row data (each row is a list)
                - caption: Table caption if present
                - summary: Table summary attribute if present
                - metadata: Additional metadata about the table
        z.//caption/text()r   r_   rg   z.//thead/trr[   colspanr   z.//tr[1]z.//th|.//tdz.//tr[not(ancestor::thead)]z.//tdc              3   2   K   | ]  }t        |        y wr@   rA   rC   s     r   rF   z<DefaultTableExtraction.extract_table_data.<locals>.<genexpr>  s     )SC)rG   NzColumn r\   )	row_countcolumn_counthas_headershas_captionhas_summaryidclassr>   r=   captionrg   metadata)rJ   rk   r   text_contentintextendrN   rB   rM   rangebool)r   rS   r   rg   r>   
thead_rowsheader_cellsrl   textr   	first_rowr=   rE   row_datamax_columnsaligned_rowsalignedir   s                      r   rL   z)DefaultTableExtraction.extract_table_data   s     ++12(/'!*""$R))Ir*002 [[/
%a=..w7L$ 1((*002dhhy!45v/01 J/I%aL..}= 5D,,.446D!$((9a"89GNND6G#345 ;;<= 	&CH		'* 2((*002dhhy!45 012 H%	& '.c'l-1C)D))q 	  	)C,;'2$+C2H*IIG(	)
 ;?05k0BC11CGC \*'
+OtEKK4N/O==
 99T?"YYt_HTN99W %		' 2HW   
 	
# Ds   =K()r,   r-   r.   r/   r   r   r0   r   r   r1   r   r#   r   rK   rL   __classcell__r:   s   @r   r3   r3   B   sp    2(emm ($tCQTH~BV (TO"5== O"t O"bS
 S
$sCx. S
r   r3   c                   D    e Zd ZdZdej
                  deeee	f      fdZ
y)NoTableExtractionz
    A strategy that does not extract any tables.
    
    This can be used to explicitly disable table extraction when needed.
    r   r   c                     g S )z
        Return an empty list (no tables extracted).
        
        Args:
            element: The HTML element (ignored)
            **kwargs: Additional parameters (ignored)
            
        Returns:
            Empty list
        r!   r"   s      r   r#   z NoTableExtraction.extract_tables2  s	     	r   N)r,   r-   r.   r/   r   r0   r   r   r1   r   r#   r!   r   r   r   r   +  s,    emm $tCQTH~BV r   r   c                   n    e Zd ZdZdZ	 	 	 	 	 	 	 	 d(dee   dee   dede	deded	ed
e	f fdZ
dej                  deeeef      fdZdedefdZdede	fdZdedeeej                     eej                     eej                     e	f   fdZdedeee   e	f   fdZdedee   dee   defdZdee   dedee   fdZd)dededede	deeef   f
d Zd!eeeef      deeeef      fd"Zdedeeeef      fd#Zdej                  dedeej                     fd$Zd%ede	fd&Zd%edeeef   fd'Z xZS )*LLMTableExtractionag  
    LLM-based table extraction strategy that uses language models to intelligently extract 
    and structure table data, handling complex cases like rowspan, colspan, and nested tables.
    
    This strategy uses an LLM to understand table structure semantically and convert it to 
    structured data that can be easily consumed by pandas DataFrames.
    aw*  You are a specialized table extraction system that converts complex HTML tables into structured JSON data. Your primary goal is to handle difficult, irregular HTML tables that cannot be easily parsed by standard tools, transforming them into clean, tabulated data.

## Critical Requirements

**IMPORTANT**: You must extract **EVERY SINGLE ROW** from the table, regardless of size. Tables often contain hundreds of rows, and omitting data is unacceptable. The reason we use an LLM for this task is because these tables have complex structures that standard parsers cannot handle properly.

## Output Format

**Your response must be valid JSON**. The output must be properly formatted, parseable JSON with:
- Proper escaping of quotes in strings
- Valid JSON syntax (commas, brackets, etc.)
- No trailing commas
- Proper handling of special characters

## Table Structure

Every table should be extracted as a JSON object with this structure:

```json
{
    "headers": ["Column 1", "Column 2", ...],
    "rows": [
        ["Row 1 Col 1", "Row 1 Col 2", ...],
        ["Row 2 Col 1", "Row 2 Col 2", ...],
        // ... continue for ALL rows ...
    ],
    "caption": "Table caption if present",
    "summary": "Table summary attribute if present",
    "metadata": {
        "row_count": <actual_number_of_rows>,
        "column_count": <number>,
        "has_headers": <boolean>,
        "has_merged_cells": <boolean>,
        "nested_tables": <boolean>,
        "table_type": "data|pivot|matrix|nested"
    }
}
```

## Handling Complex Structures

### Why This Matters
Standard HTML parsers fail on tables with:
- Complex colspan/rowspan arrangements
- Nested tables
- Irregular structures
- Mixed header patterns

Your job is to intelligently interpret these structures and produce clean, regular data.

### Colspan (Merged Columns)
When a cell spans multiple columns, duplicate the value across all spanned columns to maintain rectangular data structure.

Example HTML:
```html
<tr>
    <td colspan="3">Quarterly Report</td>
    <td>Total</td>
</tr>
```
Becomes: ["Quarterly Report", "Quarterly Report", "Quarterly Report", "Total"]

### Rowspan (Merged Rows)
When a cell spans multiple rows, duplicate the value down all affected rows.

Example with many rows:
```html
<tr>
    <td rowspan="50">Category A</td>
    <td>Item 1</td>
    <td>$100</td>
</tr>
<tr>
    <td>Item 2</td>
    <td>$200</td>
</tr>
<!-- ... 48 more rows ... -->
```

Result structure (response must be valid JSON):
```json
{
    "headers": ["Category", "Item", "Price"],
    "rows": [
        ["Category A", "Item 1", "$100"],
        ["Category A", "Item 2", "$200"],
        ["Category A", "Item 3", "$300"],
        ["Category A", "Item 4", "$400"],
        ["Category A", "Item 5", "$500"],
        // ... ALL 50 rows must be included ...
        ["Category A", "Item 50", "$5000"]
    ],
    "metadata": {
        "row_count": 50,
        "column_count": 3,
        "has_headers": true,
        "has_merged_cells": true,
        "nested_tables": false,
        "table_type": "data"
    }
}
```

### Nested Tables
For tables containing other tables:
1. Extract the outer table structure
2. Represent nested tables as a JSON string or structured representation
3. Ensure the data remains usable

## Complete Examples

### Example 1: Large Table with Complex Structure

Input HTML (abbreviated for documentation):
```html
<table>
    <thead>
        <tr>
            <th rowspan="2">Department</th>
            <th colspan="4">2024 Performance</th>
        </tr>
        <tr>
            <th>Q1</th>
            <th>Q2</th>
            <th>Q3</th>
            <th>Q4</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td rowspan="15">Sales</td>
            <td>Region North</td>
            <td>$1.2M</td>
            <td>$1.5M</td>
            <td>$1.8M</td>
        </tr>
        <tr>
            <td>Region South</td>
            <td>$0.9M</td>
            <td>$1.1M</td>
            <td>$1.3M</td>
        </tr>
        <!-- ... 13 more regions ... -->
        <tr>
            <td rowspan="20">Engineering</td>
            <td>Team Alpha</td>
            <td>85%</td>
            <td>88%</td>
            <td>92%</td>
        </tr>
        <!-- ... 19 more teams ... -->
        <!-- ... continue for 200+ total rows ... -->
    </tbody>
</table>
```

Output (showing structure with all rows) - must be valid JSON:
```json
{
    "headers": ["Department", "Team/Region", "Q1", "Q2", "Q3", "Q4"],
    "rows": [
        ["Sales", "Region North", "$1.2M", "$1.5M", "$1.8M"],
        ["Sales", "Region South", "$0.9M", "$1.1M", "$1.3M"],
        ["Sales", "Region East", "$1.1M", "$1.4M", "$1.6M"],
        ["Sales", "Region West", "$1.0M", "$1.2M", "$1.5M"],
        ["Sales", "Region Central", "$0.8M", "$1.0M", "$1.2M"],
        // ... ALL 15 Sales rows must be included ...
        ["Engineering", "Team Alpha", "85%", "88%", "92%"],
        ["Engineering", "Team Beta", "82%", "85%", "89%"],
        ["Engineering", "Team Gamma", "88%", "90%", "93%"],
        // ... ALL 20 Engineering rows must be included ...
        // ... Continue for EVERY row in the table ...
    ],
    "caption": "",
    "summary": "",
    "metadata": {
        "row_count": 235,
        "column_count": 6,
        "has_headers": true,
        "has_merged_cells": true,
        "nested_tables": false,
        "table_type": "data"
    }
}
```

### Example 2: Pivot Table with Hundreds of Rows

Input structure:
```html
<table>
    <tr>
        <th>Product ID</th>
        <th>Jan</th>
        <th>Feb</th>
        <!-- ... all 12 months ... -->
    </tr>
    <tr>
        <td>PROD-001</td>
        <td>1,234</td>
        <td>1,456</td>
        <!-- ... -->
    </tr>
    <!-- ... 500+ product rows ... -->
</table>
```

Output must include ALL rows and be valid JSON:
```json
{
    "headers": ["Product ID", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
    "rows": [
        ["PROD-001", "1,234", "1,456", "1,789", "2,012", "2,234", "2,456", "2,678", "2,890", "3,123", "3,345", "3,567", "3,789"],
        ["PROD-002", "2,345", "2,567", "2,789", "3,012", "3,234", "3,456", "3,678", "3,890", "4,123", "4,345", "4,567", "4,789"],
        ["PROD-003", "3,456", "3,678", "3,890", "4,123", "4,345", "4,567", "4,789", "5,012", "5,234", "5,456", "5,678", "5,890"],
        // ... ALL 500+ rows MUST be included ...
        ["PROD-547", "9,876", "10,098", "10,321", "10,543", "10,765", "10,987", "11,210", "11,432", "11,654", "11,876", "12,098", "12,321"]
    ],
    "metadata": {
        "row_count": 547,
        "column_count": 13,
        "has_headers": true,
        "has_merged_cells": false,
        "nested_tables": false,
        "table_type": "pivot"
    }
}
```

## Critical Data Integrity Rules

1. **COMPLETENESS**: Extract EVERY row, no matter how many (10, 100, 1000+)
2. **ACCURACY**: Preserve exact values, including formatting
3. **STRUCTURE**: Maintain consistent column count across all rows
4. **VALIDATION**: Ensure output is valid JSON that can be parsed
5. **ESCAPING**: Properly escape quotes and special characters in cell values

## Special Handling Instructions

### Large Tables
- Never abbreviate or summarize
- Never use "..." to indicate omitted rows
- Process every row even if it takes significant time
- The metadata row_count must match actual extracted rows

### Complex Merged Cells
- Track rowspan/colspan values carefully
- Ensure proper cell duplication
- Maintain data alignment across all rows

### Data Types
- Keep numbers as strings to preserve formatting
- Preserve currency symbols, percentages, etc.
- Handle empty cells as empty strings ""

### Error Prevention
- If a cell contains quotes, escape them properly
- Handle newlines within cells appropriately
- Ensure no JSON syntax errors

## Output Validation

Before returning results:
1. Verify JSON is valid and parseable
2. Confirm row count matches actual data
3. Check that all rows have same column count
4. Ensure all data is preserved without truncation

## JSON Schema Definition

Your output must conform to the following JSON schema (OpenAPI 3.0 format):

{
 "components": {
   "schemas": {
     "ExtractedTable": {
       "type": "object",
       "required": [
         "headers",
         "rows",
         "metadata"
       ],
       "properties": {
         "headers": {
           "type": "array",
           "description": "Column headers for the table",
           "items": {
             "type": "string"
           },
           "minItems": 1
         },
         "rows": {
           "type": "array",
           "description": "All table rows - must include every single row",
           "items": {
             "type": "array",
             "items": {
               "type": "string"
             },
             "minItems": 1
           }
         },
         "caption": {
           "type": "string",
           "description": "Table caption if present",
           "default": ""
         },
         "summary": {
           "type": "string",
           "description": "Table summary attribute if present",
           "default": ""
         },
         "metadata": {
           "type": "object",
           "required": [
             "row_count",
             "column_count",
             "has_headers",
             "has_merged_cells",
             "nested_tables",
             "table_type"
           ],
           "properties": {
             "row_count": {
               "type": "integer",
               "description": "Actual count of rows extracted",
               "minimum": 0
             },
             "column_count": {
               "type": "integer",
               "description": "Number of columns in the table",
               "minimum": 1
             },
             "has_headers": {
               "type": "boolean",
               "description": "Whether table has identified headers"
             },
             "has_merged_cells": {
               "type": "boolean",
               "description": "Whether table contains colspan or rowspan"
             },
             "nested_tables": {
               "type": "boolean",
               "description": "Whether table contains nested tables"
             },
             "table_type": {
               "type": "string",
               "enum": ["data", "pivot", "matrix", "nested"],
               "description": "Classification of table structure"
             }
           }
         }
       }
     }
   }
 }
}

**CRITICAL**: Your response must be a valid JSON object that conforms to this schema. The entire purpose of using an LLM for this task is to handle complex HTML tables that standard parsers cannot process correctly. Your value lies in intelligently interpreting complex structures and returning complete, clean, tabulated data in valid JSON format.
llm_configcss_selector	max_triesenable_chunkingchunk_token_thresholdmin_rows_per_chunkmax_parallel_chunksr   c	                 ~   t        
|   d
d|i|	 || _        | j                  s9t        t	        j
                  dd      t	        j
                  d            | _        || _        t        d|      | _        || _	        || _
        t        d|      | _        t        d|      | _        |	j                  di       | _        y	)a  
        Initialize the LLM-based table extraction strategy.
        
        Args:
            llm_config: LLM configuration for the extraction
            css_selector: Optional CSS selector to focus on specific page areas
            max_tries: Maximum number of retries if LLM fails to extract tables (default: 3)
            enable_chunking: Enable smart chunking for large tables (default: True)
            chunk_token_threshold: Token threshold for triggering chunking (default: 3000)
            min_rows_per_chunk: Minimum rows per chunk (default: 10)
            max_parallel_chunks: Maximum parallel chunk processing (default: 5)
            verbose: Enable verbose logging
            **kwargs: Additional parameters passed to parent class
        r   DEFAULT_PROVIDERzopenai/gpt-4o-miniOPENAI_API_KEY)provider	api_tokenr      
extra_argsNr!   )r9   r   r   r   osgetenvr   rM   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r:   s             r   r   zLLMTableExtraction.__init__  s    0 	33F3 %/#57KL))$45DO
 )Q	*.%:""%a);"<#&q*=#>  **\26r   r   r   c                 d   |j                  d| j                        }|rD| j                  ||      }|s| j                  dd|        g S dj	                  d |D              }nt        j                  |d      }d|j                         vr | j                  r| j                  d	d
       g S | j                  r| j                  d	dt        |              | j                  r@| j                  |      r/| j                  r| j                  d	d       | j                  |      S dt        |       d}t        d| j                  dz         D ]Y  }	 | j                  r(|dkD  r#| j                  d	d| d| j                   d       t!        | j"                  j$                  | j&                  dz   |z   dz   | j"                  j(                  | j"                  j*                  d| j"                  j,                  | j"                  j.                  | j"                  j0                  | j2                  	      }|ru|j4                  rh|j4                  d   j6                  j8                  }	| j                  rG| j                  ddt;        |	              t=        |	t>              r| j                  dd|	dd  d       t=        |	t>              rtA        jB                  |	      }
n|	}
t=        |
tD              r!d|
v r|
d   }
nd|
v r|
d   }
nd |
v r|
d    }
n|
g}
t=        |
tF              rXt        |
      dk(  rJt=        |
d   tF              r7|
d   }
t=        |
tF              r"t        |
      dk(  rt=        |
d   tF              r7t=        |
tF              s|
g}
| j                  r| j                  dd!t        |
       d"       g }|
D ]V  }| jI                  |      r!|jK                  | jM                  |             5| j                  sB| j                  dd#|        X |r1| j                  r!| j                  d	d$t        |       d%|        |c S || j                  k  r$| j                  r| j                  dd&| d'       *| j                  r | j                  dd(| j                   d)       g c S \ g S # t@        jN                  $ rP}| j                  r!| j                  d*d+| d,t?        |              || j                  k  rY d}~g cY d}~c S d}~wtP        $ r}| j                  rW| j                  d*d-| d,t?        |              || j                  k(  r'ddl)}| j                  dd.|jU                                 || j                  k  rddl+} |jX                  d       Y d}~Wg cY d}~c S d}~ww xY w)/a  
        Extract tables from HTML using LLM.
        
        Args:
            element: The HTML element to search for tables
            **kwargs: Additional parameters
            
        Returns:
            List of dictionaries containing extracted table data
        r   warningz$No elements found for CSS selector: r_   c              3   J   K   | ]  }t        j                  |d         yw)unicodeencodingN)r   tostring)rD   elems     r   rF   z4LLMTableExtraction.extract_tables.<locals>.<genexpr>  s     "jPT5>>$#K#K"js   !#r   r   z<tableinfoz%No <table> tags found in HTML contentz*Found table tags in HTML, content length: z9Content exceeds token threshold, using chunked extractionzFGENERATE THE TABULATED DATA from the following HTML content:

```html
R
```

Return only a JSON array of extracted tables following the specified format.r   Retry attempt /z for table extraction

z{

 MAKE SURE TO EXTRACT ALL DATA, DO NOT LEAVE ANYTHING FOR BRAVITY, YOUR GOAL IS TO RETURN ALL, NO MATTER HOW LONG IS DATAT	r   prompt_with_variablesr   base_urljson_response
base_delaymax_attemptsexponential_factorr   r   debugzLLM response type: zLLM response preview: N   z...resultrR   datazParsed z table(s) from LLM responsezTable failed validation: zSuccessfully extracted z tables using LLM on attempt z%No valid tables extracted on attempt z, retrying...z No valid tables extracted after z	 attemptsrH   zJSON parsing error on attempt : z)Error in LLM table extraction on attempt zTraceback: )-r   r   _css_to_xpath_selectr*   ri   r   r   rt   r   rB   r   _needs_chunking_extract_with_chunkingr   r   r   r   r   r   TABLE_EXTRACTION_PROMPTr   r   backoff_base_delaybackoff_max_attemptsbackoff_exponential_factorr   choicesr%   contenttype
isinstancer1   jsonloadsdictlist_validate_table_structurerN   _ensure_table_formatJSONDecodeErrorrO   	traceback
format_exctimesleep)r   r   r   r   selected_elementshtml_contentuser_promptattemptresponser   rP   validated_tablesrS   rV   r   r   s                   r   r#   z!LLMTableExtraction.extract_tables  sm    zz.$2C2CD  !% 9 9'< P$		)'KL>%Z[	77"jXi"jjL !>>'IFL <--//||		&$IKI<<IIf J3|K\J]^_ D$8$8$F||		&"]^..|<< |  MP Q 23 h	Gg<<GaKIIfwiq@PPe&fg ;!__55*.*F*F*OR]*]  a`  +`"oo77!__55"&#AA!%!E!E'+'Q'Q#
  0 0&..q199AAG||		'-@g+PQ%gs3 IIg1GPTQTVY/Z[ "'3/&*jj&9&- "+t4#{2*5h*?K%4*5h*?K#{2*5f*=K ,7-K %[$7C<LPQ<QV`almnaoquVv&1!n %[$7C<LPQ<QV`almnaoquVv &k48'2m||		'WS5E4FFa+bc (*$!, V99%@,33D4M4Me4TU!\\ IIi3LUG1TU	V (<< IIf0GL\H]G^^{  }D  |E  /F  G// /<< IIi3XY`Xaan1op << IIi3STXTbTbSccl1mn!	ah	V 	3 '' <<IIg)GyPRSVWXSYRZ'[\T^^+I <<IIg)RSZR[[]^abc^d]e'fg$..0(		'[9M9M9O8P+QR T^^+DJJqMIsX   H#R)3BR)7AR)1R)4-R))V/<<T>T?V/V/BV*!V*"V/*V/r   c                     	 d| j                   j                  j                         v r/t        j                  d      }t        |j                  |            S 	 t        |      dz  S #  Y xY w)zz
        Estimate token count for text.
        Uses tiktoken for OpenAI models, simple approximation for others.
        gptzgpt-3.5-turbo   )r   r   rt   tiktokenencoding_for_modelrB   encode)r   r   r   s      r   _estimate_tokensz#LLMTableExtraction._estimate_tokens  se    
	006688#66G8??4011 9 4yA~		s   AA& &A*r   c                     | j                   sy| j                  |      }|| j                  kD  }| j                  r%|r#| j	                  dd| d| j                   d       |S )zJ
        Check if table HTML needs chunking based on token count.
        Fr   zTable needs chunking: z
 tokens > z
 threshold)r   r   r   r   r*   )r   r   token_countneeds_chunks       r   r   z"LLMTableExtraction._needs_chunking  se     ##++L9!D$>$>><<KIIf 6{m:dNhNhMiistur   c                    t        j                         }t        j                  ||      }|j                  d      }|sg g g dfS |d   }g }|j                  d      }|r|d   j                  d      }n:|j                  d      D ]&  }|j                  d      r|j	                  |       & n t        |      dkD  }	g }
|j                  d      }|r|d   j                  d      }
g }|j                  d      }|r|d   j                  d      }n8|j                  d      }t        |      }t        |
      }|dkD  r|||  }n||d	 }|	s|s|d   j                  d      }|||
|	fS )
z
        Extract headers, body rows, and footer from table HTML.
        
        Returns:
            Tuple of (header_rows, body_rows, footer_rows, has_headers)
        r<   Fr   rX   rb   r[   z.//tfootrY   N)r   
HTMLParser
fromstringrJ   rN   rB   )r   r   parsertreerR   rS   header_rowstheadrE   r   footer_rowstfoot	body_rowstbodyall_rowsheader_countfooter_counts                    r   _extract_table_structurez+LLMTableExtraction._extract_table_structure  s    !!#f5 J'r2u$$q	 J'(..1K {{7+ 99W%&&s+	 +&* J'(..1K 	J'aw/I {{7+H{+L{+La$\<-@	$\]3	 5q	0II{K??r   c                    | j                   r| j                  ddt        |       d       | j                  |      \  }}}}| j                   r7| j                  ddt        |       dt        |       dt        |       d       |s#| j                   r| j                  dd       |g|fS d	}|rDt	        j
                  d
      }|D ]  }|j                  |        t	        j                  |d      }g }	g }
| j                  |      }|D ]  }t	        j                  |d      }| j                  |      }|
rN||z   | j                  kD  r<| j                  ||
d      }|	j                  |       |g}
| j                  |      |z   }{|
j                  |       ||z  } |
rld}|rDt	        j
                  d      }|D ]  }|j                  |        t	        j                  |d      }| j                  ||
|      }|	j                  |       t        |	      dkD  r| j                  |	| j                        }	| j                   r| j                  ddt        |	       d       |	|fS )z
        Create smart chunks of table HTML, preserving headers in each chunk.
        
        Returns:
            Tuple of (chunks, has_headers)
        r   zCreating smart chunks from z characters of HTMLzTable structure: z header rows, z body rows, z footer rowsz-No body rows to chunk, returning full contentr_   r   r   r   Nr   r   Created z chunks for parallel processing)r   r*   rB   r  r   r0   rN   r   r   r   _create_chunk_html_rebalance_chunksr   )r   r   r   r   r   r   header_htmlthead_elementrE   chunkscurrent_chunk_rowscurrent_token_countrow_html
row_tokens
chunk_htmlfooter_htmltfoot_elements                    r   _create_smart_chunksz'LLMTableExtraction._create_smart_chunks  sy    <<IIf ;C<M;NNabc;?;X;XYe;f8Y[<<IIf 1#k2B1C>RUV_R`Qaamnqr}n~m  @L  M  N||		&"QR >;.. !MM'2M" *$$S)*..KK "33K@ 	2C~~cI>H..x8J "':Z'G$JdJd'd!44[BTVZ[
j) '/Z"&*&;&;K&H:&U#"))(3#z1#	2$ K %g 6& .C!((-.#nn]YO00>PR]^JMM*% v;?++FD4K4KLF<<IIfV5TUV{""r   r  r   r  c                     dg}|r|j                  |       |j                  d       |j                  |       |j                  d       |r|j                  |       |j                  d       dj                  |      S )zb
        Create a complete table HTML chunk with headers, body rows, and optional footer.
        z<table>z<tbody>z</tbody>z</table>r_   )rN   r   ri   )r   r  r   r  
html_partss        r   r  z%LLMTableExtraction._create_chunk_html#  st      [
k*)$)$*%k**%wwz""r   r	  r7   c                     |S )zm
        Rebalance chunks to ensure minimum rows per chunk.
        Merge small chunks if necessary.
        r!   )r   r	  r7   s      r   r  z$LLMTableExtraction._rebalance_chunks7  s	     r   r  chunk_indextotal_chunksr   c                    | j                   r| j                  dd|dz    d|        d}|sd}d|dz    d| d	| d
t        |       d	}t        d| j                  dz         D ]!  }	 | j                   r-|dkD  r(| j                  dd| d| j                   d|dz           t        | j                  j                  | j                  dz   |z   | j                  j                  | j                  j                  d| j                  j                  | j                  j                  | j                  j                  | j                  	      }|r;|j                  r.|j                  d   j                   j"                  }	t%        |	t&              rt)        j*                  |	      }
n|	}
t%        |
t,              r!d|
v r|
d   }
nd|
v r|
d   }
nd|
v r|
d   }
n|
g}
t%        |
t.              rXt1        |
      dk(  rJt%        |
d   t.              r7|
d   }
t%        |
t.              r"t1        |
      dk(  rt%        |
d   t.              r7t%        |
t.              s|
g}
|
D ]+  }| j3                  |      s|| j5                  |      dc c S  |ddc S $ |ddS # t6        $ ru}| j                   r$| j                  dd|dz    dt'        |              || j                  k  rt9        j:                  d       Y d}~|dt'        |      dcY d}~c S d}~ww xY w)z6
        Process a single chunk with the LLM.
        r   zProcessing chunk r   r   r_   zs
IMPORTANT: This table has NO headers. Return an empty array for 'headers' field and extract all rows as data rows.z6Extract table data from this HTML chunk.
This is part z of zA of a larger table.
Focus on extracting the data rows accurately.z


```html
r   r   z for chunk r   Tr   r   r   rR   r   )r  rS   NrH   zError processing chunk r   r  rS   rH   )r   r*   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r%   r   r   r1   r   r   r   r   rB   r   r   rO   r   r   )r   r  r  r  r   header_contextchunk_promptr   r   r   rP   rS   rV   s                r   _process_chunkz!LLMTableExtraction._process_chunk@  s&    <<IIf 1+/1B!L>RS  TNAod<. 1..<-= > z  MP Q 23 ?	XG>X<<GaKIIfwiq@PP[\gjk\k[l&mn:!__55*.*F*F*OR^*^"oo77!__55"&#AA!%!E!E'+'Q'Q#
  0 0&..q199AAG "'3/&*jj&9&- "+t4#{2*5h*?K%4*5h*?K#{2*5f*=K+6-K %[$7C<LPQ<QV`almnaoquVv&1!n %[$7C<LPQ<QV`almnaoquVv &k48'2m "- 99%@/:)-)B)B5)I$  ,7FFk?	XB  +T::  X<<IIg)@q@QQSTWXYTZS['\]T^^+JJqM+6PSTUPVWWXs8   'GJ:)J$J<J	L
AL/L=L
L
chunk_resultsc           	         |j                  d        |D cg c]  }|j                  d      s| }}|sg S |d   d   j                         }g }|D ]*  }|d   }|j                  dg       }|j                  |       , ||d<   t	        |      |d   d<   d|d   d	<   t	        |      |d   d
<   | j
                  r+| j                  ddt	        |       dt	        |       d       |gS c c}w )zI
        Merge results from multiple chunks into a single table.
        c                 &    | j                  dd      S )Nr  r   )r   )xs    r   <lambda>z9LLMTableExtraction._merge_chunk_results.<locals>.<lambda>  s    }a)@ r   )keyrS   r   r=   r   r   Tchunkedchunk_countr   zMerged z chunks into table with z rows)sortr   copyr   rB   r   r*   )	r   r  rvalid_chunksmerged_tabler   chunk_resultrS   r=   s	            r   _merge_chunk_resultsz'LLMTableExtraction._merge_chunk_results  s   
 	@A $1CaAEE'NCCI $Aw/446 ( 	"L )E99VR(DOOD!		"  (V 14HZ -.2Z +25l2CZ /<<IIfL(9'::RSVW_S`Raafgh~5 Ds
   C)C)c                    | j                   r| j                  ddt        |       d       | j                  |      \  }}| j                   r| j                  ddt        |       d       t        |      dk(  rN| j                   r| j                  dd       | j	                  |d   dd|      }|j                  d	      r|d	   gS g S | j                   r,| j                  dd
t        |       d| j                   d       g }t        | j                        5 }t        |      D ci c].  \  }}|j                  | j                  ||t        |      |      |0 }	}}t        |	      D ]\  }
|	|
   }	 |
j                  d      }| j                   r%| j                  dd|dz    dt        |       d       |j                  |       ^ 	 ddd       | j                   r| j                  dd       | j                  |      S c c}}w # t        $ re}| j                   r0| j                  dd|dz    dt        |       dt        |              |j                  |dt        |      d       Y d}~d}~ww xY w# 1 sw Y   xY w)zH
        Extract tables using chunking and parallel processing.
        r   z-Starting chunked extraction for content with z charactersr  z chunk(s) for processingr   z6Processing as single chunk (no parallelization needed)r   rS   zProcessing z" chunks in parallel (max workers: ))max_workers<   )timeoutzChunk r   z completed successfullyrH   z processing failed: Nr  z(All chunks processed, merging results...)r   r*   rB   r  r  r   r   r   	enumeratesubmitr   r   rN   rO   r1   r*  )r   r   r	  r   r   r  executorr   chunkfuturesfuturer  rV   s                r   r   z)LLMTableExtraction._extract_with_chunking  sy    <<IIf McR^N_M``klm #77E<<IIfV5MNOv;!||		&"Z[((Aq+FF(.

7(;F7O$CC <<IIfCK=8Z[_[s[sZttuvwD,D,DE 	g !*& 1Au  3 3UAs6{KXZ[[G  'w/ 
g%fog#]]2]6F||		&F;?2C1S[MQh*ij!((0
g	g( <<IIf HJ ((77- ! g||		'VK!O3DAc&k]Rfgjklgmfn+op!((t^abc^d)effg	g 	gsJ   I5&3G>I50AHI5>I5	I2AI-'I5-I22I55I>c           
         |}|j                  d      r
d|dd  d}nr|j                  d      r
d|dd  d}nWd|v r%|j                  d      }|d	   }|d   }d
| d| d}n.d|v r%|j                  d      }|d	   }|d   }d
| d| d}nd
| }	 |j                  |      S # t        $ r-}| j	                  dd| dt        |              g cY d}~S d}~ww xY w)am  
        Convert CSS selector to XPath and select elements.
        This is a basic implementation - for complex CSS selectors, 
        consider using cssselect library.
        
        Args:
            element: Root element to search from
            css_selector: CSS selector string
            
        Returns:
            List of selected elements
        #z
.//*[@id='r   Nz'].z.//*[contains(@class, 'z')]r   z.//z[contains(@class, 'z[@id='r   z&XPath conversion failed for selector 'z': )rr   splitrJ   rO   r*   r1   )	r   r   r   rJ   partselement_name
class_nameid_valuerV   s	            r   r   z'LLMTableExtraction._css_to_xpath_select  s,     ""3' ab!1 2"5E$$S)-l12.>-?sCEL  &&s+E 8LqJ,'::,cJEL  &&s+E 8LQxH,vhZr:E ,(E	=='' 	IIi#I,WZ[^_`[aZb!cdI	s   B" "	C+"CCCrS   c                     t        |t              syd|vsd|vry|j                  d      }t        |t              syt        |t              r`t	        |      dk(  rRt        |d   t              r?|d   |d<   |d   }t        |t              r"t	        |      dk(  rt        |d   t              r?|j                  d      }t        |t              syg }|D ]{  }t        |t              rXt	        |      dk(  rJt        |d   t              r7|d   }t        |t              r"t	        |      dk(  rt        |d   t              r7|j                  |       } ||d<   |j                  dg       D ]  }t        |t              r y y)z
        Validate that the table has the required structure.
        
        Args:
            table: Table dictionary to validate
            
        Returns:
            True if valid, False otherwise
        Fr>   r=   r   r   T)r   r   r   r   rB   rN   )r   rS   r>   r=   cleaned_rowsrE   s         r   r   z,LLMTableExtraction._validate_table_structure  so    %& E!V5%8 ))I&'4( $'CLA,=*WUVZY]B^&qzE)I&G $'CLA,=*WUVZY]B^
 yy $%  	%CS$'CHMjQQU>V!f S$'CHMjQQU>V$		%
 %f 99VR( 	Cc4(	 r   c                 ,   |j                  dg       |j                  dg       |j                  dd      |j                  dd      |j                  di       d}|d   si |d<   |d   }d|vrt        |d         |d<   d	|vrt        |d         |d	<   d
|vrt        |d         |d
<   t        |d         }|dkD  rZt        |d         D ]I  \  }}t        |      |k  r|dg|t        |      z
  z  z   |d   |<   0t        |      |kD  s?|d| |d   |<   K |S )z
        Ensure the table has all required fields with proper defaults.
        
        Args:
            table: Table dictionary to format
            
        Returns:
            Properly formatted table dictionary
        r>   r=   r   r_   rg   r   r   r   r   r   r   N)r   rB   r   r0  )r   rS   formatted_tabler   rU   r   rE   s          r   r   z'LLMTableExtraction._ensure_table_formatO  sU    yyB/IIfb)yyB/yyB/		*b1
 z**,OJ' #:.h&$'(?$@H[!)'*?9+E'FH^$(&*?9+E&FH]# 	23	q=#OF$;< A3s8i'14ty3s8?S7T1TOF+A.X	)14ZiOF+A.A r   )NNr]   Ti  rp   r   F)T)r,   r-   r.   r/   r   r   r   r1   r   r   r   r   r0   r   r   r   r#   r   r   r
   r  r  r  r  r  r*  r   r   r   r   r   r   s   @r   r   r   @  s   f`R 48/3"#)-.2+-,-!&)7%i0)7'})7  )7 #'	)7
 ),)7 &))7 '*)7 )7V`emm `$tCQTH~BV `DS S  C D ;@S ;@U4CVX\]b]j]jXkmqrwrr  nA  CG  DG  >H ;@zF# F#tCy$9O F#P#c #d3i #V^_bVc #hk #(S	 S T#Y X; X;3 X;c X;`d X;ptuxz}u}p~ X;t"$tCH~2F "4PTUXZ]U]P^K_ "H183 184S#X;O 18f+EMM + +QUV[VcVcQd +Z/t / /b+$ +4S> +r   r   )r/   abcr   r   typingr   r   r   r   r	   r
   lxmlr   rer   typesr   r   utilsr   r   r   concurrent.futuresr   r   r   r   r   r3   r   r   r!   r   r   <module>rI     si    $ : :  	  / A 	 ?  *?c *?Zf
4 f
R/ *z0 zr   