
    ֩i4              	          d Z ddlZddlmc mZ ddlZddlm	Z	 ddl
mZmZ ddlmZmZmZmZmZmZ 	 	 ddededed	efd
Zded	e	fdZd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z d Z!d Z"d Z#d Z$d Z%d Z&d Z'd Z(d Z)y) u>  
Story 2.07 — Extractor Integration Tests
==========================================
19 tests covering all extractor functions.
All fixtures use pre-built HTML strings — no network calls.

# VERIFICATION_STAMP
# Story: 2.07
# Verified By: parallel-builder
# Verified At: 2026-02-26
# Tests: 19/19
# Coverage: 100%
    NBeautifulSoup)FetchedPageExtractedContent)extract_from_htmlextract_headingsextract_code_blocksextract_tablesextract_with_readabilityextract_batchhtmlurlcontent_typereturnc                 $    t        || d|i d      S )N   z2026-02-26T00:00:00Z)r   r   status_coder   headers
fetched_at)r   )r   r   r   s      ?/mnt/e/genesis-system/tests/kb/test_m2_extractor_integration.py
_make_pager   "   s!    !)     c                     t        | d      S )Nlxmlr   )r   s    r   _soupr   .   s    v&&r   c                     d} t        |       }t        |      }t        |t              }|sddt	        j
                         v st        j                  t              rt        j                  t              nddt	        j
                         v st        j                  |      rt        j                  |      nddt	        j
                         v st        j                  t              rt        j                  t              ndt        j                  |      dz  }t        t        j                  |            d}|j                  }d}||k(  }|st        j                  d	|fd
||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd|iz  }t        t        j                  |            dx}x}}d}	|j                  }|	|v }
|
st        j                  d|
fd|	|f      t        j                  |	      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      dz  }dd|iz  }t        t        j                  |            dx}	x}
}|j                  }|j                  }||k(  }|st        j                  d	|fd||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      dz  }dd|iz  }t        t        j                  |            dx}x}}y)z6Basic HTML page produces clean text and correct title.zp<html>
    <head><title>Hello World</title></head>
    <body><p>This is a test paragraph.</p></body>
    </html>5assert %(py4)s
{%(py4)s = %(py0)s(%(py1)s, %(py2)s)
}
isinstanceresultr   py0py1py2py4NzHello World==z-%(py2)s
{%(py2)s = %(py0)s.title
} == %(py5)sr!   r#   py5assert %(py7)spy7ztest paragraphinz,%(py1)s in %(py5)s
{%(py5)s = %(py3)s.text
}r"   py3r)   )zD%(py2)s
{%(py2)s = %(py0)s.url
} == %(py6)s
{%(py6)s = %(py4)s.url
}page)r!   r#   r$   py6assert %(py8)spy8)r   r   r   r   @py_builtinslocals
@pytest_ar_should_repr_global_name	_safereprAssertionError_format_explanationtitle_call_reprcomparetextr   )r   r1   r   @py_assert3@py_format5@py_assert1@py_assert4@py_format6@py_format8@py_assert0@py_assert2@py_assert5@py_format7@py_format9s                 r   test_extract_simple_htmlrJ   6   s   D dDt$Ff.////////:///://////f///f//////.///.//////////<<(=(<=((((<=((((((6(((6(((<(((=(((((((*v{{*{****{*********v***v***{*******::!!:!!!!:!!!!!!6!!!6!!!:!!!!!!!!!!!!!!!!!!!r   c                     d} t        |       }t        |      }d}|j                  }||v}|st        j                  d|fd||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      ndt        j
                  |      dz  }dd|iz  }t        t        j                  |            d	x}x}}d
}|j                  }||v}|st        j                  d|fd||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      ndt        j
                  |      dz  }dd|iz  }t        t        j                  |            d	x}x}}d}|j                  }||v }|st        j                  d|fd||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      ndt        j
                  |      dz  }dd|iz  }t        t        j                  |            d	x}x}}y	)z=HTML with <script>/<style> tags produces text without JS/CSS.z<html>
    <head>
        <title>Strip Test</title>
        <style>body { color: red; }</style>
    </head>
    <body>
        <script>alert('xss')</script>
        <p>Real content here.</p>
    </body>
    </html>alertnot inz0%(py1)s not in %(py5)s
{%(py5)s = %(py3)s.text
}r   r/   r*   r+   Nz
color: redzReal contentr,   r.   r   r   r>   r7   r=   r9   r5   r6   r8   r:   r;   r   r1   r   rE   rB   rF   rC   rD   s           r   test_extract_strips_scriptsrR   E   sf   	D dDt$F%&++%7+%%%%7+%%%7%%%%%%&%%%&%%%+%%%%%%%*v{{*<{****<{***<******v***v***{*******(V[[(>[((((>[(((>((((((V(((V((([(((((((r   c                     t        d      } t        |       }t        |t              }|sddt	        j
                         v st        j                  t              rt        j                  t              nddt	        j
                         v st        j                  |      rt        j                  |      nddt	        j
                         v st        j                  t              rt        j                  t              ndt        j                  |      dz  }t        t        j                  |            d}|j                  }d}||k(  }|st        j                  d|fd	||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      d
z  }dd|iz  }t        t        j                  |            dx}x}}|j                  }d}||k(  }|st        j                  d|fd||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      d
z  }dd|iz  }t        t        j                  |            dx}x}}y)z8Empty HTML does not crash; returns empty text and title. r   r   r   r   r    Nr%   r'   r(   r*   r+   )z,%(py2)s
{%(py2)s = %(py0)s.text
} == %(py5)s)r   r   r   r   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   )r1   r   r?   r@   rA   rB   rC   rD   s           r   test_extract_empty_htmlrU   Y   sz   b>Dt$Ff.////////:///://////f///f//////.///.//////////<<2<2<266<2;;";";"66;"r   c                     d} t        |       }t        |      }|j                  }d}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd|iz  }t        t        j                  |            d	x}x}}y	)
z7<title> tag is preferred over <h1> for the title field.zu<html>
    <head><title>Page Title</title></head>
    <body><h1>Different H1</h1><p>Body text.</p></body>
    </html>z
Page Titler%   r'   r   r(   r*   r+   Nr   r   r<   r7   r=   r5   r6   r8   r9   r:   r;   r   r1   r   rA   rB   r?   rC   rD   s           r   test_title_from_title_tagrY   c   s    D dDt$F<<'<'<<''''<<''''''6'''6'''<'''<'''''''r   c                     d} t        |       }t        |      }|j                  }d}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd|iz  }t        t        j                  |            d	x}x}}y	)
z1When there is no <title>, the first <h1> is used.z^<html>
    <head></head>
    <body><h1>Fallback Title</h1><p>Body text.</p></body>
    </html>zFallback Titler%   r'   r   r(   r*   r+   NrW   rX   s           r   test_title_from_h1r[   o   s    D dDt$F<<+++<+++++<+++++++6+++6+++<+++++++++++r   c                     d} t        |       }t        |      }d}|j                  }||v}|st        j                  d|fd||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      ndt        j
                  |      dz  }dd|iz  }t        t        j                  |            d	x}x}}d
}|j                  }||v}|st        j                  d|fd||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      ndt        j
                  |      dz  }dd|iz  }t        t        j                  |            d	x}x}}d}|j                  }||v }|st        j                  d|fd||f      t        j
                  |      dt        j                         v st        j                  |      rt        j
                  |      ndt        j
                  |      dz  }dd|iz  }t        t        j                  |            d	x}x}}y	)z;<nav> and <footer> content is excluded from extracted text.z<html>
    <head><title>Clean Page</title></head>
    <body>
        <nav>Menu item one | Menu item two</nav>
        <p>Actual article content.</p>
        <footer>Copyright 2026 Sunaiva</footer>
    </body>
    </html>z	Menu itemrM   rO   r   r/   r*   r+   NzCopyright 2026zActual article contentr,   r.   rP   rQ   s           r   test_nav_footer_removalr]   {   sl   D dDt$F)fkk);k))));k)));))))))f)))f)))k))))))).6;;.;....;.........6...6...;.......#2v{{2#{2222#{222#222222v222v222{2222222r   c                  P   d} t        t        |             }t        |      }d}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  t              rt        j                  t              nddt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd	|iz  }t        t        j                  |            d
x}x}}|d   }d}||k(  }|slt        j                  d|fd||f      t        j                  |      t        j                  |      dz  }	dd|	iz  }t        t        j                  |            d
x}x}}|d   }d}||k(  }|slt        j                  d|fd||f      t        j                  |      t        j                  |      dz  }	dd|	iz  }t        t        j                  |            d
x}x}}|d   }d}||k(  }|slt        j                  d|fd||f      t        j                  |      t        j                  |      dz  }	dd|	iz  }t        t        j                  |            d
x}x}}y
)u9   H1→H2→H3 nesting produces correct breadcrumb entries.z_<html><body>
    <h1>Introduction</h1>
    <h2>Setup</h2>
    <h3>Linux</h3>
    </body></html>   r%   z0%(py3)s
{%(py3)s = %(py0)s(%(py1)s)
} == %(py6)slenheadingsr!   r"   r0   r2   r3   r4   Nr   zH1: Introduction)z%(py1)s == %(py4)sr"   r$   assert %(py6)sr2      zH1: Introduction > H2: Setup   z(H1: Introduction > H2: Setup > H3: Linuxr   r   ra   r7   r=   r5   r6   r8   r9   r:   r;   
r   rb   rF   rG   rB   rH   rI   rE   r?   r@   s
             r   test_heading_hierarchyrj      s   D
  d,Hx=A=A=A33xx=AA;,,,;,,,,,;,,,,;,,,,,,,,,,,A;888;88888;8888;88888888888A;DDD;DDDDD;DDDD;DDDDDDDDDDDr   c                  |   d} t        t        |             }g }||k(  }|st        j                  d|fd||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      dz  }dd|iz  }t        t        j                  |            dx}}y)	z2A page with no heading tags returns an empty list.z2<html><body><p>No headings here.</p></body></html>r%   z%(py0)s == %(py3)srb   r!   r0   assert %(py5)sr)   N)
r   r   r7   r=   r5   r6   r8   r9   r:   r;   )r   rb   rF   rA   @py_format4rC   s         r   test_no_headingsrp      sm    ?Dd,H8r>8r88rr   c                  J   d} t        t        |             }t        |      }d}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  t              rt        j                  t              nddt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd	|iz  }t        t        j                  |            d
x}x}}d}|d   }||v }|slt        j                  d|fd||f      t        j                  |      t        j                  |      dz  }	dd|	iz  }t        t        j                  |            d
x}x}}d}|d   }||v}|slt        j                  d|fd||f      t        j                  |      t        j                  |      dz  }	dd|	iz  }t        t        j                  |            d
x}x}}d}|d   }||v}|slt        j                  d|fd||f      t        j                  |      t        j                  |      dz  }	dd|	iz  }t        t        j                  |            d
x}x}}y
)zAHeadings containing inner <a> or <strong> yield clean plain text.zZ<html><body>
    <h2><a href="/x"><strong>Nested</strong> Link</a></h2>
    </body></html>rf   r%   r`   ra   rb   rc   r3   r4   NzNested Linkr   r,   z%(py1)s in %(py4)srd   re   r2   z<arM   )z%(py1)s not in %(py4)sz<strongrh   ri   s
             r   test_heading_text_cleanuprs      s   D  d,Hx=A=A=A33xx=A'HQK'=K''''=K'''='''K'''''''"x{"4{""""4{"""4"""{"""""""'HQK'9K''''9K'''9'''K'''''''r   c                     d} t        t        |             }t        |      }d}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  t              rt        j                  t              nddt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd	|iz  }t        t        j                  |            d
x}x}}|d   }|j                  }d} ||      }|stdt        j                  |      t        j                  |      t        j                  |      t        j                  |      dz  }	t        t        j                  |	            d
x}x}x}}d}|d   }
||
v }|slt        j                  d|fd||
f      t        j                  |      t        j                  |
      dz  }dd|iz  }t        t        j                  |            d
x}x}}
|d   }|j                  }d} ||      }|stdt        j                  |      t        j                  |      t        j                  |      t        j                  |      dz  }	t        t        j                  |	            d
x}x}x}}y
)zC<pre><code class="language-python"> is extracted with python fence.zc<html><body>
    <pre><code class="language-python">print("hello")
</code></pre>
    </body></html>rf   r%   r`   ra   blocksrc   r3   r4   Nr   z	```pythonLassert %(py7)s
{%(py7)s = %(py3)s
{%(py3)s = %(py1)s.startswith
}(%(py5)s)
}r"   r0   r)   r+   zprint("hello")r,   rr   rd   re   r2   ```zJassert %(py7)s
{%(py7)s = %(py3)s
{%(py3)s = %(py1)s.endswith
}(%(py5)s)
})r	   r   ra   r7   r=   r5   r6   r8   r9   r:   r;   
startswithendswith)r   ru   rF   rG   rB   rH   rI   rE   @py_assert6rD   r?   r@   s               r   test_code_block_extractionr|      s   D !t-Fv;!;!;!33vv;!!9,9,,,,,,,9,,,,,,,,,,,,,,,,(vay(y((((y((((((y(((((((!9$9$e$e$$$$$9$$$$$$e$$$$$$$$$$r   c                     d} t        t        |             }t        |      }d}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  t              rt        j                  t              nddt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd	|iz  }t        t        j                  |            d
x}x}}|d   }|j                  }d} ||      }|stdt        j                  |      t        j                  |      t        j                  |      t        j                  |      dz  }	t        t        j                  |	            d
x}x}x}}|d   }|j                  }d} ||      }|stdt        j                  |      t        j                  |      t        j                  |      t        j                  |      dz  }	t        t        j                  |	            d
x}x}x}}|d   }|j                  }d} ||      }|stdt        j                  |      t        j                  |      t        j                  |      t        j                  |      dz  }	t        t        j                  |	            d
x}x}x}}y
)z3Three <pre><code> blocks yield exactly three items.z<html><body>
    <pre><code class="language-js">console.log(1)</code></pre>
    <pre><code class="language-bash">echo hi</code></pre>
    <pre><code>raw block</code></pre>
    </body></html>r_   r%   r`   ra   ru   rc   r3   r4   Nr   z```jsrv   rw   rf   z```bashrg   rx   )r	   r   ra   r7   r=   r5   r6   r8   r9   r:   r;   ry   )
r   ru   rF   rG   rB   rH   rI   rE   r{   rD   s
             r   test_multiple_code_blocksr~      s   D
 !t-Fv;!;!;!33vv;!!9(9(((((((9((((((((((((((((!9*9*	*	*****9******	**********!9&9&&&&&&&9&&&&&&&&&&&&&&&&r   c                  |   d} t        t        |             }g }||k(  }|st        j                  d|fd||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      dz  }dd|iz  }t        t        j                  |            dx}}y)	zCInline <code> tags (outside <pre>) are NOT included in code_blocks.zO<html><body>
    <p>Use <code>print()</code> to display.</p>
    </body></html>r%   rl   ru   rm   rn   r)   N)
r	   r   r7   r=   r5   r6   r8   r9   r:   r;   )r   ru   rF   rA   ro   rC   s         r   test_inline_code_excludedr      sq    D !t-F6R<6R66Rr   c                     d} t        t        |             }t        |      }d}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  t              rt        j                  t              nddt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd	|iz  }t        t        j                  |            d
x}x}}|d   j                         }d}|d   }	||	v }|slt        j                  d|fd||	f      t        j                  |      t        j                  |	      dz  }
dd|
iz  }t        t        j                  |            d
x}x}}	g }d}|d   }||v }|}|rd}|d   }||v }|}|rd}|d   }||v }|}|s_t        j                  d|fd||f      t        j                  |      t        j                  |      dz  }dd	|iz  }|j                  |       |rt        j                  dfdf      t        j                  |      t        j                  |      dz  }dd|iz  }|j                  |       |r_t        j                  dfdf      t        j                  |      t        j                  |      dz  }dd|iz  }|j                  |       t        j                  |d      i z  }d d!|iz  }t        t        j                  |            d
x}x}x}x}x}x}x}x}x}x}}d"}|d   }	||	v }|slt        j                  d|fd||	f      t        j                  |      t        j                  |	      dz  }
dd|
iz  }t        t        j                  |            d
x}x}}	y
)#z0A 3x3 table is converted to pipe-delimited text.z<html><body>
    <table>
        <tr><td>A</td><td>B</td><td>C</td></tr>
        <tr><td>1</td><td>2</td><td>3</td></tr>
        <tr><td>4</td><td>5</td><td>6</td></tr>
    </table>
    </body></html>rf   r%   r`   ra   tablesrc   r3   r4   Nr   |r,   rr   rd   re   r2   ABC)z%(py3)s in %(py6)s)r0   r2   z%(py8)s)z%(py11)s in %(py14)s)py11py14z%(py16)spy16)z%(py19)s in %(py22)s)py19py22z%(py24)spy24zassert %(py27)spy27---)r
   r   ra   r7   r=   r5   r6   r8   r9   r:   r;   
splitlinesappend_format_boolop)r   r   rF   rG   rB   rH   rI   linesrE   r?   r@   rA   @py_assert10@py_assert13@py_assert12@py_assert18@py_assert21@py_assert20@py_format15@py_format17@py_format23@py_format25@py_format26@py_format28s                           r   test_simple_tabler      sH   D E$K(Fv;!;!;!33vv;!1I  "E%(3(?3(3(B3B%(B3(?BsBeAhBshB3B%(B3(?BBBB3(BBB3BBB(BBBBBBBshBBBsBBBhBBBBBBB3(BBB3BBB(BBBBBBBBBBBBBBBE!H5H5H5Hr   c                     d} t        t        |             }t        |      }d}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  t              rt        j                  t              nddt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd	|iz  }t        t        j                  |            d
x}x}}|d   }|j                         }d}	|d   }
|	|
v }|slt        j                  d|fd|	|
f      t        j                  |	      t        j                  |
      dz  }dd|iz  }t        t        j                  |            d
x}	x}}
d}	|d   }
|	|
v }|slt        j                  d|fd|	|
f      t        j                  |	      t        j                  |
      dz  }dd|iz  }t        t        j                  |            d
x}	x}}
d}	|d   }
|	|
v }|slt        j                  d|fd|	|
f      t        j                  |	      t        j                  |
      dz  }dd|iz  }t        t        j                  |            d
x}	x}}
d}	|	|v }|st        j                  d|fd|	|f      t        j                  |	      dt        j                         v st        j                  |      rt        j                  |      nddz  }dd|iz  }t        t        j                  |            d
x}	}d}	|	|v }|st        j                  d|fd|	|f      t        j                  |	      dt        j                         v st        j                  |      rt        j                  |      nddz  }dd|iz  }t        t        j                  |            d
x}	}y
)z><thead> row is treated as header with separator line after it.a  <html><body>
    <table>
        <thead>
            <tr><th>Name</th><th>Age</th></tr>
        </thead>
        <tbody>
            <tr><td>Alice</td><td>30</td></tr>
            <tr><td>Bob</td><td>25</td></tr>
        </tbody>
    </table>
    </body></html>rf   r%   r`   ra   r   rc   r3   r4   Nr   Namer,   rr   rd   re   r2   Ager   Alicez%(py1)s in %(py3)sr>   r"   r0   rn   r)   Bob)r
   r   ra   r7   r=   r5   r6   r8   r9   r:   r;   r   )r   r   rF   rG   rB   rH   rI   r>   r   rE   r?   r@   ro   rC   s                 r   test_table_with_headersr      sT   
D E$K(Fv;!;!;!33vv;!!9DOOEU1X6X6X6XE!H5H5H5HE!H5H5H5H7d?7d7dd5D=5D5DDr   c                     dj                  d t        dd      D              } d|  d}t        |      }d}||v }|st        j                  d|fd	||f      t        j
                  |      d
t        j                         v st        j                  |      rt        j
                  |      nd
dz  }dd|iz  }t        t        j                  |            dx}}t        |      }d}||kD  }|st        j                  d|fd||f      dt        j                         v st        j                  t              rt        j
                  t              ndd
t        j                         v st        j                  |      rt        j
                  |      nd
t        j
                  |      t        j
                  |      dz  }	dd|	iz  }
t        t        j                  |
            dx}x}}y)z:A long article page produces meaningful main-content text.
c              3   (   K   | ]
  }d | d  yw)z<p>Paragraph zI: Genesis is an autonomous agentic system built for revenue at scale.</p>N ).0is     r   	<genexpr>z4test_readability_extracts_article.<locals>.<genexpr>  s"       scds   rf      z<html>
    <head><title>Article</title></head>
    <body>
        <nav>Sidebar | Nav links | More nav</nav>
        <article>
            <h1>Main Article Title</h1>
            zj
        </article>
        <footer>Footer content that should not appear</footer>
    </body>
    </html>z'Genesis is an autonomous agentic systemr,   r   r>   r   rn   r)   Nd   )>)z/%(py3)s
{%(py3)s = %(py0)s(%(py1)s)
} > %(py6)sra   rc   r3   r4   )joinranger   r7   r=   r9   r5   r6   r8   r:   r;   ra   )
paragraphsr   r>   rE   rF   ro   rC   rG   rB   rH   rI   s              r   !test_readability_extracts_articler     s2     q" J L 
D $D)D 5<4<<<<4<<<4<<<<<<<<<<<<<<<<t9s9s?9s33tt9sr   c                     d} t        |       }g }d}||v }|}|st        |      }d}||k\  }|}|st        j                  d|fd||f      t        j                  |      dt        j                         v st        j                  |      rt        j                  |      nddz  }	dd	|	iz  }
|j                  |
       |st        j                  d
fdf      dt        j                         v st        j                  t              rt        j                  t              nddt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd|iz  }|j                  |       t        j                  |d      i z  }dd|iz  }t        t        j                  |            dx}x}x}x}x}x}}y)zFWhen readability produces empty output, body text is returned instead.z7<html><body><p>Fallback text content.</p></body></html>zFallback text contentr   r,   )z%(py3)s in %(py5)sr>   )r0   r)   z%(py7)sr+   )>=)z4%(py12)s
{%(py12)s = %(py9)s(%(py10)s)
} >= %(py15)sra   )py9py10py12py15z%(py17)spy17rf   zassert %(py20)spy20N)r   ra   r7   r=   r9   r5   r6   r8   r   r   r:   r;   )r   r>   rA   rF   rB   rE   @py_assert11@py_assert14r   rC   rD   @py_format16@py_format18@py_format19@py_format21s                  r   test_readability_fallbackr   /  s    ED#D)D ="<"d*<c$i<1<i1n<<<<"d<<<"<<<<<<d<<<d<<<<<<<i1<<<<<<c<<<c<<<<<<$<<<$<<<i<<<1<<<<<<<<<<<<<<r   c            
         t        dd      D  cg c]  } t        d|  d|  dd|         }} t        |      }t        |      }d}||k(  }|st	        j
                  d	|fd
||f      dt        j                         v st	        j                  t              rt	        j                  t              nddt        j                         v st	        j                  |      rt	        j                  |      ndt	        j                  |      t	        j                  |      dz  }dd|iz  }t        t	        j                  |            dx}x}}t        |d      D ]  \  } }t        |t              }	|	sddt        j                         v st	        j                  t              rt	        j                  t              nddt        j                         v st	        j                  |      rt	        j                  |      nddt        j                         v st	        j                  t              rt	        j                  t              ndt	        j                  |	      dz  }
t        t	        j                  |
            d}	|j                  }d|  }||k(  }	|	st	        j
                  d	|	fd||f      dt        j                         v st	        j                  |      rt	        j                  |      ndt	        j                  |      t	        j                  |      dz  }dd|iz  }t        t	        j                  |            dx}x}	} yc c} w )z>Three valid HTML pages produce three ExtractedContent results.rf      z<html><head><title>Page z</title></head><body><p>Body z.</p></body></html>zhttps://example.com/r   r_   r%   r`   ra   resultsrc   r3   r4   N)startr   r   r   r   r    zPage r'   r(   r*   r+   )r   r   r   ra   r7   r=   r5   r6   r8   r9   r:   r;   	enumerater   r   r<   )r   pagesr   rF   rG   rB   rH   rI   r   r?   r@   rA   rC   rD   s                 r   test_batch_all_goodr   =  s   
 q!  	-aS0MaSPcd-aS1	3E 
 E"Gw<1<1<133ww<1wa0 +	6&"233333333z333z333333&333&333333"2333"23333333333||*qc{*|{****|{******v***v***|***{*******+s   Mc                  r   t        dd      } t        ddd      }t        dd	      }t        | ||g      }t        |      }d
}||k(  }|st        j                  d|fd||f      dt        j                         v st        j                  t              rt        j                  t              nddt        j                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      dz  }dd|iz  }t        t        j                  |            dx}x}}|d   }	t        |	t              }|sddt        j                         v st        j                  t              rt        j                  t              ndt        j                  |	      dt        j                         v st        j                  t              rt        j                  t              ndt        j                  |      dz  }
t        t        j                  |
            dx}	}|d   }d}||u }|slt        j                  d|fd||f      t        j                  |      t        j                  |      dz  }dd|iz  }t        t        j                  |            dx}x}}|d   }	t        |	t              }|sddt        j                         v st        j                  t              rt        j                  t              ndt        j                  |	      dt        j                         v st        j                  t              rt        j                  t              ndt        j                  |      dz  }
t        t        j                  |
            dx}	}y)zIA page with non-HTML content_type returns None; good pages still succeed.zD<html><head><title>Good</title></head><body><p>OK.</p></body></html>zhttps://example.com/goodr   znot htmlzhttps://example.com/pdfzapplication/pdf)r   r   zN<html><head><title>Also Good</title></head><body><p>Also OK.</p></body></html>zhttps://example.com/also-goodr_   r%   r`   ra   r   rc   r3   r4   Nr   z5assert %(py5)s
{%(py5)s = %(py0)s(%(py2)s, %(py3)s)
}r   r   )r!   r#   r0   r)   rf   )is)z%(py1)s is %(py4)srd   re   r2   rg   )r   r   ra   r7   r=   r5   r6   r8   r9   r:   r;   r   r   )	good_pagenon_html_pageanother_goodr   rF   rG   rB   rH   rI   rA   rC   rE   r?   r@   s                 r   test_batch_with_failurer   L  s&   N&I %&M
 X+L
 Y|DEGw<1<1<133ww<1aj3:j"233333333:333:333j333333"2333"233333333331::::aj3:j"233333333:333:333j333333"2333"23333333333r   c                  f   t        g       } g }| |k(  }|st        j                  d|fd| |f      dt        j                         v st        j
                  |       rt        j                  |       ndt        j                  |      dz  }dd|iz  }t        t        j                  |            dx}}y)z.An empty list of pages produces an empty list.r%   rl   r   rm   rn   r)   N)	r   r7   r=   r5   r6   r8   r9   r:   r;   )r   rF   rA   ro   rC   s        r   test_batch_emptyr   d  sd    BG7b=7b77br   )zhttps://example.com/pagez	text/html)*__doc__builtinsr5   _pytest.assertion.rewrite	assertionrewriter7   pytestbs4r   core.kb.contractsr   r   core.kb.extractorr   r   r	   r
   r   r   strr   r   rJ   rR   rU   rY   r[   r]   rj   rp   rs   r|   r~   r   r   r   r   r   r   r   r   r   r   r   <module>r      s       ;  &@#.	S 	s 	 	3>	' ' '")(	(	,3,E
("
%'(@4=+40r   