
    /is                      d dl mZ d dlZd dlZd dlZd dlmZmZ d dlm	Z	m
Z
mZmZmZ d dlmZmZmZ d dlZd dlmZmZ d dlmZ d dlmZ d d	lmZmZmZ d dlZd dlZd
dlm Z  d
dl!m"Z" d
dl#m$Z$ d
dl%m&Z&m'Z'm(Z( d
dl)m*Z* d
dl+m,Z, d
dl-m.Z. d
dl/m0Z0 d
dl1m2Z2m3Z3m4Z4 d dl5Z5d dl6Z6d dl7Z7d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>Z>d dl?m@Z@  G d de      ZA G d deA      ZB G d deC      ZD G d deD      ZE G d  d!eD      ZF G d" d#eA      ZGy)$    )annotationsN)ABCabstractmethod)CallableDictAnyListUnion)OptionalAsyncGeneratorFinal)PageError)TimeoutError)BytesIO)Image	ImageDraw	ImageFont   )load_js_scriptAsyncCrawlResponse)SCREENSHOT_HEIGHT_TRESHOLD)BrowserConfigCrawlerRunConfigHTTPCrawlerConfig)AsyncLogger)SSLCertificate)ValidUAGenerator)BrowserManager)BrowserAdapterPlaywrightAdapterUndetectedAdapter)ClientTimeout)urlparse)MappingProxyType)partialc                  "    e Zd ZdZedd       Zy)AsyncCrawlerStrategyze
    Abstract base class for crawler strategies.
    Subclasses must implement the crawl method.
    c                   K   y wN )selfurlkwargss      [/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/crawl4ai/async_crawler_strategy.pycrawlzAsyncCrawlerStrategy.crawl)   s	     s   N)r.   strreturnr   )__name__
__module____qualname____doc__r   r1   r,       r0   r)   r)   #   s    
  r8   r)   c                     e Zd ZdZ	 d'	 	 	 	 	 d(dZd Zd Zd Zd Zd)dZ	d*d	Z
d+d
Zd,dZd-dZd.d/dZ	 d.	 	 	 	 	 d0dZd Zd1dZ	 	 	 	 	 	 d2dZ	 	 	 	 	 	 d2dZd3d4dZd5dZd Zd6dZd7dZd8dZ	 	 	 	 	 	 d9dZ	 d:	 	 	 	 	 d;dZd1dZd<dZd=dZd=dZd:d>dZ 	 	 	 	 	 	 d?d Z!	 	 	 	 	 	 d?d!Z"d" Z#d@dAd#Z$dBd$Z%dCd%Z&dDd&Z'y)EAsyncPlaywrightCrawlerStrategya  
    Crawler strategy using Playwright.

    Attributes:
        browser_config (BrowserConfig): Configuration object containing browser settings.
        logger (AsyncLogger): Logger instance for recording events and errors.
        _downloaded_files (List[str]): List of downloaded file paths.
        hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
        browser_manager (BrowserManager): Manager for browser creation and management.

        Methods:
            __init__(self, browser_config=None, logger=None, **kwargs):
                Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
            __aenter__(self):
                Start the browser and initialize the browser manager.
            __aexit__(self, exc_type, exc_val, exc_tb):
                Close the browser and clean up resources.
            start(self):
                Start the browser and initialize the browser manager.
            close(self):
                Close the browser and clean up resources.
            kill_session(self, session_id):
                Kill a browser session and clean up resources.
            crawl(self, url, **kwargs):
                Run the crawler for a single URL.

    Nc           
     "   |xs t        j                  |      | _        || _        |xs
 t	               | _        g | _        dddddddddd	| _        t        | j                  | j                  t        | j
                  t                    | _        y)a  
        Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.

        Args:
            browser_config (BrowserConfig): Configuration object containing browser settings.
                                          If None, will be created from kwargs for backwards compatibility.
            logger: Logger instance for recording events and errors.
            browser_adapter (BrowserAdapter): Browser adapter for handling browser-specific operations.
                                           If None, defaults to PlaywrightAdapter.
            **kwargs: Additional arguments for backwards compatibility and extending functionality.
        N)	on_browser_createdon_page_context_createdon_user_agent_updatedon_execution_startedon_execution_endedbefore_goto
after_gotobefore_return_htmlbefore_retrieve_html)browser_configloggeruse_undetected)r   from_kwargsrE   rF   r"   adapter_downloaded_fileshooksr    
isinstancer#   browser_manager)r-   rE   rF   browser_adapterr/   s        r0   __init__z'AsyncPlaywrightCrawlerStrategy.__init__J   s     -Q0I0I&0Q '=*;*= "$ #''+%)$("&"&$(


  ...;;%dll4EF 
r8   c                B   K   | j                          d {    | S 7 wr+   startr-   s    r0   
__aenter__z)AsyncPlaywrightCrawlerStrategy.__aenter__v        jjl 	   c                @   K   | j                          d {    y 7 wr+   closer-   exc_typeexc_valexc_tbs       r0   	__aexit__z(AsyncPlaywrightCrawlerStrategy.__aexit__z        jjl   c                   K   | j                   j                          d{    | j                  d| j                   j                  | j                   j                         d{    y7 I7 w)zG
        Start the browser and initialize the browser manager.
        Nr<   )context)rM   rR   execute_hookbrowserdefault_contextrS   s    r0   rR   z$AsyncPlaywrightCrawlerStrategy.start}   si      ""((***   ((((88   
 	
 	
 	+	
s"   A.A*AA.$A,%A.,A.c                j   K   | j                   j                          d{    dt        _        y7 w)z;
        Close the browser and clean up resources.
        N)rM   rY   r    _playwright_instancerS   s    r0   rY   z$AsyncPlaywrightCrawlerStrategy.close   s-      ""((***.2+ 	+s   313c                   K   | j                   j                  dd       | j                  j                  |       d{    y7 w)z
        Kill a browser session and clean up resources.

        Args:
            session_id (str): The ID of the session to kill.

        Returns:
            None
        zSSession auto-kill is enabled in the new version. No need to manually kill sessions.WARNINGmessagetagN)rF   warningrM   kill_session)r-   
session_ids     r0   rn   z+AsyncPlaywrightCrawlerStrategy.kill_session   sA      	i 	 	
 ""//
;;;s   <AAAc                Z    || j                   v r|| j                   |<   yt        d|       )a  
        Set a hook function for a specific hook type. Following are list of hook types:
        - on_browser_created: Called when a new browser instance is created.
        - on_page_context_created: Called when a new page context is created.
        - on_user_agent_updated: Called when the user agent is updated.
        - on_execution_started: Called when the execution starts.
        - before_goto: Called before a goto operation.
        - after_goto: Called after a goto operation.
        - before_return_html: Called before returning HTML content.
        - before_retrieve_html: Called before retrieving HTML content.

        All hooks except on_browser_created accepts a context and a page as arguments and **kwargs. However, on_browser_created accepts a browser and a context as arguments and **kwargs.

        Args:
            hook_type (str): The type of the hook.
            hook (Callable): The hook function to set.

        Returns:
            None
        Invalid hook type: N)rK   
ValueError)r-   	hook_typehooks      r0   set_hookz'AsyncPlaywrightCrawlerStrategy.set_hook   s1    * 

"$(DJJy!29+>??r8   c                   K   | j                   j                  |      }|r-t        j                  |      r ||i | d{   S  ||i |S |r|d   S dS 7 w)a4  
        Execute a hook function for a specific hook type.

        Args:
            hook_type (str): The type of the hook.
            *args: Variable length positional arguments.
            **kwargs: Keyword arguments.

        Returns:
            The return value of the hook function, if any.
        Nr   )rK   getasyncioiscoroutinefunction)r-   rs   argsr/   rt   s        r0   rc   z+AsyncPlaywrightCrawlerStrategy.execute_hook   sd      zz~~i(**40!426222T,V,,tAw(D( 3s   >A AAc                    || _         y)z
        Update the user agent for the browser.

        Args:
            user_agent (str): The new user agent string.

        Returns:
            None
        N)
user_agent)r-   r|   s     r0   update_user_agentz0AsyncPlaywrightCrawlerStrategy.update_user_agent   s     %r8   c                    || _         y)z
        Set custom headers for the browser.

        Args:
            headers (Dict[str, str]): A dictionary of headers to set.

        Returns:
            None
        N)headers)r-   r   s     r0   set_custom_headersz1AsyncPlaywrightCrawlerStrategy.set_custom_headers   s     r8   c                P  K   |j                         }|j                  d      r.|dd j                         }| j                  |||       d{   S |j                  d      r0|dd j                         }	 |j                  ||       d{    y|j                  d      s|j                  d      r| j                  |||       d{   S 	 |j                  ||       d{    y7 7 a# t        $ r3}dt        |      v rt        d| d	| d
      t        d| d
      d}~ww xY w7 d7 J# t        $ rf}dt        |      v rt        d| d	| d
      	 | j                  |d| d|       d{  7  cY d}~S # t        $ r t        d| d      w xY wd}~ww xY ww)a  
        Wait for a condition in a smart way. This functions works as below:

        1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true.
        2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present.
        3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true.
        4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present.

        This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl().
        Args:
            page: Playwright page object
            wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.
            timeout (float): Maximum time to wait in milliseconds

        Returns:
            None
        zjs:   Nzcss:   timeoutTimeoutzTimeout after zms waiting for selector ''zInvalid CSS selector: 'z()functionz() => {}zInvalid wait_for parameter: 'zp'. It should be either a valid CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.)strip
startswithcsp_compliant_waitwait_for_selectorr   r2   r   rr   )r-   pagewait_forr   js_codecss_selectores          r0   
smart_waitz)AsyncPlaywrightCrawlerStrategy.smart_wait   s    $ >>#u%qrl((*G00wHHH  (#AB<--/LP,,\7,KKK ""4(H,?,?
,K!44T8WMMM0070KKK+ I
 L PA&&(	1J<.XYZ  %'>|nA%NOOP N L  CF**,WI5NxjXYZ 	)-)@)@ $
"&=w* $ $   % ","?z JO !O# s   AF&C-(F&7C1 C/C1 9F&D0F&D4 'D2(D4 ,F&/C1 1	D-:.D((D--F&2D4 4	F#=FF7E:8F<F#=F&FFF##F&c                   K   d| d| d}	 | j                   j                  ||       d{   }|S 7 # t        $ r.}dt        |      v rt	        dt        |             Y d}~yd}~ww xY ww)a  
        Wait for a condition in a CSP-compliant way.

        Args:
            page: Playwright page object
            user_wait_function: JavaScript function as string that returns boolean
            timeout: Maximum time to wait in milliseconds

        Returns:
            bool: True if condition was met, False if timed out

        Raises:
            RuntimeError: If there's an error evaluating the condition
        z8
        async () => {
            const userFunction = z;
            const startTime = Date.now();
            try {
                while (true) {
                    if (await userFunction()) {
                        return true;
                    }
                    if (Date.now() - startTime > aR  ) {
                        return false;  // Return false instead of throwing
                    }
                    await new Promise(resolve => setTimeout(resolve, 100));
                }
            } catch (error) {
                throw new Error(`Error evaluating condition: ${error.message}`);
            }
        }
        NzError evaluating conditionz#Failed to evaluate wait condition: F)rI   evaluate	Exceptionr2   RuntimeError)r-   r   user_wait_functionr   
wrapper_jsresultr   s          r0   r   z1AsyncPlaywrightCrawlerStrategy.csp_compliant_wait'  s     """4!5 62 3: 	;	
(	<<00zBBFM C 	+s1v5"%HQ#QRR		s6   
A/5 35 A/5 	A,$A'"A/'A,,A/c                  K   |j                  d       d{   }t        |      D ]  \  }}	 |j                  d| d       d{    |j                          d{   }|rz|j	                  dd       d{    |j                  d       d{   }d	| }|j                  d
d      }| j                  j                  |d| d| d| d       d{    n | j                  j                  ddd|i        |S 7 7 7 7 7 v7 3# t        $ r5}	| j                  j                  dd|t        |	      d       Y d}	~	!d}	~	ww xY ww)a  
        Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content.

        Args:
            page: Playwright page object

        Returns:
            Playwright page object
        iframeNz"(element) => element.id = "iframe-"load0u  r   z() => document.body.innerHTMLzextracted-iframe-content-`z\`zl
                        () => {
                            const iframe = document.getElementById('iframe-zx');
                            const div = document.createElement('div');
                            div.innerHTML = `z0`;
                            div.className = 'zf';
                            iframe.replaceWith(div);
                        }
                    z1Could not access content frame for iframe {index}SCRAPEindexrk   rl   paramsz(Error processing iframe {index}: {error}ERROR)r   error)query_selector_all	enumerater   content_framewait_for_load_statereplacerI   rF   rm   r   r   r2   )
r-   r   iframesir   frameiframe_content
class_name_iframer   s
             r0   process_iframesz.AsyncPlaywrightCrawlerStrategy.process_iframesU  s     //99"7+ .	IAv-oo(J1#Q&OPPP %224433 4   
 ,1>>7, &N
 $=QC!@J -44S%@G,,//LLM3 O..5Y 7..8\ :
 
 
 KK'' S$ '| ( I.	b g :
 Q 5
&
"  !!F%&Q8 "  s   EDEDDDDD;D	<DDADD%D?EDD	DDD	E*EEEEc                B  K   | j                          d{    |j                  d      xs t        t        j                               }|j                  d| j
                        }| j                  j                  t        d||d|       d{   \  }}|S 7 7 w)au  
        Creates a new browser session and returns its ID. A browse session is a unique openned page can be reused for multiple crawls.
        This function is asynchronous and returns a string representing the session ID.

        Args:
            **kwargs: Optional keyword arguments to configure the session.

        Returns:
            str: The session ID.
        Nro   r|   )ro   r|   r,   )	rR   rw   r2   uuiduuid4r|   rM   get_pager   )r-   r/   ro   r|   r   rb   s         r0   create_sessionz-AsyncPlaywrightCrawlerStrategy.create_session  s      jjlZZ-BTZZ\1B
ZZdoo>
"22;;<L =
!!=
 =
  g
  	s"   BBA:BB
BBc           	       K   |xs t        j                  |      }i }d}d}|j                  d      r| j                  ||       d{   S |j                  d      s#|j                  d      s|j                  d      rh|j                  xs |j
                  xs |j                  xs |j                  xs| |j                  xsn |j                  xs` |j                  xsR |j                  xsD |j                  xs6 |j                  xs( |j                  xs |j                  xs |j                   }|r| j                  ||       d{   S |j                  d      rZ|dd }t"        j$                  j'                  |      st)        d|       t+        |d	d
      5 }	|	j-                         }
ddd       n|j                  d      r|dd n|dd }
t/        
||dddd      S t1        d      7 7 # 1 sw Y   +xY ww)a  
        Crawls a given URL or processes raw HTML/local file content based on the URL prefix.

        Args:
            url (str): The URL to crawl. Supported prefixes:
                - 'http://' or 'https://': Web URL to crawl.
                - 'file://': Local file path to process.
                - 'raw://': Raw HTML content to process.
            **kwargs: Additional parameters:
                - 'screenshot' (bool): Whether to take a screenshot.
                - ... [other existing parameters]

        Returns:
            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
           N)http://zhttps://zview-source:file://raw://raw:   Local file not found: rutf-8encoding   r   )htmlresponse_headersstatus_code
screenshotpdf_data
mhtml_dataget_delayed_contentz?URL must start with 'http://', 'https://', 'file://', or 'raw:')r   rH   r   
_crawl_webprocess_in_browserr   pdfcapture_mhtmlr   r   scan_full_pageremove_overlay_elementssimulate_usermagicr   capture_console_messagescapture_network_requestsospathexistsFileNotFoundErroropenreadr   rr   )r-   r.   configr/   r   r   screenshot_dataneeds_browserlocal_file_pathfr   s              r0   r1   z$AsyncPlaywrightCrawlerStrategy.crawl  s!    $ ?+77?>>ABf555^^I&#..*BcnnU[F\ )) 0!!0

0 $$0 	0
 0 %%0 ..0 $$0 0 &&0 //0 //    "__S&999 ~~i("%ab'ww~~o6+.D_DU,VWW/3A $Q668D$ $ #&..":s12wAB%!1'$(  Q _ 6. :$ $s?   AHG4	DHG7AH"G93AH7H9H>Hc                "   56K   |_         i }d}d}}g  _        g 5g }|j                  }|r| j                  _        nR|j                  s|j
                  dk(  r7 t               j                  dhi |j                  xs i  j                  _         j                  j                  |       d{   \  6}	|j                  s|j                  s|j                  r"|	j                  t        d             d{     j                  d6|	|       d{    |j                   rH5 fd}
5 fd}5 fd	}6j#                  d
|
       6j#                  d|       6j#                  d|       d}d}|j$                  rH j&                  j)                  6|       d{   } j&                  j+                  6|       d{   }	 d}|j,                  rt/        j0                        } j                  j2                  r6j#                  d fd       |j4                  s\ j                  d6|	|       d{    j7                  d      xs$ j7                  d      xs j7                  d      }|rÉj7                  d      rZdd }t8        j:                  j=                  |      st?        d|       tA        |dd      5 }|jC                         }ddd       nj7                  d      rdd ndd }6jE                  |jF                         d{    d}|jH                  xs }d}i }n"	 |jJ                  jM                  dd      rUtO        jP                  t9        jR                  d            jU                         }6jW                  d d!| d"i       d{    6jY                  |jF                  |jZ                  #       d{   }6j                   }|d}i }nm|}|jf                  }|rE|jh                  r9|jh                  }|jk                          d{   }|r|}|}|r|jh                  r9|jl                  }|jn                  } j                  d*6|	||+       d{    nd}i }	 6jq                  d,d-d./       d{     js                  6d0d.1       d{   }|s3|jt                  s' jw                  6       d{   }t]        d2|        j                  j|                  s|j~                  s|j                  r}6j                  d6       d{    t        j                  d7       d{     js                  6d8d91       d{   }|s) j`                  r j`                  j                  d:d;<        j                  j|                  s|j                  r	  j                  6       d{   }|d=   }|d>   }  j                  j                  }!t        |!| z  |z  d?z        }"6j                  |!|"d@       d{    t        |!| z  |"|z        }#6j                  j                  6       d{   }$|$j                  dA| |dBd|#dC       d{    |j                  r/ j                  6|j                  |j                         d{    |j                  r$ j                  6|j                         d{    |j                  r j                  6|j                         d{   }|dG   s/ j`                  j                  dHdIdF|jM                  dF      i(        j                  dJ6|	|       d{     j                  dK6|	||L       d{    |j                  s|j                  r6j                  j                  dMdM       d{    6j                  j                          d{    6j                  j                          d{    6j                  j                  dN       d{    |j                  s|j                  r	 |j                  rK	 |j                  |j                  n|jZ                  }% j                  6|j                  |%1       d{     j                  j|                  sLt        dP      }&	 	 6j                  d6dQ1       d{     j&                  j                  6|&       d{    |j                  r j                  6       d{   6 j                  dT6|	|       d{    |j                  r't        j                  |j                         d{    |j                  r j                  6       d{    |j                  r	 |j                  j                  dU      D 'cg c]  }'|'j                          }(}'g })|(D ]<  }*	  j&                  j                  6dV|* dW       d{   }+|)j                  |+       > dZd[j                  |)      z   d\z   },n6j                          d{   }, j                  d^6|,|	|_       d{    t        j                         }-d}.d}/d}0|j                  r j                  6       d{   }.|j                  r j                  6       d{   }0|j                  rX|j                  r't        j                  |j                         d{     j                  6|j                  `       d{   }/|/s|.s|0r5 j`                  jc                  dadbdct        j                         |-z
  i(       didj6 fdd}1|j$                  rJt         j&                  de      r4 j&                  j                  6       d{   }2|j                  |2       6j                   }t        |,||||/|.|0|1| j                  r j                  nd||j                   r5nd|j$                  r|ndf      6j                  j                  j                  }3t        dg |3D              }4|j                  rS |4dBk  r- j                  j                  s j                  j                  rS |j                   r66j                  d

       6j                  d       6j                  d       |j$                  rpt         j&                  de      r4 j&                  j                  6       d{   }2|j                  |2        j&                  j                  6||       d{    6j                          d{    S 7 7 v7 \7 7 7 ?# 1 sw Y   
xY w7 
k7 	7 	# t\        $ rk}d$t_        |      v r< j                  j2                  r& j`                  jc                  d% d&d'i(       d}nte        d)t_        |             Y d}~
d}~ww xY w7 	7 	7 	v7 	]7 	9# t\        $ rn  jw                  6       d{  7  } j                  jx                  r  j`                  j{                  d3d4d5|i(       |jt                  st]        d2|       Y 	w xY w7 	a7 	G7 	.7 7 }7 K7 .# t        $ r4} j`                  j                  dDdEdFt_        |      i(       Y d}~dd}~ww xY w7 67 	7 7 7 s7 :7 7 7 7 o# t        $ r}te        dOt_        |             d}~ww xY w7 \# t        $ r Y fw xY w7 K# t        $ r4} j`                  j                  dRdSdFt_        |      i(       Y d}~d}~ww xY w7 i7 O7 7 c c}'w 7 # t\        $ r%}t        dX|* dYt_        |              Y d}~d}~ww xY w# t\        $ r}te        d]t_        |             d}~ww xY w7 7 7 T7 27 7 7 N7 7 7 # t        $ r}|d}~ww xY w# 6j                  j                  j                  }3t        dg |3D              }4|j                  rw |4dBk  r- j                  j                  s j                  j                  rw |j                   r66j                  d

       6j                  d       6j                  d       |j$                  rrt         j&                  de      r5 j&                  j                  6       d{  7  }2|j                  |2        j&                  j                  6||       d{  7   6j                          d{  7   w xY ww)ka  
        Internal method to crawl web URLs with the specified configuration.
        Includes optional network and console capturing.

        Args:
            url (str): The web URL to crawl
            config (CrawlerRunConfig): Configuration object controlling the crawl behavior

        Returns:
            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data
        NrandomcrawlerRunConfignavigator_overriderr=   )rb   r   c                  K   	 d }	 | j                   }|r	 |j                  dd      }j                  d| j                  | j                  t        | j                        || j                  | j                         t        j                         d       y # t        $ r dt        |       d}Y w xY w# t        $ r d}Y w xY w# t        $ r~}j                  r-j                  j                  d	| j                   d
| d       j                  d| j                  t        |      t        j                         d       Y d }~y d }~ww xY ww)Nr   r   errorsz[Binary data: z bytes]z[Error retrieving post data]request)
event_typer.   methodr   	post_dataresource_typeis_navigation_request	timestampz$Error capturing request details for : CAPTURErl   request_capture_errorr   r.   r   r   )post_data_bufferdecodeUnicodeDecodeErrorlenr   appendr.   r   dictr   r   r   timerF   rm   r2   )r   post_data_strr   r   captured_requestsr-   s       r0   handle_request_capturezIAsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.handle_request_capture/  sg    U$(M
G$+$<$<	$Z1:1A1A'R[1A1\ &,,&/&{{")..#'#8%2)0)>)>181N1N1P%)YY[	. 	 %7 Z3A#i.AQQX1YZ$ G(FG ! U{{++.RSZS^S^R__abcad,ekt+u%,,<S\c\g\gruvwrx  HL  HQ  HQ  HS  .T  U  UUso   EC B4 B A+C EB1.B4 0B11B4 4C?C CC 	EA4EEEEc                  K   	 	 | j                          d {   }j                  d| j                  | j                  | j
                  t        | j                        | j                  | j                  j                  t        j                         did	       y 7 # t        $ r}d }Y d }~d }~ww xY w# t        $ r~}j                  r-j                  j                  d| j                   d| d       j                  d| j                  t        |      t        j                         d	       Y d }~y d }~ww xY ww)
Nresponsetext)	r   r.   statusstatus_textr   from_service_workerrequest_timingr   bodyz%Error capturing response details for r   r   r   response_capture_errorr   )r  r   r   r.   r  r  r   r   r  r   timingr   rF   rm   r2   )r   	text_bodyr   r  r   r-   s       r0   handle_response_capturezJAsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.handle_response_captureM  s<    W$ +3--/$9	
 &,,&0'||"*//'/';';#'(8(8#9/7/K/K*2*:*:*A*A%)YY[ #I".  %:$ $#$& ! W{{++.ST\T`T`Saacdecf,gmv+w%,,<T]e]i]itwxytz  JN  JS  JS  JU  .V  W  WWsc   EB BB A>B7 EB 	B4(B/*B7 /B44B7 7	D> A4D94E9D>>Ec           	       K   	 j                  d| j                  | j                  | j                  | j                  rt        | j                        ndt        j                         d       y # t        $ r~}j                  r-j                  j                  d| j                   d| d       j                  d| j                  t        |      t        j                         d	       Y d }~y d }~ww xY ww)
Nrequest_failedzUnknown failure)r   r.   r   r   failure_textr   z+Error capturing request failed details for r   r   r   request_failed_capture_errorr   )
r   r.   r   r   failurer2   r   r   rF   rm   )r   r   r   r-   s     r0   handle_request_failed_capturezPAsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.handle_request_failed_capturek  s     \%,,&6&{{")..)0)>)>@GGOO(<Uf%)YY[.  " \{{++.YZaZeZeYffhijhk,lr{+|%,,<Zcjcncny|}~y  OS  OX  OX  OZ  .[  \  \\s0   C9A)A/ .C9/	C68A4C1,C91C66C9r   r   requestfaileddownloadc                L    t        j                  j                  |             S r+   )rx   create_task_handle_download)r  r-   s    r0   <lambda>z;AsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.<lambda>  s     W%8%8--h7& r8   rA   )rb   r.   r   r   r   r   r   r   r   r   r   r   r   
wait_untilr   use_csp_nonceF    zContent-Security-Policyz-default-src 'self'; script-src 'self' 'nonce-z' 'strict-dynamic')r  r   znet::ERR_ABORTEDz1Navigation aborted, likely due to file download: GOTOr.   r   zFailed on navigating ACS-GOTO:
rB   )rb   r.   r   r   r  attachedr   )stater   a  () => {
                        const element = document.body;
                        if (!element) return false;
                        const style = window.getComputedStyle(element);
                        const isVisible = style.display !== 'none' && 
                                        style.visibility !== 'hidden' && 
                                        style.opacity !== '0';
                        return isVisible;
                    }r   zBody element is hidden: zBody visibility info: {info}DEBUGinfodomcontentloaded皙?zQ() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)  z)Some images failed to load within timeoutr   rj   heightwidthgffffff?r$  r#  z"Emulation.setDeviceMetricsOverrider   )r$  r#  deviceScaleFactormobilescalez-Failed to adjust viewport to content: {error}VIEWPORTr   successz)User script execution had issues: {error}JS_EXECr?   r@   )rb   r   r   d   	ArrowDownzWait condition failed: update_image_dimensions   z(Error updating image dimensions: {error}r   rD   ,z&Array.from(document.querySelectorAll("zn"))
                                    .map(el => el.outerHTML)
                                    .join('')z-Warning: Could not get content for selector 'z': z<div class='crawl4ai-result'>

z
</div>z Failed to extract HTML content: rC   )r   r   rb   r   screenshot_height_thresholdz;Exporting media (PDF/MHTML/screenshot) took {duration:.2f}sEXPORTdurationc                   K   j                   j                  dd| d       t        j                  |        d {    j	                          d {   S 7 7 w)Nz?Waiting for {delay} seconds before retrieving content for {url}INFO)delayr.   r   )rF   r  rx   sleepcontent)r8  r   r-   r.   s    r0   r   zFAsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.get_delayed_content  sX       ]%*37 ! 
 mmE***!\\^++ ++s!   :AAAAAAretrieve_console_messages)r   r   js_execution_resultr   r   r   r   r   ssl_certificatedownloaded_filesredirected_urlnetwork_requestsconsole_messagesc              3  F   K   | ]  }t        |j                          y wr+   )r   pages).0rb   s     r0   	<genexpr>z<AsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.<genexpr>2  s     MWc'--0Ms   !r,   )      @)r8  floatr3   r2   )r.   rJ   r|   rE   r   user_agent_moder   generateuser_agent_generator_configrM   r   override_navigatorr   add_init_scriptr   rc   r   onr   rI   setup_console_capturesetup_error_capturefetch_ssl_certificater   from_urlaccept_downloadsjs_onlyr   r   r   r   r   r   r   set_contentr  base_urlexperimentalrw   hashlibsha256urandom	hexdigestset_extra_http_headersgotopage_timeoutr   r2   rF   r  r   r   redirected_fromr   r  r   r   r   ignore_body_visibilitycheck_visibilityverbosedebug	text_modewait_for_imagesadjust_viewport_to_contentr   rx   r9  rm   get_page_dimensionsviewport_widthintset_viewport_sizeminrb   new_cdp_sessionsendr   r   _handle_full_page_scanscroll_delaymax_scroll_stepsvirtual_scroll_config_handle_virtual_scrollr   robust_execute_user_scriptmousemovedownupkeyboardpressr   r   wait_for_timeoutr   PlaywrightTimeoutErrorr   r   r   delay_before_return_htmlr   splitr   r   printjoinr:  r   perf_counterr   
export_pdfr   r   screenshot_wait_fortake_screenshotr3  hasattrr;  extendr   rd   contextssumro   use_managed_browserheadlessremove_listenercleanup_console_capturerY   )7r-   r.   r   r   execution_resultr   r?  captured_consoleuser_agent_to_overriderb   r   r
  r  handle_consolehandle_errorssl_certis_local_contentr   r   html_contentr   noncer   
first_respreqprev_req	prev_resp
is_visiblevisibility_infoimages_loaded
dimensionspage_height
page_widthtarget_widthtarget_heightr(  cdpr   update_image_dimensions_jss	selectors
html_partsselectorr:  r   start_export_timer   r   r   r   final_messagesall_contextstotal_pagesr   r   s7   ``                                                   @@r0   r   z)AsyncPlaywrightCrawlerStrategy._crawl_web  s     
 "$  "(!2!2!-CD*\\V33x?-H-=-?-H-H .55;.D*
 #22;;V;TTg $$(<(<)).9N*OPPP  94Y_``` **U<W<\ GGI56GGJ 78GGO%BC **#'<<#E#EdL\#]]N!%!A!A$HX!YYL}	#H++)2237 ""33 >>''tWRU^d'eee $'>>)#<#rx@X#r\_\j\jkq\r #~~i0*-ab'!ww~~o>"36L_L]4^"__!/3I 4Q+,668L4 4 36..2Js12wPSTUTVPW**<FDUDU*VVV#H%+__%;N"%K')$\!..22?EJ$+NN2::b>$B$L$L$NE #'"="=$=Anotnu  vH  @I!"#   *.F,=,=vGZGZ *3 * $ *.(  '&)+-(%-
&..!c&9&9'*':':H.6.?.?.A(AI(-6
"*C "c&9&9 '1&7&7+5+=+=('' $S8\b (   
 "#% !N,,V:u,UUU $(#:#: " $; $ 
 "&*G*G,0,A,A$,G&GO":?:K LMMt &&00&&&*K*K../ABBBmmC((( '+&=&=g  '> ' ! %KK'' K$ (  &&00V5V5V"'+'?'?'E!EJ",X"6K!+G!4J $(#6#6#E#EL$'z(AK(ORV(V$WM00".-H    z 9=;;VWE $ < <T BBC((<%/&112&+%*	 	 	$ $$11$8K8KVMdMdeee ++11$8T8TUUU ~~)-)H)H&..* $  (	2KK'' K% ')9)=)=g)FG (  ''(>g^d'eee''(<dG\bk{'||| ##v||jjooc3///jjoo'''jjmmo%%mm))+666 ??v':':u K9?9P9P9\f55bhbubuG//foow *    &&00-;<U-V*"667IST6UUU ,,//6PQQQ %%!11$77 ##$:D'Z`#aaa..mmF$C$CDDD --224888""T4:4G4G4M4Mc4R Sq SI S!#J$- 	ii,0LL,A,A$$J8* U. !1- 'G
 '--g6	i =		*@UUXbbD "\\^+ ##$4dGTZ $   
 !% 1 1 3H"OJzz!%!66###'#5#5d#;;
  --!--(B(BCCC(,(<(<f6X6X )= ) # (j  Y &(9(9(;>O(OP ! , , ..74<<Id3e'+||'M'Md'S!S ''7 "XXN &!1$4'*!%$7 (.2.D.DD**$-6<6U6U!2[_5;5T5T!1Z^!0  <<//88LMMMK  !t':':'N'NRVReReRnRn 22((4JK((5LM((:WX22t||-HI/3||/U/UVZ/[)[(//? ,,>>t^Uabbb jjl""U U Q 	ap ^Y. f4 4 W$ ! \ .Q7D<O<O<`<` KK,,*[\_[`(a$*(-s| - 
 (,H".1QRUVWRXQY/Z"[[ %\6 )B V 'H  N(,(=(=d(C"C"C&&..KK%% ># &8 &  44":?:K LMM 5Nv C(! "F
 C	 ! KK'' O& 'Q0 (   f V$ f| 0'%6 ! K&)@Q'IJJK V1 Q  KK%% J# 'Q0 &   8 bD 9
 !T
'  % i!$QRZQ[[^_bcd_e^f"ghhi
  T&)I#a&'RSST , 7 < D#. "Tb *\ c #7  	G	
  <<//88LMMMK  !t':':'N'NRVReReRnRn 22((4JK((5LM((:WX22t||-HI/3||/U/UVZ/[)[)[(//? ,,>>t^Uabbb jjl""s  B,AE52u.3AAE5;u1<AE5u4BAE5 u7!#AE5u:AE5
A4A@ >u=?BA@ v AA@ "v#A@  A,v ,v-/v vv -AA@ /x0A@ 	2A@ ;x<	A@ x xx 9x:&x  x!x 3AA@ 5z6A@ zA@ .z/AA@ z+ zAz+ %z"&4z+ z%z+ :z(;z+ ?6A@ 5{+6/A@ %{.&/A@ {1AA@ &{4'A@ {7;A@  {:!A@ "{=#!A@ | "A@ '|()A@ A|	 ||	 !A@ ?|4 |1|4 } 9}:} > A@ ~A@ ;~
<2A@ .~/$A@ ~A@ % ~	 ##~~~ 5A@ 	4
A@ '7(>A@ &:'$A@ =>A@ 
A@ $A@ /A@0BA@ 8A@9A,A@ %C3AE5A@	6AE5A@AE5(A@)AE51AE54AE57AE5:AE5=A@  v
	A@ v v 	x
A x?A@ x

A@ A@ x x x z9x<:AzA@ zA@ A@ A@ z+ "z+ %z+ (z+ +	{(4){#A@ #{((A@ .A@ 1A@ 4A@ 7A@ :A@ =A@  A@ A@ |	 		|.|))|..A@ 1|4 4	}=}  }} 	~)}?9A@ ?~A@ 
A@ A@ A@  ~		$> 		 	1,,11A@ 7A@ :A@ =A@ @ A@ @A@ @A@ @	AE5@AE5@AE5@	A@"@A@@A@"@"A@% @%C4AE2DADD7AE2EAEEAE2E+AE.E,AE2E2AE5c           	     n  K   	 |j                   }|P|j                  | j                  j                  | j                  j                  d       d{    |j                   }|j                  d| j                  j                        }|}| j                  |d||       d{    | j                  |       d{   }|d   }d}	||k  rd||	|k\  rn\t        ||z   |      }| j                  |d||       d{    |	dz  }	| j                  |       d{   }|d   }
|
|kD  r|
}||k  rd| j                  |dd       d{    | j                  |d|       d{    y7 7 7 7 r7 V7 +7 # t        $ r3}| j                  j                  ddd	t        |      i
       Y d}~yd}~ww xY ww)au  
        Helper method to handle full page scanning.

        How it works:
        1. Get the viewport height.
        2. Scroll to the bottom of the page.
        3. Get the total height of the page.
        4. Scroll back to the top of the page.
        5. Scroll to the bottom of the page again.
        6. Continue scrolling until the bottom of the page is reached.

        Args:
            page (Page): The Playwright page object
            scroll_delay (float): The delay between page scrolls
            max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform. If None, scrolls until end.

        Nr%  r#  r   )r8  r   z)Failed to perform full page scan: {error}	PAGE_SCANr   r   )viewport_sizeri  rE   rg  viewport_heightrw   safe_scrollrf  rj  r   rF   rm   r2   )r-   r   rn  ro  r  r  current_positionr  total_heightscroll_step_count
new_heightr   s               r0   rm  z5AsyncPlaywrightCrawlerStrategy._handle_full_page_scanJ  s    $:	: ..M$,,"11@@DL_L_LoLop   !% 2 2+//$--==O  / ""4,<L"QQQ
  $77==J%h/L !"\1 $/4EIY4Y#&'7/'I<#X &&tQ0@&UUU "Q&! $(#;#;D#AA
'1
,#-L- #\12 ""4A... ""4L999o R
 > V B / :  	KKCQ(    	s   F5AE6 E'AE6 "E*#E6 ;E,<?E6 ;E.<E6 E0E6 0E6 E2E6 F5!E4"F5'E6 *E6 ,E6 .E6 0E6 2E6 4F56	F2?)F-(F5-F22F5c           	     ~  K   	 ddl m} t        |t              r|j	                  |      }| j
                  j                  ddd|j                  i       d}| j                  j                  |||j                                d{   }|j                  d	d
      rB| j
                  j                  dd|j                  dd      |j                  dd      d       y| j
                  j                  dd       y7 v# t        $ r3}| j
                  j                  dddt        |      i       Y d}~yd}~ww xY ww)a  
        Handle virtual scroll containers (e.g., Twitter-like feeds) by capturing
        content at different scroll positions and merging unique elements.
        
        Following the design:
        1. Get container HTML
        2. Scroll by container height
        3. Wait and check if container HTML changed
        4. Three cases:
           - No change: continue scrolling
           - New items added (appended): continue (items already in page)
           - Items replaced: capture HTML chunk and add to list
        5. After N scrolls, merge chunks if any were captured
        
        Args:
            page: The Playwright page object
            config: Virtual scroll configuration
        r   )VirtualScrollConfigz9Starting virtual scroll capture for container: {selector}VSCROLLr  r   a  
            async (config) => {
                const container = document.querySelector(config.container_selector);
                if (!container) {
                    throw new Error(`Container not found: ${config.container_selector}`);
                }
                
                // List to store HTML chunks when content is replaced
                const htmlChunks = [];
                let previousHTML = container.innerHTML;
                let scrollCount = 0;
                
                // Determine scroll amount
                let scrollAmount;
                if (typeof config.scroll_by === 'number') {
                    scrollAmount = config.scroll_by;
                } else if (config.scroll_by === 'page_height') {
                    scrollAmount = window.innerHeight;
                } else { // container_height
                    scrollAmount = container.offsetHeight;
                }
                
                // Perform scrolling
                while (scrollCount < config.scroll_count) {
                    // Scroll the container
                    container.scrollTop += scrollAmount;
                    
                    // Wait for content to potentially load
                    await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000));
                    
                    // Get current HTML
                    const currentHTML = container.innerHTML;
                    
                    // Determine what changed
                    if (currentHTML === previousHTML) {
                        // Case 0: No change - continue scrolling
                        console.log(`Scroll ${scrollCount + 1}: No change in content`);
                    } else if (currentHTML.startsWith(previousHTML)) {
                        // Case 1: New items appended - content already in page
                        console.log(`Scroll ${scrollCount + 1}: New items appended`);
                    } else {
                        // Case 2: Items replaced - capture the previous HTML
                        console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`);
                        htmlChunks.push(previousHTML);
                    }
                    
                    // Update previous HTML for next iteration
                    previousHTML = currentHTML;
                    scrollCount++;
                    
                    // Check if we've reached the end
                    if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {
                        console.log(`Reached end of scrollable content at scroll ${scrollCount}`);
                        // Capture final chunk if content was replaced
                        if (htmlChunks.length > 0) {
                            htmlChunks.push(currentHTML);
                        }
                        break;
                    }
                }
                
                // If we have chunks (case 2 occurred), merge them
                if (htmlChunks.length > 0) {
                    console.log(`Merging ${htmlChunks.length} HTML chunks`);
                    
                    // Parse all chunks to extract unique elements
                    const tempDiv = document.createElement('div');
                    const seenTexts = new Set();
                    const uniqueElements = [];
                    
                    // Process each chunk
                    for (const chunk of htmlChunks) {
                        tempDiv.innerHTML = chunk;
                        const elements = tempDiv.children;
                        
                        for (let i = 0; i < elements.length; i++) {
                            const element = elements[i];
                            // Normalize text for deduplication
                            const normalizedText = element.innerText
                                .toLowerCase()
                                .replace(/[\s\W]/g, ''); // Remove spaces and symbols
                            
                            if (!seenTexts.has(normalizedText)) {
                                seenTexts.add(normalizedText);
                                uniqueElements.push(element.outerHTML);
                            }
                        }
                    }
                    
                    // Replace container content with merged unique elements
                    container.innerHTML = uniqueElements.join('\n');
                    console.log(`Merged ${uniqueElements.length} unique elements from ${htmlChunks.length} chunks`);
                    
                    return {
                        success: true,
                        chunksCount: htmlChunks.length,
                        uniqueCount: uniqueElements.length,
                        replaced: true
                    };
                } else {
                    console.log('No content replacement detected, all content remains in page');
                    return {
                        success: true,
                        chunksCount: 0,
                        uniqueCount: 0,
                        replaced: false
                    };
                }
            }
            NreplacedFzNVirtual scroll completed. Merged {unique} unique elements from {chunks} chunksuniqueCountr   chunksCount)uniquechunkszAVirtual scroll completed. Content was appended, no merging neededrj   z&Virtual scroll capture failed: {error}r   )async_configsr  rL   r   	from_dictrF   r  container_selectorrI   r   to_dictrw   r*  r   r   r2   )r-   r   r   r  virtual_scroll_jsr   r   s          r0   rq  z5AsyncPlaywrightCrawlerStrategy._handle_virtual_scroll  s5    &U	: &$',66v>KKS"F$=$=>  m!`  <<007H&..JZ[[Fzz*e,##l!"(**]A">"(**]A"> $    _! !  \"  	KK@Q(   	sN   D=BC> C<AC> D=C> ;D=<C> >	D:)D50D=5D::D=c           	     \  K   	 |j                   }t        j                  j                  | j                  j
                  |      }| j                  j                  dd||d       t        j                         }|j                  |       d{    t        j                         }| j                  j                  |       | j                  j                  dd||||z
  dd	d
       y7 ]# t        $ r3}| j                  j                  dddt!        |      i       Y d}~yd}~ww xY ww)az  
        Handle file downloads.

        How it works:
        1. Get the suggested filename.
        2. Get the download path.
        3. Log the download.
        4. Start the download.
        5. Save the downloaded file.
        6. Log the completion.

        Args:
            download (Download): The Playwright download object

        Returns:
            None
        z Downloading {filename} to {path}FETCH)filenamer   r   Nz"Downloaded {filename} successfullyCOMPLETEz.2fr  )r  r   r5  z"Failed to handle download: {error}r   r   )suggested_filenamer   r   r~  rE   downloads_pathrF   r  r   r  save_asrJ   r   r*  r   r   r2   )r-   r  r  download_path
start_timeend_timer   s          r0   r  z/AsyncPlaywrightCrawlerStrategy._handle_downloadC  s!    $	!)!<!<GGLL)<)<)K)KM_`MKK:$6N   **,J""=111((*H""))-8KK< 2)#+j#8"=Q ?   	 2  	KK<Q(   	sB   D,B	C- C+AC- *D,+C- -	D)6)D$D,$D))D,c           	     ,  K   t        d      }	 | j                  j                  |d| d       d{    |j                  d       d{    y7 7 # t        $ r3}| j
                  j                  dddt        |      i	       Y d}~yd}~ww xY ww)
z
        Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.

        Args:
            page (Page): The Playwright page instance
        r   zi
                (async () => {
                    try {
                        const removeOverlays = a  ;
                        await removeOverlays();
                        return { success: true };
                    } catch (error) {
                        return {
                            success: false,
                            error: error.toString(),
                            stack: error.stack
                        };
                    }
                })()
            Ni  z*Failed to remove overlay elements: {error}r   r   r   )r   rI   r   ry  r   rF   rm   r2   )r-   r   remove_overlays_jsr   s       r0   r   z6AsyncPlaywrightCrawlerStrategy.remove_overlay_elementst  s      ,,EF	,,''0 1C/C D  " '',,,#" - 	KKDQ(    	sP   B#A AA AA BA A 	B)BBBBc                F   K   |j                  d       d{   }|S 7 w)z
        Exports the current page as a PDF.

        Args:
            page (Page): The Playwright page object

        Returns:
            bytes: The PDF data
        T)print_backgroundN)r   )r-   r   r   s      r0   r  z)AsyncPlaywrightCrawlerStrategy.export_pdf  s%      488 9s   !!c           	       K   	 	 |j                  dd       d{    |j                  dd       d{    |j                  d       d{    |j                  d       d{    |j                  j                  |       d{   }|j                  dddi       d{   }|j                  d      }|j                          d{    |S 7 7 7 7 t# t        $ r?}| j                  r)| j                  j                  dd	d
t        |      i       Y d}~d}~ww xY w7 7 7 ]# t        $ r?}| j                  r)| j                  j                  dd	d
t        |      i       Y d}~yd}~ww xY ww)a  
        Captures the current page as MHTML using CDP.
        
        MHTML (MIME HTML) is a web page archive format that combines the HTML content 
        with its resources (images, CSS, etc.) into a single MIME-encoded file.
        
        Args:
            page (Page): The Playwright page object
            
        Returns:
            Optional[str]: The MHTML content as a string, or None if there was an error
        r     r   Nnetworkidler"  a  
                    () => new Promise(resolve => {
                        // First requestAnimationFrame gets scheduled after the next repaint
                        requestAnimationFrame(() => {
                            // Second requestAnimationFrame gets called after all animations complete
                            requestAnimationFrame(resolve);
                        });
                    })
                z&Wait for load state timed out: {error}MHTMLr   r   zPage.captureSnapshotformatmhtmldataz Failed to capture MHTML: {error})r   ry  r   r   rF   rm   r2   rb   rk  rl  rw   detachr   r   )r-   r   r   cdp_sessionr   mhtml_contents         r0   r   z,AsyncPlaywrightCrawlerStrategy.capture_mhtml  s    1	../A4.PPP..}d.KKK ++D111 mm %   $ !% < <T BBK '++,BXwDWXXF #JJv.M $$&&&  I QK 2  ;;KK'' H# 'Q0 (  C Y '  	{{!!>#SV, " 
 	s   E;C CC CC CC (C)C -D0 D*D0 'D,((D0 D.D0 E;C C C C 	D'(5D"D0 "D''D0 ,D0 .D0 0	E895E3.E;3E88E;c                |    K   g  fd}|j                  d|       |j                  |       d{    S 7 w)z
        Captures console messages from the page.
        Args:

            page (Page): The Playwright page object
        Returns:
            List[Dict[str, Union[str, float]]]: A list of captured console messages
        c                   	 | j                   }| j                  }||t        j                         d}j                  |       y # t        $ r;}j
                  r%j
                  j                  d| d       Y d }~y Y d }~y d }~ww xY w)N)typer  r   z!Error capturing console message: r   r   )r  r  r   r   r   rF   rm   )msgmessage_typemessage_textentryr   r  r-   s        r0   handle_console_messagezXAsyncPlaywrightCrawlerStrategy._capture_console_messages.<locals>.handle_console_message  s    "xx"xx )(!%
 !''. ;;KK'';A3?Y (   s   AA 	B	,BB	consoleN)rM  r\  )r-   r   	file_pathr  r  s   `   @r0   _capture_console_messagesz8AsyncPlaywrightCrawlerStrategy._capture_console_messages  sB      	" 		12ii	""" 	#s   /<:<c                N  K   d}d}d}d}	 |xs
 t               }| j                  j                  |       d{   \  }}|j                  |d       d{    |j                  r| j                  |       d{   }|j                  r| j                  |       d{   }|j                  r[|j                  r't        j                  |j                         d{    t        |dd      }| j                  ||       d{   }|||f|r	 |j                          d{    S S 7 7 7 7 7 U7 /7 # t        $ r Y S w xY w# t        $ r}	dt        |	       }
| j                   j#                  dd	d
|
i       |r|j                  rt%        j&                  ddd      }t)        j*                  |      }t-        j.                         }|j1                  d|
d|       t3               }|j5                  |d       t7        j8                  |j;                               j=                  d      }|||fcY d}	~	|r*	 |j                          d{  7   S # t        $ r Y S w xY wS d}	~	ww xY w# |r*	 |j                          d{  7   w # t        $ r Y w w xY ww xY ww)a  
        Generate media (screenshot, PDF, MHTML) from raw HTML content.

        This method is used for raw: and file:// URLs where we have HTML content
        but need to render it in a browser to generate media outputs.

        Args:
            html (str): The raw HTML content to render
            config (CrawlerRunConfig, optional): Configuration for media options

        Returns:
            tuple: (screenshot_data, pdf_data, mhtml_data) - any can be None
        Nr   r   r  r3  r2  z$Failed to generate media from HTML: z%HTML media generation failed: {error}r   r   r   RGBi   iX  blackcolor
   r     r  r  fillfontJPEGr  r   )r   rM   r   rT  r   r  r   r   r  rx   r9  getattrr  rY   r   r2   rF   r   r   newr   Drawr   load_defaultr  r   savebase64	b64encodegetvaluer   )r-   r   r   r   r   r   r   rb   r3  r   error_messageimgdrawr  buffereds                  r0   _generate_media_from_htmlz8AsyncPlaywrightCrawlerStrategy._generate_media_from_html  s      
0	1/1F"&"6"6"?"?QW"?"XXMD' ""44F"GGG zz!%!66###'#5#5d#;;
  --!--(B(BCCC.5f>[]a.b+(,(<(<6Q )= ) # #Hj8* **,&& S Y H 7 < D#6 '  -  	9B3q6(KMKK?/   &++iizA ~~c* --/		(Md	S"9&1"("2"283D3D3F"G"N"Nw"W"Hj88 **,&&   '	9& **,&&   sG  	J%-E D.E D0$E <D2=$E !D4">E  D6!'E D8	E J%D<'D:(D<,J%.E 0E 2E 4E 6E 8E :D<<	EJ%EJ%
I1C%I,:I1;I4 ?J%IIIJ%	I(%J%'I((J%,I11I4 4J"8JJJJ"	JJ"JJ""J%c                   K   | j                  |       d{   }|s| j                  |       d{   S  | j                  |fi | d{   S 7 :7 !7 w)z
        Take a screenshot of the current page.

        Args:
            page (Page): The Playwright page object
            kwargs: Additional keyword arguments

        Returns:
            str: The base64-encoded screenshot data
        N)page_need_scrolltake_screenshot_naivetake_screenshot_scroller)r-   r   r/   need_scrolls       r0   r  z.AsyncPlaywrightCrawlerStrategy.take_screenshotO  sa      !11$7733D999 766tFvFFF 8 : Gs1   AAAAAAAAAc                  K   	 ddl m}  ||      }|d   j                  d      }t               }|j	                  |d       t        j                  |j                               j                  d      S # t        $ r}dt        |       }| j                  j                  dd	d
|i       t        j                  ddd      }t        j                   |      }	t#        j$                         }
|	j'                  d|d|
       t               }|j	                  |d       t        j                  |j                               j                  d      cY d}~S d}~ww xY ww)z
        Convert the first page of the PDF to a screenshot.

        Requires pdf2image and poppler.

        Args:
            pdf_data (bytes): The PDF data

        Returns:
            str: The base64-encoded screenshot data
        r   )convert_from_bytesr  r  r  r   z%Failed to take PDF-based screenshot: zPDF Screenshot failed: {error}r   r   r   r  r  r  r  r  r  N)	pdf2imager  convertr   r  r  r  r  r   r   r2   rF   r   r   r  r   r  r   r  r  )r-   r   r  images	final_imgr  r   r  r  r  r  s              r0   take_screenshot_from_pdfz7AsyncPlaywrightCrawlerStrategy.take_screenshot_from_pdfd  s/    	I4'1Fq	))%0IyHNN8FN3##H$5$5$78??HH 	ICCF8LMKK8/   ))E:W=C>>#&D))+DIIhO$IOyHHHXfH-##H$5$5$78??HH	Is6   EA0A5 4E5	E>CEEEEEc                  K   	 | j                  |       d{   }|d   }|d   }t        ||j                  dt                    }|j	                  ||d       d{    g }|j
                  }|d   }	||	z  dz   }
t        |
      D ]  }||	z  }||
dz
  k(  r(||	z  }|dk(  r n|j	                  ||d       d{    |j                  d| d	       d{    t        j                  d
       d{    |j                  ddd       d{   }t        j                  t        |            j                  d      }|j                  |        |j	                  ||	d       d{    t!        d |D              }t        j"                  d|d   j$                  |f      }d}|D ]4  }|j'                  |j                  d      d|f       ||j(                  z  }6 t               }|j                  d      }|j+                  |dd       t-        j.                  |j1                               j3                  d      }|S 7 07 7 7 x7 ^7 E7 # t4        $ r}dt7        |       }| j8                  j;                  ddd|i       t        j"                  ddd      }t=        j>                  |      }tA        jB                         }|jE                  d|d|       t               }|j+                  |d        t-        j.                  |j1                               j3                  d      cY d}~S d}~ww xY ww)!am  
        Attempt to set a large viewport and take a full-page screenshot.
        If still too large, segment the page as before.

        Requires pdf2image and poppler.

        Args:
            page (Page): The Playwright page object
            kwargs: Additional keyword arguments

        Returns:
            str: The base64-encoded screenshot data
        Nr$  r#  r3  r%  r   r   zwindow.scrollTo(0, )g{Gz?FjpegU   )	full_pager  qualityr  c              3  4   K   | ]  }|j                     y wr+   )r#  )rD  r  s     r0   rE  zJAsyncPlaywrightCrawlerStrategy.take_screenshot_scroller.<locals>.<genexpr>  s     >cszz>s   BMP)r  r  r   z*Failed to take large viewport screenshot: z)Large viewport screenshot failed: {error}r   r   r   r  r  r  r  r  r  r  r  )#rf  rj  rw   r   ri  r  ranger   rx   r9  r   r   r   r   r  r   r  r  r$  paster#  r  r  r  r  r   r   r2   rF   r   r   r  r   r  r  )r-   r   r/   r  r  r  large_viewport_heightsegmentsr  r  num_segmentsr   y_offsetlast_part_heightseg_shotr  r  stitchedoffsetr  encodedr   r  r  r  s                            r0   r  z7AsyncPlaywrightCrawlerStrategy.take_screenshot_scroller  s?    O	I#77==J#G,J$X.K
 %(

8:TU%! (($0EF  
 H ..M+H5O'?:a?L<( %.q(('2_'D$ (1, 00:Qa1bcccmm&9(1$EFFFmmD))) "&5vWY!ZZjj!23;;EB$/%4 ((:)YZZZ>X>>Lyy!):):L(IJHF %s{{51Av;?#**$%
 yH''.HMM(5"M=&&x'8'8':;BB7KGN} >. dF) [ [   	IHQQMKKC/   ))E:W=C>>#&D))+DIIhO$IOyHHHXfH-##H$5$5$78??HH	Is   MI IAI IAI <I=I II 6I7I IAI .I/CI MI I I I I I I 	M%CL=7M8M=MMc                ^  K   	 |j                  d       d{   }t        j                  |      j                  d      S 7 (# t        $ r}dt        |       }| j                  j                  ddd|i	       t        j                  d
dd      }t        j                  |      }t        j                         }|j                  d|d|       t               }|j!                  |d       t        j                  |j#                               j                  d      cY d}~S d}~ww xY ww)z
        Takes a screenshot of the current page.

        Args:
            page (Page): The Playwright page instance

        Returns:
            str: Base64-encoded screenshot image
        F)r  Nr   zFailed to take screenshot: zScreenshot failed: {error}r   r   r   r  r  r  r  r  r  r  r  r  )r   r  r  r   r   r2   rF   r   r   r  r   r  r   r  r  r   r  r  )	r-   r   r   r   r  r  r  r  r  s	            r0   r  z4AsyncPlaywrightCrawlerStrategy.take_screenshot_naive  s    	I#??J##J/66w?? @ 	I9#a&BMKK4/   ))E:W=C>>#&D))+DIIhO$IOyHHHXfH-##H$5$5$78??HH!	IsE   D-A A'A D-A 	D*CD%D* D-%D**D-c                   K   | j                   rF| j                   j                  |       d{   }| j                  j                  ddd|i       |S | j                  j	                  dd	       y7 Dw)
a  
        Exports the current storage state (cookies, localStorage, sessionStorage)
        to a JSON file at the specified path.

        Args:
            path (str): The path to save the storage state JSON file

        Returns:
            dict: The exported storage state
        )r   Nz Exported storage state to {path}r7  r   r   z5No default_context available to export storage state.ri   rj   )re   storage_staterF   r  rm   )r-   r   r  s      r0   export_storage_statez3AsyncPlaywrightCrawlerStrategy.export_storage_state  s{      ..<<$<GGEKK:~  
 LKKO    Hs   ,A5A3AA5c                  K   	 |j                  d       d{    t        |t              r|g}n|}g }|D ]u  }	 d}	 | j                  j	                  |d| d       d{   }t        j                         }		 |j                  dd       d{    |j                  |r|nddi       w d|dS 7 7 T# t
        $ r,}dt        |      v r| j                  j                  dd       	 |j                  d	d
       d{  7   n?# t
        $ r3}| j                  j                  dddt        |      i       Y d}~nd}~ww xY w	 |j                  dd
       d{  7   n?# t
        $ r3}| j                  j                  dddt        |      i       Y d}~nd}~ww xY wddd}n7| j                  j                  dddt        |      i       dt        |      d}Y d}~d}~ww xY w7 `# t
        $ r4}| j                  j                  dddt        |      i       Y d}~d}~ww xY w# t        $ rQ}| j                  j                  dddt        |      i       |j                  dt        |      d       Y d}~Rd}~ww xY w# t        $ rA}| j                  j                  dddt        |      i       dt        |      dcY d}~S d}~ww xY ww)a  
        Executes user-provided JavaScript code with proper error handling and context,
        supporting both synchronous and async user code, plus navigations.

        How it works:
        1. Wait for load state 'domcontentloaded'
        2. If js_code is a string, execute it directly
        3. If js_code is a list, execute each element in sequence
        4. Wait for load state 'networkidle'
        5. Return results

        Args:
            page (Page): The Playwright page instance
            js_code (Union[str, List[str]]): The JavaScript code to execute

        Returns:
            Dict[str, Any]: The results of the execution
        r   Nz
                        (async () => {
                            try {
                                return await (async () => {
                                    a  
                                })();
                            } catch (err) {
                                return { success: false, error: err.toString(), stack: err.stack };
                            }
                        })();
                        zExecution context was destroyedz6Navigation triggered by script, waiting for load stater+  r   r   r   r   zNavigation wait failed: {error}r   r   r  z!Network idle wait failed: {error}Tz6Navigation triggered, ignoring context destroyed error)r*  r  #Playwright execution error: {error}Fr*  r   r  z!DOM content load timeout: {error}r*  zScript chunk failed: {error}r*  results Script execution failed: {error})r   rL   r2   rI   r   r   rF   r  rm   r   r   r   r   )
r-   r   r   scriptsr&  scriptr   r   nav_errt1s
             r0   rr  z9AsyncPlaywrightCrawlerStrategy.robust_execute_user_script$  s)    *	7**+=>>>'3'")!G! lHkH "FEI& (,||'<'<T!% &,H -	
( "j B"667ISW6XXX& NNV6)T9JKGlH\  $88m ?B" ! %I<AF KK,, X$- - "&*&>&>vu&>&U U U#( " $ 3 3,M(1,3S\+B !4 !" !""	"&*&>&>$15 '? '" !" !" $) " $ 3 3,O(1,3S\+B !4 !" !"" ,0(`&F !KK--(M$-(/Q'8 . 
 27Q%HFK%IT Y  ++$G )$+SV#4 ,  ( ! HKK%% >% 'Q0 & 
 NNus1v#FGGH  	7KK:Q(  
  %s1v66	7sR  KJ B4!J H4#B8!B6"B8&H4;G4G1G4H4-J 3K4J 6B88
G.*G)-D
DD
	G)
	E)E<G)EG)
E' E#!E'&G)'	F#0)FG)F##A G)#H4)G..H41G44	H1=)H,&H4,H11H44	J=AJ	J 	JJ 	K6KKKKKc           
       K   	 |j                  d       d{    t        |t              r|g}n|}g }|D ]  }	 | j                  j	                  |d| d       d{   }t        j
                         }|j                  dd       d{    t        j
                         }|j                  dd       d{    |j                  |r|ndd	i        d	|dS 7 7 7 U7 (# t        $ rQ}| j                  j                  d
ddt        |      i       |j                  dt        |      d       Y d}~d}~ww xY w# t        $ rA}| j                  j                  dddt        |      i       dt        |      dcY d}~S d}~wt        $ rA}| j                  j                  dddt        |      i       dt        |      dcY d}~S d}~ww xY ww)a;  
        Executes user-provided JavaScript code with proper error handling and context.

        Args:
            page: Playwright page object
            js_code: Single JavaScript string or list of JavaScript code strings

        Returns:
            Dict containing execution status and results/errors
        r   Nz
                        (() => {
                            return new Promise((resolve) => {
                                try {
                                    const result = (function() {
                                        a'  
                                    })();
                                    
                                    // If result is a promise, wait for it
                                    if (result instanceof Promise) {
                                        result.then(() => {
                                            // Wait a bit for any triggered effects
                                            setTimeout(() => resolve({ success: true }), 100);
                                        }).catch(error => {
                                            resolve({
                                                success: false,
                                                error: error.toString(),
                                                stack: error.stack
                                            });
                                        });
                                    } else {
                                        // For non-promise results, still wait a bit for effects
                                        setTimeout(() => resolve({ success: true }), 100);
                                    }
                                } catch (error) {
                                    resolve({
                                        success: false,
                                        error: error.toString(),
                                        stack: error.stack
                                    });
                                }
                            });
                        })()
                    r  r   r  r*  Tr#  r+  r   r   Fr$  r%  r'  )r   rL   r2   rI   r   r   r   r   rF   r   r   )	r-   r   r   r(  r&  r)  r   r+  r   s	            r0   execute_user_scriptz2AsyncPlaywrightCrawlerStrategy.execute_user_script  s    V	7**+=>>> '3'")!G! 9H8H#'<<#8#8)
 *0 1!#$ #FL B223Et2TTT B22=$2OOONNV6)T9JKa9Hv  $88I ?#N U P  HKK%% E% 'Q0 & 
 NNus1v#FGGH  	7KK:Q(  
  %s1v66 	7KK:Q(  
  %s1v66	7s   GE C!E #C'C!.C'C#.C'<C%=C'E GE !C'#C'%C''	E0AD<6E <EE 	G6F	GG	G6GGGGGc                V   K   | j                   j                  |d       d{   S 7 w)z
        Checks if an element is visible on the page.

        Args:
            page: Playwright page object

        Returns:
            Boolean indicating visibility
        a  
            () => {
                const element = document.body;
                if (!element) return false;
                const style = window.getComputedStyle(element);
                const isVisible = style.display !== 'none' && 
                                style.visibility !== 'hidden' && 
                                style.opacity !== '0';
                return isVisible;
            }
        NrI   r   r-   r   s     r0   r`  z/AsyncPlaywrightCrawlerStrategy.check_visibility  s0      \\**4

 
 	
 
    )')c                   K   | j                  |||       d{   }|d   r|j                  |dz         d{    |S 7 '7 w)z
        Safely scroll the page with rendering time.

        Args:
            page: Playwright page object
            x: Horizontal scroll position
            y: Vertical scroll position
        Nr*  r"  )csp_scroll_tory  )r-   r   xyr8  r   s         r0   r  z*AsyncPlaywrightCrawlerStrategy.safe_scroll7  sM      ))$155)''555 65s   AA AAAAc                  K   	 | j                   j                  |d| d| d| d| d| d| d       d{   }|d	   s/| j                  j                  d
dd|j	                  d      i       |S 7 :# t
        $ rA}| j                  j                  dddt        |      i       dt        |      dcY d}~S d}~ww xY ww)a9  
        Performs a CSP-compliant scroll operation and returns the result status.

        Args:
            page: Playwright page object
            x: Horizontal scroll position
            y: Vertical scroll position

        Returns:
            Dict containing scroll status and position information
        z() => {
                    try {
                        const startX = window.scrollX;
                        const startY = window.scrollY;
                        window.scrollTo(z, a  );
                        
                        // Get final position after scroll
                        const endX = window.scrollX;
                        const endY = window.scrollY;
                        
                        return {
                            success: true,
                            startPosition: { x: startX, y: startY },
                            endPosition: { x: endX, y: endY },
                            targetPosition: { x: z, y: z\ },
                            delta: {
                                x: Math.abs(endX - z6),
                                y: Math.abs(endY - a  )
                            }
                        };
                    } catch (e) {
                        return {
                            success: false,
                            error: e.toString()
                        };
                    }
                }Nr*  z Scroll operation failed: {error}SCROLLr   r   z!Failed to execute scroll: {error}Fr$  )rI   r   rF   rm   rw   r   r   r2   )r-   r   r4  r5  r   r   s         r0   r3  z,AsyncPlaywrightCrawlerStrategy.csp_scroll_toE  s     .	7<<00) *+2aS 
13 45#U1# >4453 74453 	7# F< )$##> #VZZ%89 $  MKN  	7KK;Q(  
  %s1v66	7sD   C 2A3 A19A3 0C 1A3 3	B=<6B82B=3C 8B==C c                V   K   | j                   j                  |d       d{   S 7 w)z
        Get the dimensions of the page.

        Args:
            page: Playwright page object

        Returns:
            Dict containing width and height of the page
        z
            () => {
                const {scrollWidth, scrollHeight} = document.documentElement;
                return {width: scrollWidth, height: scrollHeight};
            }
        Nr/  r0  s     r0   rf  z2AsyncPlaywrightCrawlerStrategy.get_page_dimensions  s0      \\**4
 
 	
 
r1  c           	        K   	 | j                   j                  |d       d{   }|S 7 # t        $ r3}| j                  j	                  dddt        |      i       Y d}~yd}~ww xY ww)z
        Determine whether the page need to scroll

        Args:
            page: Playwright page object

        Returns:
            bool: True if page needs scrolling
        z
            () => {
                const scrollHeight = document.documentElement.scrollHeight;
                const viewportHeight = window.innerHeight;
                return scrollHeight > viewportHeight;
            }
            NzDFailed to check scroll need: {error}. Defaulting to True for safety.r7  r   r   T)rI   r   r   rF   rm   r2   )r-   r   r  r   s       r0   r   z/AsyncPlaywrightCrawlerStrategy.page_need_scroll  sw     	 $ 5 5d! K   	KK^Q(   
 	s6   A+, *, A+, 	A()A#A+#A((A+)NNN)rE   r   rF   r   rN   r!   )ro   r2   )rs   r2   rt   r   )rs   r2   )r|   r2   )r   zDict[str, str])r   )r   r   r   r2   r   rG  )r   r   r   r2   r   rG  r3   r2   r.   r2   r   r   r3   r   )r!  N)r   r   rn  rG  ro  zOptional[int])r   r   r   z'VirtualScrollConfig')r   r   r3   None)r   r   r3   bytes)r   r   r3   zOptional[str])r   r   r  r2   r3   z"List[Dict[str, Union[str, float]]]r+   )r   r2   r   r   r3   tuple)r   r=  r3   r2   )r   r   r3   r2   )r   r2   r3   r   )r   r   r   zUnion[str, List[str]]r3   Dict[str, Any])r!  )r   r   r4  rh  r5  rh  r8  rG  )r   r   r4  rh  r5  rh  r3   r?  )r   r   )r   r   r3   bool)(r4   r5   r6   r7   rO   rT   r^   rR   rY   rn   ru   rc   r}   r   r   r   r   r   r1   r   rm  rq  r  r   r  r   r  r  r  r
  r  r  r!  rr  r-  r`  r  r3  rf  r   r,   r8   r0   r:   r:   -   s   : sw*
+*
<G*
ao*
X	
3<"@4)(
%
>B EJ,,.1,<A,\>@0II 0I	IVO	#O	# 0O	#	O	#dL:\hV/b!F>@" " %(" 	+" J 59EE!1E	ENG*"IH]IBID2T7T7#8T7	T7lc7c7#8c7	c7J
0:7x
&r8   r:   c                      e Zd ZdZy)HTTPCrawlerErrorz5Base error class for HTTP crawler specific exceptionsNr4   r5   r6   r7   r,   r8   r0   rB  rB    s    ?r8   rB  c                      e Zd ZdZy)ConnectionTimeoutErrorz%Raised when connection timeout occursNrC  r,   r8   r0   rE  rE    s    /r8   rE  c                  $     e Zd ZdZd fdZ xZS )HTTPStatusErrorz"Raised for unexpected status codesc                >    || _         t        | 	  d| d|        y )NzHTTP r   )r   superrO   )r-   r   rk   	__class__s      r0   rO   zHTTPStatusError.__init__  s%    &5Ry9:r8   )r   rh  rk   r2   )r4   r5   r6   r7   rO   __classcell__)rJ  s   @r0   rG  rG    s    ,; ;r8   rG  c            	         e Zd ZU dZdZdZded<   dZded<    ed e	j                         xs d	d
z        Zded<   dZded<    eh d      Zded<    eddddddd      Zded<   ddeeef	 	 	 	 	 	 	 	 	 d(dZd)dZd*dZej*                  d        Zd+dZ	 	 	 	 	 	 	 	 	 	 d,dZd*d Zd*d!Zd-d"Zd.d#Zd/d$Zd0d%Z	 	 	 	 	 	 d1d&Z	 d2	 	 	 	 	 d3d'Z y)4AsyncHTTPCrawlerStrategyzW
    Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency.
    )rF   max_connectionsdns_cache_ttl
chunk_size_sessionrK   rE      z
Final[int]DEFAULT_TIMEOUTi   DEFAULT_CHUNK_SIZEr  r   r   DEFAULT_MAX_CONNECTIONS,  DEFAULT_DNS_CACHE_TTL>   rawfilehttphttpsr   VALID_SCHEMESz?text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8zen-US,en;q=0.5zgzip, deflate, brz
keep-alive1z<Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36)AcceptzAccept-LanguagezAccept-Encoding
ConnectionzUpgrade-Insecure-Requestsz
User-Agent_BASE_HEADERSNc                @   |xs
 t               | _        || _        || _        || _        || _        d| _        dD ci c]  }|t        | j                  |       c}| _	        | j                  dd        | j                  dd        | j                  dd        yc c}w )	z'Initialize the HTTP crawler with configN)before_requestafter_requeston_errorrb  c                      y r+   r,   rz   r/   s     r0   r  z3AsyncHTTPCrawlerStrategy.__init__.<locals>.<lambda>      r8   rc  c                      y r+   r,   rf  s     r0   r  z3AsyncHTTPCrawlerStrategy.__init__.<locals>.<lambda>  rg  r8   rd  c                      y r+   r,   rf  s     r0   r  z3AsyncHTTPCrawlerStrategy.__init__.<locals>.<lambda>  rg  r8   )r   rE   rF   rN  rO  rP  rQ  r'   _execute_hookrK   ru   )r-   rE   rF   rN  rO  rP  ks          r0   rO   z!AsyncHTTPCrawlerStrategy.__init__  s     -C0A0C.*$9= E
 wt))1--

 	&(DEo'CDj">?
s   Bc                B   K   | j                          d {    | S 7 wr+   rQ   rS   s    r0   rT   z#AsyncHTTPCrawlerStrategy.__aenter__  rU   rV   c                @   K   | j                          d {    y 7 wr+   rX   rZ   s       r0   r^   z"AsyncHTTPCrawlerStrategy.__aexit__  r_   r`   c                  K   	 | j                   s| j                          d {    | j                    y 7 # w xY wwr+   )rQ  rR   rS   s    r0   _session_contextz)AsyncHTTPCrawlerStrategy._session_context  s9     	==jjl""-- # s    ?: 8: ?: <?c                    || j                   v r%t        | j                  ||      | j                   |<   y t        d|       )Nrq   )rK   r'   rj  rr   )r-   rs   	hook_funcs      r0   ru   z!AsyncHTTPCrawlerStrategy.set_hook	  s>    

"$+D,>,>	9$UDJJy!29+>??r8   c                h   K   t        j                  |      r ||i | d {   S  ||i |S 7 wr+   )rx   ry   )r-   rs   rq  rz   r/   s        r0   rj  z&AsyncHTTPCrawlerStrategy._execute_hook	  s>      &&y1"D3F333$)&)) 4s   !202c                  K   | j                   srt        j                  | j                  | j                  dd      }t        j
                  t        | j                        |t        | j                              | _         y y w)NTF)limitttl_dns_cacheuse_dns_cacheforce_close)total)r   	connectorr   )
rQ  aiohttpTCPConnectorrN  rO  ClientSessionr   r`  r$   rS  )r-   ry  s     r0   rR   zAsyncHTTPCrawlerStrategy.start	  sm     }},,**"00"!	I $11T//0#%D,@,@ADM s   B Bc                r  K   | j                   rW| j                   j                  s@	 t        j                  | j                   j	                         d       d {    d | _         y y y 7 # t        j
                  $ r, | j                  r| j                  j                  dd       Y Jw xY w# d | _         w xY ww)NrF  r   zSession cleanup timed outCLEANUPrj   )rQ  closedrx   r   rY   r   rF   rm   rS   s    r0   rY   zAsyncHTTPCrawlerStrategy.close'	  s     ==!5!5	%&&t}}':':'<cJJJ !% "6=J'' ;;KK'' ;% (  !%sF   #B72A) A'A) 
B7'A) )<B(%B+ 'B((B+ +	B44B7c               X  K   t        j                  |d      4 d {   }|j                  | j                         d {   x}r4t	        |       |j                  | j                         d {   x}r4d d d       d {    y 7 n7 M7 7 # 1 d {  7  sw Y   y xY ww)Nrb)mode)aiofilesr   r   rP  
memoryview)r-   r   r   chunks       r0   _stream_filez%AsyncHTTPCrawlerStrategy._stream_file4	  s     ==D1 	( 	(Q!"!888%8 '' "#!888%8	( 	( 	(88	( 	( 	( 	(sh   B*BB*BB2B4B5B<B*BB*BBB*B'BB'#B*c                >  K   t         j                  j                  |      st        d|       g }| j	                  |      2 3 d {   }|j                  |j                         j                  dd             :7 56 t        dj                  |      i d      S w)Nr   r   r   r    r   r   r   r   )
r   r   r   r   r  r   tobytesr   r   r~  )r-   r   r  r  s       r0   _handle_filez%AsyncHTTPCrawlerStrategy._handle_file9	  s     ww~~d##&<TF$CDD,,T2 	M 	M%MM%--/000KL	M2 "
 	
s*   ABA?A=A?2B=A??Bc                &   K   t        |i d      S w)Nr   r  r   )r-   r:  s     r0   _handle_rawz$AsyncHTTPCrawlerStrategy._handle_rawG	  s     !
 	
s   c                    |sy|j                   }t        |dd      }t        |dd      }|r3|r1d|v r"|j                  dd      \  }}| d| d| d| S d| d| d| S |S )	z5Format ProxyConfig into aiohttp-compatible proxy URL.Nusernamepasswordz://r   :@r   )serverr  r|  )r-   proxy_configr  r  r  protocolrests          r0   _format_proxy_urlz*AsyncHTTPCrawlerStrategy._format_proxy_urlO	  s    $$<T:<T:!'eQ!7$"3xj(1TFCC 
!H:Qvh??r8   c           
     X	  K   | j                         4 d {   }t        |j                  xs | j                  dd      }t	        | j
                        }| j                  j                  r%|j                  | j                  j                         || j                  j                  | j                  j                  |d}d }|j                  r | j                  |j                        }||d<   | j                  j                  dk(  r^| j                  j                  r| j                  j                  |d<   | j                  j                  r| j                  j                  |d<    | j                   d	   ||       d {    	  |j"                  | j                  j                  |fi |4 d {   }t%        |j'                          d {         }	d
|j(                  cxk  rdk  sn t+        |j(                  d|       |j,                  }
|
s*t/        j0                  |	j3                               d   xs d}
t5        |	j3                         j7                  |
d      t	        |j                        |j(                  t9        |j:                              } | j                   d   |       d {    |cd d d       d {    cd d d       d {    S 7 7 b7 37 7 47 $7 # 1 d {  7  sw Y   nxY wn|# t<        j>                  $ r:} | j                   d   |       d {  7   tA        dt9        |             d }~wt<        jB                  $ r:} | j                   d   |       d {  7   tE        dt9        |             d }~wt<        jF                  $ r:} | j                   d   |       d {  7   tI        dt9        |             d }~wtJ        jL                  jN                  $ r:} | j                   d   |       d {  7   tA        dt9        |             d }~wtP        $ r:} | j                   d   |       d {  7   tI        dt9        |             d }~ww xY wd d d       d {  7   y # 1 d {  7  sw Y   y xY ww)Nr  rR  )rx  connect	sock_read)r   allow_redirectssslr   proxyPOSTr  jsonrb  r   rV  zUnexpected status code for r   r   r   r   )r   r   r   r?  rc  rd  zRequest timed out: zConnection failed: zHTTP client error: zHTTP request failed: ))ro  r$   r]  rS  r   r`  rE   r   updatefollow_redirects
verify_sslr  r  r   r  r  rK   r   r  r   r  rG  charsetchardetdetectr  r   r   r2   r.   rz  ServerTimeoutErrorrE  ClientConnectorErrorConnectionErrorClientErrorrB  rx   
exceptionsr   r   )r-   r.   r   sessionr   r   request_kwargs	proxy_urlr   r:  r   r   r   s                r0   _handle_httpz%AsyncHTTPCrawlerStrategy._handle_httpb	  s1    
 ((* J	I J	Ig#))AT-A-AG 4--.G""**t22::; ##'#6#6#G#G**55"	N I"" 2263F3FG	*3w'""))V3&&++-1-@-@-E-EN6*&&++-1-@-@-E-EN6*.$**-.sNCCC*I*7??4+>+>+E+Es]n] " "ai(x}})>?G8??8S8-$OO9#? 
  (//H##*>>'//2C#DZ#P#[T[/$__.55hy5Q)-h.>.>)?$,OO'*8<<'8	F 6$**_5f===!+" " "CJ	I J	I J	I< D")>& >)"CJ	IB" " " ". -- M,djj,Q///,/B3q6(-KLL// F,djj,Q///%(;CF8&DEE&& G,djj,Q///&)<SVH'EFF%%22 M,djj,Q///,/B3q6(-KLL I,djj,Q///&)>s1vh'GHHIQJ	I J	I J	I J	I J	Isf  R*KR*E$R?K! R,L1K$2L5K0K'C&K05K*6K0;LK,LR*K.R*!R$L'K0*K0,L.R*0L	6K97L	>LRR M2L53MR %N=N >NR 0O%O	O%% R P:P P::R Q;Q!Q;;R  RR*RR*R'RR'#R*c                T  K   |xs t        j                  |      }t        |      }|j                  j	                  d      }|| j
                  vrt        d|       	 |dk(  r#| j                  |j                         d {   S |dk(  r4|j                  d      r|dd  n|dd  }| j                  |       d {   S | j                  ||       d {   S 7 W7  7 # t        $ r<}| j                  r*| j                  j                  dd	t        |      |d
        d }~ww xY ww)N/zUnsupported URL scheme: rY  rX  r   r   r   zCrawl failed: {error}CRAWL)r   r.   r   )r   rH   r%   schemerstripr\  rr   r  r   r   r  r  r   rF   r   r2   )r-   r.   r   r/   parsedr  raw_contentr   s           r0   r1   zAsyncHTTPCrawlerStrategy.crawl	  s%     ?+77?#%%c*+++7x@AA	!..v{{;;;5 *-)Ac!"gs12w!--k:::!..sF;;; <
 ;; 	{{!!3%(VC8 " 
 	sx   AD(#C  CC  D(4C  ;C<C  ?D( C  CC  D(C  C  C   	D%)7D  D%%D()
rE   zOptional[HTTPCrawlerConfig]rF   zOptional[AsyncLogger]rN  rh  rO  rh  rP  rh  )r3   rM  )r3   r<  )rs   r2   rq  r   r3   r<  )
rs   r2   rq  r   rz   r   r/   r   r3   r   )r   r2   r3   z AsyncGenerator[memoryview, None])r   r2   r3   r   )r:  r2   r3   r   r:  r;  r+   )r.   r2   r   zOptional[CrawlerRunConfig]r3   r   )!r4   r5   r6   r7   	__slots__rS  __annotations__rT  rj  r   	cpu_countrU  rW  	frozensetr\  r&   r`  rO   rT   r^   
contextlibasynccontextmanagerro  ru   rj  rR   rY   r  r  r  r  r  r1   r,   r8   r0   rM  rM    s    tI"$OZ$%.
.*-b<2<<>3FQ!2K*LZL(+:+$%EFM5F+S+."%(T- M5  7;(,62,@3@ &@ 	@
 @ @6 ## $@	*	* 	* 		*
 	* 
	*%(


&OIOI !OI 
	OIh .2   + 
 
 r8   rM  )H
__future__r   rx   r  r   abcr   r   typingr   r   r   r	   r
   r   r   r   r   playwright.async_apir   r   r   rz  ior   PILr   r   r   rW  r   
js_snippetr   modelsr   r   r   r  r   r   r   async_loggerr   r=  r   user_agent_generatorr   rM   r    rN   r!   r"   r#   r  rz  r  aiohttp.clientr$   urllib.parser%   typesr&   r  	functoolsr'   r)   r:   r   rB  rE  rG  rM  r,   r8   r0   <module>r     s    "    # 3 3 2 2 	 , G  + +   & & . M M % + 2 + Q Q    ( ! "  3 B"%9 B"RD	y 	
	- 	
;& ;L3 Lr8   