
    ,iL                        d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
 ddlZddlmZ  ej                  e      ZdZ ej$                  d      Z ej$                  d	      Zd
ZdeiZdedefdZdej2                  dedee   fdZ	 ddej2                  dededee   fdZdej2                  dedee   fdZ	 	 ddedededeej2                     def
dZy)u   
scraper.py — Data collection from PageSpeed Insights, Brave Search, and direct web scraping.

All functions return dict or None on failure. Designed for async batch processing.
    N)Optional)urlparse
quote_plus)BeautifulSoupBSAmQFROygK29bcFmz88zpOXRwnHk3Q   total   zoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36z
User-Agenturlreturnc                 X    | sy| j                         } | j                  d      sd| z   } | S )z#Ensure URL has scheme and is clean. http://https://r   )strip
startswith)r   s    6/mnt/e/genesis-system/scripts/heatmap_audit/scraper.py_normalize_urlr   $   s/    
))+C>>123J    sessionc           	      t  K   t        |      }|syi }t        d      D ]  \  }}|dkD  rt        j                  d       d{    dt	        |       d| d}	 d}t        d      D ]  }| j                  |t        	      4 d{   }|j                  d
k(  rMd|z  dz   }	t        j                  d|	       t        j                  |	       d{    	 ddd      d{    |j                  dk7  r5t        j                  d||j                  |       	 ddd      d{     n,|j                          d{   }	 ddd      d{     n |<|j                  di       j                  di       }
|
j                  di       }|
j                  di       }|
j                  di       }|
j                  di       }|j                  di       j                  di       }|j                  di       j                  dd      }|j                  di       j                  dd      }|j                  di       j                  dd      }|j                  di       j                  dd      }|j                  di       j                  dd      }|j                  di       }|j                  dd      dk(  }|}t        |j                  dd      dz        || d <   t        |j                  dd      dz        || d!<   t        |j                  dd      dz        || d"<   t        |j                  dd      dz        || d#<   t        |      || d$<   t        |      || d%<   t        |      || d&<   t        |      || d'<   t        |d      || d(<   |d)k(  r||d*<    |r|S dS 7 p7 07 7 7 7 7 v# 1 d{  7  sw Y   sxY w# t        j                  $ r t        j                  d+||       Y t        $ r#}t        j                  d,|||       Y d}~d}~ww xY ww)-z
    Fetch Google PageSpeed Insights data for a URL.
    Returns dict with mobile_score, desktop_score, load_time, fcp, lcp, etc.
    N)mobiledesktopr   g      ?zKhttps://pagespeedonline.googleapis.com/pagespeedonline/v5/runPagespeed?url=z
&strategy=zQ&category=PERFORMANCE&category=ACCESSIBILITY&category=BEST_PRACTICES&category=SEO   )timeouti        z&PageSpeed rate limited, waiting %ds...   zPageSpeed %s returned %d for %slighthouseResult
categoriesperformanceseoaccessibilityzbest-practicesauditszfirst-contentful-paintnumericValuezlargest-contentful-paintzspeed-indexinteractivezcumulative-layout-shiftviewportscored   _performance_score
_seo_score_accessibility_score_best_practices_score_fcp_ms_lcp_ms_speed_index_ms_tti_ms_clsr   has_mobile_viewportzPageSpeed %s timed out for %szPageSpeed %s error for %s: %s)r   	enumerateasynciosleepr   rangegetPAGESPEED_TIMEOUTstatusloggerinfowarningjsonroundTimeoutError	Exception)r   r   resultsistrategyapi_urldataattemptrespwaitr"   perfseo_cata11ybpr&   fcplcpspeed_indextticls_valviewport_audithas_viewportprefixes                            r   fetch_pagespeedrY   1   sw     
CG !67 KN8q5--$$$sO$Jxj_a 	
C	ND 8 ";;%6 '   {{c) G|a/$LdS%mmD111    {{c)=xVY    "&,D  " | "4b9==lBOJ>>-4D nnUB/G>>/26D 0"5B XX0"599(BGF**5r:>>~qQC**7<@@QRSC **]B7;;NANK**]B/33NAFCjj!:B?CCNTUVG $ZZ
B7N)--gq9Q>LF5:!$s*6Gvh012 .3GQ'#-.Gvhj)* 8=!$s*8Gvh234 9>w"S(9Gvh345 +0*Gvhg&'*/*Gvhg&'272DGvho./*/*Gvhg&'',Wa'8GvhdO$8#1=-.MKNZ 7'4'W % 2  -   x ## 	KNN:HcJ 	NNN:Hc1MM	Ns   A P8N2P8+ON5
O	AON8
OO N;!O&2OO#N>$O*O=O
>OOO	OP8IO)
P85O8O;O>OOOOOOO*P5P8
P5P0*P80P55P8business_namelocationc           
        K   |sy| d| }dt        |       d}ddt        d}	 | j                  ||t              4 d{   }|j                  d	k7  r3t
        j                  d
|j                  |       	 ddd      d{    y|j                          d{   }|j                  di       j                  dg       }|j                  di       j                  dd      }	ddddddddd}
g }d}d}g d}|D ]  }|j                  dd      j                         }|j                  dd      j                         }|
j                         D ]  \  }}||v s|j                  |        d|v sd|v rd}|D ]
  }||v sd} n d |v sd!|v sd} d}|D ]i  }|j                  dd      }t        j                  d"|t        j                        r|d#z  }t        j                  d$|t        j                        se|d#z  }k t        |	d%      t        |      t!        t#        |            t        t#        |            |||d&cddd      d{    S 7 7 7 7 # 1 d{  7  sw Y   yxY w# t$        j&                  $ r t
        j                  d'|       Y yt(        $ r!}t
        j                  d(||       Y d}~yd}~ww xY ww))z~
    Search Brave for the business to assess online presence.
    Returns result_count, social_links found, snippet data.
    N z1https://api.search.brave.com/res/v1/web/search?q=z	&count=10zapplication/jsongzip)AcceptzAccept-EncodingzX-Subscription-Token)headersr   r    zBrave Search returned %d for %swebrD   totalEstimatedMatchesr   Facebook	InstagramTwitterz	Twitter/XLinkedInYouTubeTikTokYelp)facebook.cominstagram.comtwitter.comx.comlinkedin.comyoutube.com
tiktok.comzyelp.comF)
yellowpages	truelocalhotfroglocalsearchoneflarehipagesserviceseeking	airtaskerproductreviewwordofmouthr   r   descriptionzgoogle.com/mapszbusiness.googleTzgoogle reviewzgoogle ratingz\d+\s*reviewr   z\d(\.\d)?\s*stari'  )search_result_counttop_resultssocial_profilessocial_profile_counthas_google_businesshas_directory_listingsreview_mentionszBrave Search timed out for %szBrave Search error for %s: %s)r   BRAVE_API_KEYr:   REQUEST_TIMEOUTr<   r=   r?   r@   loweritemsappendresearch
IGNORECASEminlenlistsetr7   rB   rC   )r   rZ   r[   queryrG   r`   rJ   rH   web_resultstotal_resultssocial_platformssocial_linksr   r   directory_domainsresult
result_urlresult_descdomainplatformdr   descrX   s                           r   fetch_brave_searchr      s     oQxj)EA*UBSATT]^G %! -GN;;Wo  
 F	 F	{{c!@$++uUF	 F	 F	 $D((5"-11)R@K HHUB/334KQOM !+!,($ *(&"	  L"'%*"! & /#ZZr288:
$jj;AAC(8(>(>(@ 6$FH+$++H56 %
26G:6U*.'* AJ15. #k1_5S*.'%/*  O% )zz-499_dBMMB#q(O990$F#q(O) (+=%'@";/#'L(9#:(+C,=(>':*@#2}F	 F	 F	 F	 %F	 F	 F	 F	P  6F 6qIs   $KI6 II6 2I!9I6 II6 	K
I!IB<I!&I!I!A,I!=AI!I6 II6 KI6 I6 I!I6 !I3'I*(I3/I6 2K3I6 6)K
K!K
)K KK

Kc           	        :K   t        |      }|sy	 t        |      }|j                  dk(  }|}|sh|j                  ddd      }	 | j	                  |t        j                  d      d	      4 d{   }|j                  d
k  rd}|}ddd      d{    | j                  |t        t        dd      4 d{   }|j                  d
k\  rBt        j                  d|j                  |       ||j                  ddcddd      d{    S |j                  d       d{   }t        |j                         }	|	j#                  d      rd}ddd      d{    t%        d      }
|
j'                  d      }|r|j)                  d      nd}d}|
j'                  ddt+        j,                  dt*        j.                        i      }|r|j                  dd      }|
j'                  ddt+        j,                  dt*        j.                        i      }|du}|
j1                  d      }|
j1                  d      }t3        |      }t3        |      }|r|d   j)                  d      nd}|
j1                  d      }t5        d |D              }t3        |      }|
j1                  d d!      }|D cg c]F  }|d"   j#                  d#      r0t        |d"         j6                  t        |      j6                  k7  r|H }}g d$}g }|D ]Q  }|j                  d"d      j9                         }|D ]*  } | |v s|j;                  | j=                  d%      d          , S |
j1                  d&d'd(i      }!t3        |!      dkD  }"|
j1                  dd)t+        j,                  d*t*        j.                        i      }#t3        |#      dkD  }$g d+}%|j9                         :t?        :fd,|%D              }&g d-}'t?        :fd.|'D              }(t+        j,                  d/      })|)jA                  |      }*t3        |*      dkD  }+|
j'                  d0      },|,r|,j)                  d1d2      nd}-t3        |-j=                               }.i d3d4d
d5d6d4d
d5d7d8d9d5d:d8d9d5d;d<d=d5d>d?d@d5dAdAdBd5dCdDd@d5dEdFd=d5dGdHd=d5dIdJdKd5dLdMdNd5dOdPdQd5dRdSdTd5dUdVdWd5dXdYdQd5dZd[d\d5d]d^dTd5i}/d}0d}1|
j'                  ddt+        j,                  d_t*        j.                        i      }2|2r |2j                  dd      j9                         nd}3|
j'                  d`      }4|4r!|4j)                  d1d2      j9                         nd}5: d1|3 d1|5 }6|/jC                         D ]  \  }7}8|7|6v s|8d   }0|8da   }1 n i dbddcd@dd|de|df|dd@ dgtE        |      dh|dd= ditE        |      dj|dk|dd@ dl|dm|dn|do|"dp|$dq|&dr|(|+tG        tI        |            t3        |      |.	|0|1dsS 7 7 # 1 d{  7  sw Y   xY w# t        $ r Y w xY w7 7 I7 17 # 1 d{  7  sw Y   xY wc c}w # tJ        jL                  $ r t        j                  dt|       dddducY S t        $ r'}9t        j                  dv||9       dddducY d}9~9S d}9~9ww xY ww)wzO
    Scrape basic website data: SSL, viewport, title, meta, headings, etc.
    Nhttpsr   r   r      r	   T)r   allow_redirectsi  F)r`   r   r   sslzWebsite returned %d for %s)has_sslstatus_codeis_reachablereplace)errorszhtml.parsertitle)r   r   metanamer{   )attrscontentr)   h1h2r   imgc              3   D   K   | ]  }|j                  d       rd  yw)altr   N)r:   ).0r   s     r   	<genexpr>z'fetch_website_basics.<locals>.<genexpr>H  s      KsCGGEN Ks     a)hrefr   r   )rj   rk   rl   rm   rn   ro   rp   .scripttypezapplication/ld+jsonpropertyz^og:)
livechattawkintercomdriftcrispzendeskhubspotolark	freshchattidioc              3   &   K   | ]  }|v  
 y wN )r   cipage_text_lowers     r   r   z'fetch_website_basics.<locals>.<genexpr>m  s     NbO3N   )calendlyacuitybookingsscheduleappointmentzbook-nowbook_nowc              3   &   K   | ]  }|v  
 y wr   r   )r   bir   s     r   r   z'fetch_website_basics.<locals>.<genexpr>t  s     MB"/Mr   zJ(\+?\d{1,4}[\s\-]?\(?\d{1,4}\)?[\s\-]?\d{2,4}[\s\-]?\d{2,4}[\s\-]?\d{0,4})bodyr]   )	separatorr   rt   LocalSearch)r   est_costzlocalsearch.com.ausensiszSensis/Yellow Pagesi^  zyellowpages.com.auzyell.comYelli,  ru   Oneflarer    rv      servicecentralServiceCentralrr   zTrueLocal/SensiswebjetWebjetsquarespacezSquarespace (DIY)(   zwix.comz	Wix (DIY)#   weeblyzWeebly (DIY)r   zgodaddy.com/websiteszGoDaddy Builder   shopifyShopify2   zwordpress.comzWordPress.comz
wp-contentzWordPress (self-hosted)   zdeveloper.starterzGoDaddy Website Builder	generatorfooterr   r   r   r   r5   	title_tag	has_titlemeta_descriptionhas_meta_descriptionh1_counth1_texth2_counttotal_imagesimages_without_althas_schema_markuphas_og_tagshas_chat_widgethas_booking_system)has_phone_visiblepage_social_linksexternal_link_count
word_count	final_urldetected_providerest_monthly_costzWebsite timed out for %s)r   r   r   zWebsite scrape error for %s: %s)'r   r   schemer   headaiohttpClientTimeoutr<   rC   r:   HEADERSr   r=   r?   textstrr   r   r   findget_textr   compileIfind_allr   sumnetlocr   r   splitanyfindallr   boolr   r   r7   rB   );r   r   parsedr   	fetch_url	https_url	test_resprJ   htmlr   soupr   
title_text	meta_descmeta_tagviewport_tagrV   h1_tagsh2_tagsr   r   r   imagesr   r   	all_linksr   external_linkssocial_domainsr   linkr   sdschema_scripts
has_schemaog_tagsr   chat_indicatorsr   booking_indicatorshas_bookingphone_patternphones_on_pager   r   	body_textr   provider_signaturesr   r   meta_generatorgenerator_text	footer_elfooter_text
check_textsigr>   rX   r   s;                                                             @r   fetch_website_basicsr0     sH     
CCK#--7* 	Iz1=I	"<<w'<'<1'E$( (  . .  ''#-"&$-	. . ;;#   
 	 	 {{c!;T[[#N&#';;$)	 	 	 )44DDHHI ##J/)	 	, T=1 IIg&	7@Y''d'3b
 	99VFBJJ~rtt4T+U9V Y3I yy

;PRPTPT8U/VyW#4/ --%--%w<w<5<'!*%%D%1" u%  Kf KK6{ MM#DM1	 
y##$;<6#**hy.A.H.HH 
 

  	?D88FB'--/D$ ?:%,,RXXc]1-=>?	? x@U7VW(1,
 --z2::grtt;T.U-V'lQ&
 **,NoNN
 M:LMM 

Y
 '..t4/!3 yy @DDMMCtM<"	*+

MsC
 =c"J
 4#F
 !+@c"R	

 S9
 =
 	s;
 '7SI
 "4#F
 xS9
 $7RH
 <
 ~2>
 #->B$O
 	r:
  o2F!
" #<#N#
$  *CQS!T%
* !6&"**\SUSWSW:X1YZFT++Ir:@@BZ\IIh'	OXi((3d(CIIK^`'(.)9;-H
,224 	ICj $(L!#'
#3 		
D
3
 w
 "<	

 DS)
 j)
 	$3
 #DO
 
 wt}
 
 L
 !"4
  
 ;
  !
" !+#
$ "3!%c*;&<!=#&~#6$"!2 01
 	
s. . . . .  	 	 5	 	 	 	h
j  K137 %%JJ K8#qA %%JJKsA  [1Y8 ,Y 1X%2Y 5X+	Y X(Y "Y8 ;Y<Y8 ?A Y?Y8 YY8 [Y&Y',YY8 YEY8 4AY3?6Y8 6I,Y8 #BY8 $[%Y (Y +X>1X42X>9Y 	Y
Y8 YY8 Y8 YY8 Y0#Y&$Y0+Y8 8/['[)[1[[[[[c                   K   d}|/t        j                  dd      }t        j                  |      }d}	 t        ||       }t	        |||      }t        ||       }t        j                  |||d       d{   \  }	}
}t        |	t              rt        j                  d|	       d}	t        |
t              rt        j                  d	|
       d}
t        |t              rt        j                  d
|       d}|| ||	xs i |
xs i |xs i d|r|j                          d{    S S 7 7 # |r|j                          d{  7   w w xY ww)z
    Run all scrapers concurrently for a single business.
    Returns combined data dict with keys: pagespeed, brave, website.
    FN
   )r   limit)	connectorT)return_exceptionszPageSpeed exception: %szBrave exception: %szWebsite exception: %s)rZ   r   r[   	pagespeedbravewebsite)r   TCPConnectorClientSessionrY   r   r0  r7   gather
isinstancerC   r=   r?   close)r   rZ   r[   r   close_sessionr4  pagespeed_task
brave_taskwebsite_taskpagespeed_data
brave_datawebsite_datas               r   
scrape_allrE    sJ     M((U"=	'')<!"(#6'I
+GS99@J":
 4
0
L ni0NN4nE!Nj),NN0*=JlI.NN2LAL + '-2%2#)r
 --/!! 54
6 " --/!! sN   4EAD- 8D)9BD- E"D+#E)D- +E-EEEE)	Australia)rF  N)__doc__r7   r@   r   r   loggingtypingr   urllib.parser   r   r   bs4r   	getLogger__name__r=   r   r   r   r;   
USER_AGENTr   r  r   r:  dictrY   r   r0  rE  r   r   r   <module>rP     sn     
 	   -  			8	$
 2''''b1)G))3 9 
 
$  Z(""Z(),Z(d^Z(F  c""cc c d^	cRMK""MK),MKd^MKl  /3	1"	1"1" 1" g++,	1"
 
1"r   