
    /i%                         d dl mZmZ d dlmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ d dlZd dlmZ  ej"                  d	      Zd
ededefdZ G d de      Z G d de      Zy)    )ABCabstractmethod)OptionalDictAnyTuple   )MarkdownGenerationResult)CustomHTML2Text)RelevantContentFilterN)urljoinz+!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)baseurlreturnc                     |j                  d      r|S |j                  d      r| j                  d      r| dd |z   S | |z   S t        | |      S )z"Fast URL joining for common cases.)http://https://mailto:z///N)
startswithendswithr   )r   r   s     a/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/crawl4ai/markdown_generation_strategy.pyfast_urljoinr      sS    
~~>?

~~c==9s?"cz4    c                       e Zd ZdZ	 	 	 	 ddee   deeeef      de	defdZ
e	 	 	 	 dded	ed
eeeef      dee   de	defd       Zy)MarkdownGenerationStrategyz7Abstract base class for markdown generation strategies.Ncontent_filteroptionsverbosecontent_sourcec                 D    || _         |xs i | _        || _        || _        y )N)r   r   r    r!   )selfr   r   r    r!   s        r   __init__z#MarkdownGenerationStrategy.__init__   s&     -}",r   
input_htmlbase_urlhtml2text_options	citationsr   c                      y)z/Generate markdown from the selected input HTML.N )r#   r%   r&   r'   r   r(   kwargss          r   generate_markdownz,MarkdownGenerationStrategy.generate_markdown)   s     	r   )NNFcleaned_html) NNT)__name__
__module____qualname____doc__r   r   r   strr   boolr$   r   r
   r,   r*   r   r   r   r      s    A ;?,0,
- !67
- $sCx.)
- 	
-
 
-  6::>

 
 $DcN3	

 !!67
 
 
"
 
r   r   c                        e Zd ZdZ	 	 	 ddee   deeeef      def fdZ		 ddedede
eef   fd	Z	 	 	 	 	 dd
ededeeeef      deeeef      dee   dedefdZ xZS )DefaultMarkdownGeneratora  
    Default implementation of markdown generation strategy.

    How it works:
    1. Generate raw markdown from cleaned HTML.
    2. Convert links to citations.
    3. Generate fit markdown if content filter is provided.
    4. Return MarkdownGenerationResult.

    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
        content_source (str): Source of content to generate markdown from. Options: "cleaned_html", "raw_html", "fit_html". Defaults to "cleaned_html".

    Returns:
        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
    r   r   r!   c                 ,    t         |   ||d|       y )NF)r    r!   )superr$   )r#   r   r   r!   	__class__s       r   r$   z!DefaultMarkdownGenerator.__init__J   s     	%P^_r   markdownr&   r   c                 @   i }i }g }d}d}t         j                  |      D ]  }|j                  |||j                                 |j	                         \  }	}
}|r)|
j                  d      s|
|vrt        ||
      ||
<   ||
   }
|
|vrOg }|r|j                  |       |	r|	|k7  r|j                  |	       ||rddj                  |      z   ndf||
<   |dz  }||
   d   }|j                  |j                  d      j                  d      s|	 d| d	nd
|	 d| d       |j                         } |j                  ||d        dj                  |      }dg}|j                  d t        |j                         d       D               |dj                  |      fS )a  
        Convert links in markdown to citations.

        How it works:
        1. Find all links in the markdown.
        2. Convert links to citations.
        3. Return converted markdown and references markdown.

        Note:
        This function uses a regex pattern to find links in markdown.

        Args:
            markdown (str): Markdown text.
            base_url (str): Base URL for URL joins.

        Returns:
            Tuple[str, str]: Converted markdown and references markdown.
        r   r	   )r   r   r   z: z - r.   !   ⟨u   ⟩z![u   ⟩]Nz

## References

c              3   >   K   | ]  \  }\  }}d | d| | d  yw)r=   u   ⟩ 
Nr*   ).0r   numdescs       r   	<genexpr>zFDefaultMarkdownGenerator.convert_links_to_citations.<locals>.<genexpr>   s3      
 [c4 #d3%vR(
   c                     | d   d   S )Nr	   r   r*   )xs    r   <lambda>zEDefaultMarkdownGenerator.convert_links_to_citations.<locals>.<lambda>   s    1Q4PQ7 r   )key)LINK_PATTERNfinditerappendstartgroupsr   r   joingroupendextendsorteditems)r#   r:   r&   link_map	url_cachepartslast_endcountermatchtextr   titlerB   rA   converted_text
referencess                   r   convert_links_to_citationsz3DefaultMarkdownGenerator.convert_links_to_citationsR   s   * 	!**84 	#ELL(U[[];<$||~D#u /Q Ri'%1(C%@IcNn("KK&DEMKK%!(T$D1A*Ar R13-"CLL{{1~005 &C5$$s3%t,
 yy{H3	#6 	Xhi() ..
 
$*8>>+;AR$S
 	

 rwwz222r   r%   r'   r(   c           	         	 t        |      }ddddddddd}	|r|	j                  |       n;|r|	j                  |       n'| j                  r|	j                  | j                          |j                  di |	 |sd}nt	        |t
              st        |      }	 |j                  |      }
|
j                  d	d
      }
|
}d}|r	 | j                  |
|      \  }}d}d}|s| j                  rK	 |xs | j                  }|j                  |      }dj                  d |D              }|j                  |      }t        |
xs d|xs d|xs d|xs d|xs d      S # t        $ r}dt        |       }
Y d}~d}~ww xY w# t        $ r}|
}dt        |       }Y d}~d}~ww xY w# t        $ r}dt        |       }d}Y d}~d}~ww xY w# t        $ r(}dt        |       }t        ||ddd      cY d}~S d}~ww xY w)a  
        Generate markdown with citations from the provided input HTML.

        How it works:
        1. Generate raw markdown from the input HTML.
        2. Convert links to citations.
        3. Generate fit markdown if content filter is provided.
        4. Return MarkdownGenerationResult.

        Args:
            input_html (str): The HTML content to process (selected based on content_source).
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
            content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
            citations (bool): Whether to generate citations.

        Returns:
            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
        )baseurlr   FT)
body_widthignore_emphasisignore_linksignore_imagesprotect_linkssingle_line_break	mark_codeescape_snobr.   z#Error converting HTML to markdown: Nz    ```z```zError generating citations: r?   c              3   >   K   | ]  }d j                  |        yw)z<div>{}</div>N)format)r@   ss     r   rC   z=DefaultMarkdownGenerator.generate_markdown.<locals>.<genexpr>   s       .67..q1.rD   zError generating fit markdown: )raw_markdownmarkdown_with_citationsreferences_markdownfit_markdownfit_htmlzError in markdown generation: r*   )r   updater   update_params
isinstancer3   handle	Exceptionreplacer^   r   filter_contentrN   r
   )r#   r%   r&   r'   r   r   r(   r+   hdefault_optionsrl   erm   rn   ro   filtered_html	error_msgs                    r   r,   z*DefaultMarkdownGenerator.generate_markdown   sI   <R	1A#( %!&!&%)!$	O !&&'89&&w/&&t||4AOO.o. 

C0 _
N xx
3 (//	5AL ,8#')R 77hO/+ +-L+-M!4!4	'%3%Jt7J7JN$2$A$A*$MM$(II .;H. %M $%88M#:L
 ,)/R(?(E2$7$=2)/R&," C  N!DSVHMN ! R.:+,HQ*Q'R ! '%DSVH#ML$&M'  		8QAI+&(1$& 		s   BG E ,G E? G -A
F% 7#G 	E<$E72G 7E<<G ?	F"FG F""G %	G.G>G GG 	G<G71G<7G<)NNr-   )r.   )r.   NNNT)r/   r0   r1   r2   r   r   r   r3   r   r$   r   r^   r4   r
   r,   __classcell__)r9   s   @r   r6   r6   7   s    ( ;?,0,	` !67` $sCx.)` 	` .0@3@3'*@3	sCx@3J 6:,0:>pp p $DcN3	p
 $sCx.)p !!67p p 
"pr   r6   )abcr   r   typingr   r   r   r   modelsr
   	html2textr   content_filter_strategyr   reurllib.parser   compilerI   r3   r   r   r6   r*   r   r   <module>r      se    # - - , & : 	   rzzHI	s 	 	 	 :M9 Mr   