
    iN                     f    d Z ddlmZ ddlmZ ddlmZmZmZ ddl	m
Z
 e
 G d de             ZdgZy)	z
Processor class for MarkupLM.
   )
TensorType)ProcessorMixin)BatchEncodingPaddingStrategyTruncationStrategy)auto_docstringc                         e Zd ZdZ fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedeez  ez  deez  e	z  de
dz  de
d	e
dz  d
edz  dedz  dedededededeez  dz  defd       Z xZS )MarkupLMProcessorTc                 &    t         |   ||       y )N)super__init__)selffeature_extractor	tokenizer	__class__s      l/mnt/e/genesis-system/.venv/lib/python3.12/site-packages/transformers/models/markuplm/processing_markuplm.pyr   zMarkupLMProcessor.__init__   s    *I6    Nadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                    | j                   r:|t        d      |||t        d      | j                  |      }|d   }|d   }n|t        d      ||t        d      || j                   rt        |t              r|g} | j
                  di d||n|d	||ndd|d
|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d||}|S )a  
        html_strings (`str` or `list[str]`, *optional*):
            Raw HTML strings to parse and process. When `parse_html=True` (default), these strings are parsed
            to extract nodes and xpaths automatically. If provided, `nodes`, `xpaths`, and `node_labels` should
            not be provided. Required when `parse_html=True`.
        nodes (`list[list[str]]`, *optional*):
            Pre-extracted HTML nodes as a list of lists, where each inner list contains the text content of nodes
            for a single document. Required when `parse_html=False`. Should not be provided when `parse_html=True`.
        xpaths (`list[list[str]]`, *optional*):
            Pre-extracted XPath expressions corresponding to the nodes. Should be a list of lists with the same
            structure as `nodes`, where each XPath identifies the location of the corresponding node in the HTML
            tree. Required when `parse_html=False`. Should not be provided when `parse_html=True`.
        node_labels (`list[list[int]]`, *optional*):
            Labels for the nodes, typically used for training or fine-tuning tasks. Should be a list of lists
            with the same structure as `nodes`, where each label corresponds to a node. Optional and only used
            when `parse_html=False`.
        questions (`str` or `list[str]`, *optional*):
            Question strings for question-answering tasks. When provided, the tokenizer processes questions
            as the first sequence and nodes as the second sequence (text_pair). If a single string is provided,
            it is converted to a list to match the batch dimension of the parsed HTML.
        NzDMake sure to pass HTML strings in case `parse_html` is set to `True`zUPlease don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`nodesxpathsz@You have passed HTML strings but `parse_html` is set to `False`.zIMake sure to pass nodes and xpaths in case `parse_html` is set to `False`text	text_pairnode_labelsr   r   r   r   r   r   r   r   r   r   r   r   r    r!    )
parse_html
ValueErrorr   
isinstancestrr   )r   html_stringsr$   r%   r(   	questionsr   r   r   r   r   r   r   r   r   r   r   r   r    r!   kwargsfeaturesencoded_inputss                          r   __call__zMarkupLMProcessor.__call__   s   \ ??# !ghh F$6+:Q k  --l;HW%Eh'F' !cdd} !lmm  T__)S)&K	' 
'3
(4e$
 
 $	

  2
 
 "
 "
 
  2
 #8
 #8
 '@
 (B
 $:
  (!
" #
$ *'
, r   )NNNNNTFNN    NNNFFFFTN)__name__
__module____qualname__r*   r   r   boolr-   r   r   intr   r   r3   __classcell__)r   s   @r   r
   r
      s0   J7  #'056:!%)--1-1*/+0',#26)Z !Z o-Z 3J!33Z $JZ Z  $JZ  $d{Z  $d{Z $(Z  %)!Z" !%#Z$ %Z& 'Z( j(4/)Z, 
-Z Zr   r
   N)__doc__
file_utilsr   processing_utilsr   tokenization_utils_baser   r   r   utilsr   r
   __all__r)   r   r   <module>rA      sD    % . Y Y # a a aH 
r   