o
    Dh                     @   s   d Z ddlZddlmZmZ ddlmZmZmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ d
e
de
fddZdedeeeef fddZddededefddZdS )z<
Module regrouping baseline and basic extraction functions.
    N)AnyTuple)_ElementElement
SubElement)HtmlElement   )BASIC_CLEAN_XPATH)	load_htmltrim)delete_elementtreereturnc                 C   s   t | D ]}t| q| S )z-Remove a few section types from the document.)r	   r   )r   elem r   U/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/baseline.pybasic_cleaning   s   
r   filecontentc              	   C   sV  t | }td}|du r|ddfS d}|dD ]M}|jred|jv rezt|jdd}W n ty9   d}Y nw |red|v rQt |}|durNt|	 nd}nt|}|t
|d_||rbd	| n|7 }qt|d
krs||t|fS t|}d}|dD ]}t|	 }t|d
kr|t
|d_||rd	| n|7 }q~t|dkr||t|fS t }	d}|ddddddD ]!}
t|
	 }||	vr|t
|d_||rd	| n|7 }|	| qt|d
kr||t|fS td}|d}|durt
|d}dd | D }ddd |D |_||jt|jfS t|dd}|t
|d_||t|fS )a)  Use baseline extraction function targeting text paragraphs and/or JSON metadata.

    Args:
        filecontent: HTML code as binary string or string.

    Returns:
        A LXML <body> element containing the extracted paragraphs,
        the main text as string, and its length as integer.

    bodyN r   z&.//script[@type="application/ld+json"]articleBodyz<p>p d   z
.//article
blockquotecodepreqquote.//bodyc                 S   s   g | ]}t |qS r   )r   .0er   r   r   
<listcomp>^       zbaseline.<locals>.<listcomp>
c                 S   s   g | ]}|r|qS r   r   r    r   r   r   r#   _   r$   F)clean)r
   r   iterfindtextjsonloadsget	Exceptionr   text_contentr   lenr   setiteraddfinditertextjoinhtml2txt)r   r   postbody	temp_textr   	json_bodyparsedr(   article_elemresultselemententry	body_elemp_elem
text_elemsr   r   r   baseline   sj   




rA   Tcontentr&   c                 C   sL   t | }|du r
dS |d}|du rdS |rt|}d|   S )zRun basic html2txt on a document.

    Args:
        content: HTML document as string or LXML element.
        clean: remove potentially undesirable elements.

    Returns:
        The extracted text in the form of a string or an empty string.

    Nr   r   r   )r
   r2   r   r4   r-   splitstrip)rB   r&   r   r   r   r   r   r5   h   s   
r5   )T)__doc__r)   typingr   r   
lxml.etreer   r   r   	lxml.htmlr   settingsr	   utilsr
   r   xmlr   r   strintrA   boolr5   r   r   r   r   <module>   s    O