o
    Dh0                     @   s6  d Z ddlmZ ddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZ zddlmZ eedZW n eyD   ddlmZ e pAdZY nw dd	lmZ dd
lmZmZmZ ddlmZ g dZeedhB Z	ddde
e de
e defddZe ZdddddddddZ G dd dZ!deded e
e de!fd!d"Z"dfd$e#deeef fd%d&Z$G d'd( d(Z%e&ed)Z'd*Z(d+Z)d,Z*d-Z+d.Z,h d/Z-g d0Z.g d1Z/ed2Z0e1g d3Z2i d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcZ3dS )gz<
Listing a series of settings that are applied module-wide.
    )ConfigParser)datetime)unescape)AnyDictListOptionalSet)sched_getaffinity)	cpu_count   )Path)_ElementElementXPath)line_processing)csvjsonhtmlmarkdowntxtxmlxmlteipythonNfilenameconfigreturnc                 C   sP   |dur|S | du rt ttjd } n
t|  stdt }||  |S )zE
    Use configuration object or read and parse a settings file.
    Nzsettings.cfgz$The given config file does not exist)strr   __file__parentis_fileFileNotFoundErrorr   read)r   r    r#   U/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/settings.py
use_config   s   
r%   MIN_EXTRACTED_SIZEMIN_OUTPUT_SIZEMIN_OUTPUT_COMM_SIZEMIN_EXTRACTED_COMM_SIZEMIN_DUPLCHECK_SIZEMAX_REPETITIONSMAX_FILE_SIZEMIN_FILE_SIZE)min_extracted_sizemin_output_sizemin_output_comm_sizemin_extracted_comm_sizemin_duplcheck_sizemax_repetitionsmax_file_sizemin_file_sizec                ,   @   s  e Zd ZdZg dZeddddddddddddddddddddded	ed
ededededededededede	e de	e de	e dededede	e
e  de	e
e  de	eeef  f(ddZde	e de	e ddfdd Zd!eddfd"d#Zdeddfd$d%ZdS )&	Extractorz0Defines a class to store all extraction options.)r   formatfastfocuscomments
formattinglinksimagestablesdeduplangr.   r/   r0   r1   r2   r3   r4   r5   max_tree_sizesourceurlwith_metadataonly_with_metadatatei_validationdate_paramsauthor_blacklisturl_blacklistr   FTN)r   output_formatr8   	precisionrecallr:   r;   r<   r=   r>   r?   r@   rC   rB   rD   rE   rF   rH   rI   rG   r   rJ   r8   rK   rL   r:   r;   r<   r=   r>   r?   r@   rC   rB   rD   rE   rF   rH   rI   rG   c                C   s   |  || | | | | || _|rdn|rdnd| _|| _|p'| jdk| _|| _|	| _	|
| _
|| _|| _|| _|| _|| _|pEt | _|pKt | _|pX|pXt|pX|dk| _|pdt| jdd| _d | _d S )NrL   rK   balancedr   r   DEFAULTEXTENSIVE_DATE_SEARCH)_set_source_set_format_add_configr8   r9   r:   r7   r;   r<   r=   r>   r?   r@   rC   rE   rF   setrH   rI   boolrD   set_date_paramsr   
getbooleanrG   rA   )selfr   rJ   r8   rK   rL   r:   r;   r<   r=   r>   r?   r@   rC   rB   rD   rE   rF   rH   rI   rG   r#   r#   r$   __init__e   s:   


zExtractor.__init__r   c                 C   s$   |p|}|o| ddd| _dS )z)Set the source attribute in a robust way.zutf-8replaceN)encodedecoderB   )rW   rC   rB   r#   r#   r$   rP      s   zExtractor._set_sourcechosen_formatc                 C   s*   |t vrtddtt  || _dS )z;Store the format if supported and raise an error otherwise.z#Cannot set format, must be one of: z, N)SUPPORTED_FORMATSAttributeErrorjoinsortedr7   )rW   r\   r#   r#   r$   rQ      s
   
zExtractor._set_formatc                 C   s0   t  D ]\}}t| ||d| q|| _dS )z&Store options loaded from config file.rN   N)CONFIG_MAPPINGitemssetattrgetintr   )rW   r   keyvaluer#   r#   r$   rR      s   
zExtractor._add_config)__name__
__module____qualname____doc__	__slots__DEFAULT_CONFIGr   r   rT   r   r	   r   rX   rP   rQ   rR   r#   r#   r#   r$   r6   ?   s    '	



6r6   argsrC   c                 C   s`   t t| jd| j| j| j| j| j| j| j	| j
|| j| j| jd}dD ]}t||t| | q"|S )z-Derive extractor configuration from CLI args.)r   )r   rJ   r;   rK   rL   r:   r>   r?   r@   rC   rD   rE   rF   )r8   r=   r<   )r6   r%   config_filerJ   r;   rK   rL   no_comments	no_tablesdeduplicatetarget_languagerD   rE   validate_teirc   getattr)rm   rC   optionsattrr#   r#   r$   args_to_extractor   s$   
rw   T	extensivec                 C   s   d| t  ddS )z/Provide default parameters for date extraction.Tz%Y-%m-%d)original_dateextensive_searchmax_date)r   nowstrftime)rx   r#   r#   r$   rU      s   rU   c                ,   @   sB  e Zd ZdZg dZddddddddddddeddedddddddddee dee dee d	ee d
ee dee dee deee  deee  dee dee dee de	dee de	dee dee dee dee dee dee f*ddZ
edeeef dd fddZd$d d!Zdeeee f fd"d#ZdS )%DocumentzZDefines a class to store all necessary data and metadata fields for extracted information.titleauthorrC   hostnamedescriptionsitenamedate
categoriestagsfingerprintidlicensebodyr:   commentsbodyraw_texttextlanguageimagepagetypefiledateNr   )r   r   rC   r   r   r   r   r   r   r   idvallicense_valr   r:   r   r   r   r   r   r   r   r   r   rC   r   r   r   r   r   r   r   r   r   r:   r   r   r   r   r   r   r   c                C   s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _d S Nr   )rW   r   r   rC   r   r   r   r   r   r   r   r   r   r   r:   r   r   r   r   r   r   r   r#   r#   r$   rX      s*   
zDocument.__init__datar   c                 C   s(   |  }|  D ]
\}}t||| q|S )z.Set a series of attributes using a dictionary.)rb   rc   )clsr   docre   rf   r#   r#   r$   	from_dict  s   zDocument.from_dictc                 C   sX   | j D ]&}t| |}t|tr)t|dkr|dd d }tt|}t| || qdS )z*Limit text length and trim the attributes.'  Ni'  u   …)rk   rt   
isinstancer   lenr   r   rc   )rW   slotrf   r#   r#   r$   clean_and_trim!  s   


zDocument.clean_and_trimc                    s    fdd j D S )z%Convert the document to a dictionary.c                    s   i | ]	}|t  |d qS r   )rt   ).0rv   rW   r#   r$   
<dictcomp>/  s    z$Document.as_dict.<locals>.<dictcomp>)rk   r   r#   r   r$   as_dict-  s   zDocument.as_dict)r   N)rg   rh   ri   rj   rk   r   r   r   r   r   rX   classmethodr   r   r   r   r   r#   r#   r#   r$   r~      s    	



/
r~      i   i     i@B r   >   bipqdddtemh1h2h3h4h5h6lidivpremainspanstrongarticlesection
blockquote)3asideembedfooterformheadiframemenuobjectscriptappletaudiocanvasfiguremappicturesvgvideoareablinkbuttondatalistdialogframeframesetfieldsetlinkinputinslabellegendmarqueemathmenuitemnavnoindexnoscriptoptgroupoptionoutputparamprogressrprtrtcselectrB   styletracktextareatimeuse)abbracronymaddressbdibdobigciter   dfnfonthgroupimgr   markmetarubysmalltbodytemplatetfoottheadzL.//aside|.//div[contains(@class|@id, 'footer')]|.//footer|.//script|.//style)
r   codedelr   hilblistr   r   quotearArabicbg	BulgarianczCzechdaDanishdeGermanenEnglishelGreekesSpanishfaPersianfiFinnishfrFrenchhrCroatianhu	HungariankoKoreanr   
IndonesianitItaliannoNorwegian_NynorskDutchPolish
PortugueseRomanianRussianSlovak	SlovenianSerbianSwedishTurkish	UkrainianUrdu
Vietnamese)nlplptroruskslsrsvtrukurvi)NNr   )T)4rj   configparserr   r   r   r   typingr   r   r   r   r	   osr
   r   	CPU_COUNTImportErrorr   pathlibr   
lxml.etreer   r   r   utilsr   SUPPORTED_FMT_CLIrS   r]   r   r%   rl   ra   r6   rw   rT   rU   r~   minPARALLEL_CORESLRU_SIZEMAX_FILES_PER_DIRECTORYFILENAME_LEN	MAX_LINKSMAX_SITEMAPS_SEENCUT_EMPTY_ELEMSMANUALLY_CLEANEDMANUALLY_STRIPPEDBASIC_CLEAN_XPATH	frozensetTAG_CATALOGJUSTEXT_LANGUAGESr#   r#   r#   r$   <module>   s   
p

d:	
