"""
All functions needed to steer and execute downloads of web documents.
"""

import logging
import os
import random

from concurrent.futures import ThreadPoolExecutor, as_completed
from configparser import ConfigParser
from functools import partial
from importlib.metadata import version
from io import BytesIO
from time import sleep
from typing import Any, Callable, Dict, Generator, List, Optional, Set, Tuple, Union

import certifi
import urllib3

from courlan import UrlStore
from courlan.network import redirection_test

from .settings import DEFAULT_CONFIG, Extractor
from .utils import URL_BLACKLIST_REGEX, decode_file, is_acceptable_length, make_chunks

try:
    from urllib3.contrib.socks import SOCKSProxyManager

    PROXY_URL = os.environ.get("http_proxy")
except ImportError:
    PROXY_URL = None

try:
    import pycurl

    CURL_SHARE = pycurl.CurlShare()
    CURL_SHARE.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_DNS)
    CURL_SHARE.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_SSL_SESSION)
    HAS_PYCURL = True
except ImportError:
    HAS_PYCURL = False

LOGGER = logging.getLogger(__name__)

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

HTTP_POOL = None
NO_CERT_POOL = None
RETRY_STRATEGY = None


def create_pool(**args: Any) -> Union[urllib3.PoolManager, Any]:
    "Configure urllib3 download pool according to user-defined settings."
    ...


DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
USER_AGENT = (
    "trafilatura/" + version("trafilatura") + " (+https://github.com/adbar/trafilatura)"
)
DEFAULT_HEADERS["User-Agent"] = USER_AGENT

# HTTP status codes that force a retry (values elided here)
FORCE_STATUS: List[int] = []
# pycurl error codes pointing to SSL issues (values elided here)
CURL_SSL_ERRORS: Set[int] = set()


class Response:
    "Store information gathered in a HTTP response object."

    __slots__ = ["data", "headers", "html", "status", "url"]

    def __init__(self, data: bytes, status: int, url: str) -> None:
        self.data = data
        self.headers = None
        self.html = None
        self.status = status
        self.url = url

    def __bool__(self) -> bool:
        return self.data is not None

    def __repr__(self) -> str:
        return self.html or decode_file(self.data)

    def store_headers(self, headerdict: Dict[str, str]) -> None:
        "Store response headers if required."
        self.headers = {k.lower(): v for k, v in headerdict.items()}

    def decode_data(self, decode: bool) -> None:
        "Decode the bytestring in data and store a string in html."
        if decode and self.data:
            self.html = decode_file(self.data)

    def as_dict(self) -> Dict[str, str]:
        "Convert the response object to a dictionary."
        return {attr: getattr(self, attr) for attr in self.__slots__}


def _parse_config(config: ConfigParser) -> Tuple[Optional[List[str]], Optional[str]]:
    "Read and extract HTTP header strings from the configuration file."
    ...


def _determine_headers(
    config: ConfigParser, headers: Optional[Dict[str, str]] = None
) -> Dict[str, str]:
    "Internal function to decide on user-agent string."
    ...


def _get_retry_strategy(config: ConfigParser) -> urllib3.util.Retry:
    "Define a retry strategy according to the config file."
    ...


def _initiate_pool(
    config: ConfigParser, no_ssl: bool = False
) -> Union[urllib3.PoolManager, Any]:
    "Create a urllib3 pool manager according to options in the config file and HTTPS setting."
    ...


def _send_urllib_request(
    url: str, no_ssl: bool, with_headers: bool, config: ConfigParser
) -> Optional[Response]:
    "Internal function to robustly send a request (SSL or not) and return its result."
    ...


def _is_suitable_response(url: str, response: Response, options: Extractor) -> bool:
    "Check if the response conforms to formal criteria."
    ...


def _handle_response(
    url: str, response: Response, decode: bool, options: Extractor
) -> Optional[Union[Response, str]]:
    "Internal function to run safety checks on response result."
    ...


def fetch_url(
    url: str,
    no_ssl: bool = False,
    config: ConfigParser = DEFAULT_CONFIG,
    options: Optional[Extractor] = None,
) -> Optional[str]:
    """Downloads a web page and seamlessly decodes the response.

    Args:
        url: URL of the page to fetch.
        no_ssl: Do not try to establish a secure connection (to prevent SSLError).
        config: Pass configuration values for output control.
        options: Extraction options (supersedes config).

    Returns:
        Unicode string or None in case of failed downloads and invalid results.

    """
    ...


def fetch_response(
    url: str,
    *,
    decode: bool = False,
    no_ssl: bool = False,
    with_headers: bool = False,
    config: ConfigParser = DEFAULT_CONFIG,
) -> Optional[Response]:
    """Downloads a web page and returns a full response object.

    Args:
        url: URL of the page to fetch.
        decode: Use html attribute to decode the data (boolean).
        no_ssl: Don't try to establish a secure connection (to prevent SSLError).
        with_headers: Keep track of the response headers.
        config: Pass configuration values for output control.

    Returns:
        Response object or None in case of failed downloads and invalid results.

    """
    ...


def _pycurl_is_live_page(url: str) -> bool:
    "Send a basic HTTP HEAD request with pycurl."
    ...


def _urllib3_is_live_page(url: str) -> bool:
    "Use courlan redirection test (based on urllib3) to send a HEAD request."
    ...


def is_live_page(url: str) -> bool:
    "Send a HTTP HEAD request without taking anything else into account."
    ...


def add_to_compressed_dict(
    inputlist: List[str],
    blacklist: Optional[Set[str]] = None,
    url_filter: Optional[List[str]] = None,
    url_store: Optional[UrlStore] = None,
    compression: bool = False,
    verbose: bool = False,
) -> UrlStore:
    "Filter, convert input URLs and add them to domain-aware processing dictionary"
    ...


def load_download_buffer(
    url_store: UrlStore, sleep_time: float = 5.0
) -> Tuple[List[str], UrlStore]:
    "Determine threading strategy and draw URLs respecting domain-based back-off rules."
    ...


def _buffered_downloads(
    bufferlist: List[str],
    download_threads: int,
    worker: Callable[[str], Any],
    chunksize: int = 10000,
) -> Generator[Tuple[str, Any], None, None]:
    "Use a thread pool to perform a series of downloads."
    ...


def buffered_downloads(
    bufferlist: List[str], download_threads: int, options: Optional[Extractor] = None
) -> Generator[Tuple[str, str], None, None]:
    "Download queue consumer, single- or multi-threaded."
    ...


def buffered_response_downloads(
    bufferlist: List[str], download_threads: int, options: Optional[Extractor] = None
) -> Generator[Tuple[str, Response], None, None]:
    "Download queue consumer, returns full Response objects."
    ...


def _send_pycurl_request(
    url: str, no_ssl: bool, with_headers: bool, config: ConfigParser
) -> Optional[Response]:
    "Experimental function using libcurl and pycurl to speed up downloads"
    ...
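

# Illustrative usage only: a minimal sketch of how the functions above are meant
# to be combined, based on the docstrings in this module and on trafilatura's
# documented download helpers. The URL list, thread count and sleep time are
# arbitrary example values; the processing step is left as a comment.
if __name__ == "__main__":
    example_urls = ["https://www.example.org", "https://www.example.com/page"]

    # single fetch: fetch_url returns a decoded Unicode string or None
    html_string = fetch_url(example_urls[0])

    # full response object: status code, final URL and raw data
    response = fetch_response(example_urls[0], decode=True)
    if response:
        print(response.status, response.url)

    # multi-URL pipeline: store URLs by domain, then draw politely timed batches
    url_store = add_to_compressed_dict(example_urls)
    while not url_store.done:
        bufferlist, url_store = load_download_buffer(url_store, sleep_time=5.0)
        for url, result in buffered_downloads(bufferlist, download_threads=4):
            # result is a Unicode string (or None); process or store it here
            print(url, result is not None)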