U
    /@e",                     @   s8  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlmZ dd	lmZmZmZmZmZmZ eed
eedddZeed
eedddZeed
eee dddZeed
eedddZ eed
eedddZ!eed
eedddZ"eed
eedddZ#eed
eedddZ$eed
eedddZ%eed
eedddZ&eed
eedd d!Z'eed
eedd"d#Z(eed
eedd$d%Z)eed
eedd&d'Z*ee+ed
eed(d)d*Z,eed
eedd+d,Z-dOe.e/ee d.d/d0Z0ed1d
eed2d3d4Z1e.eee e.f d5d6d7Z2eed8d9d:Z3dPeeed<d=d>Z4eee d?d@dAZ5eee6dBdCdDZ7eeedBdEdFZ8dGej9dHfee/eddIdJdKZ:dQe.ee;e/eee.eee eeddf dL
dMdNZ<dS )R    N)IncrementalDecoder)aliases)	lru_cache)findall)	GeneratorListOptionalSetTupleUnion)MultibyteIncrementalDecoder   )ENCODING_MARKSIANA_SUPPORTED_SIMILARRE_POSSIBLE_ENCODING_INDICATIONUNICODE_RANGES_COMBINEDUNICODE_SECONDARY_RANGE_KEYWORDUTF8_MAXIMAL_ALLOCATION)maxsize)	characterreturnc                 C   sV   zt | }W n tk
r$   Y dS X d|kpTd|kpTd|kpTd|kpTd|kpTd|kS )NFz
WITH GRAVEz
WITH ACUTEzWITH CEDILLAzWITH DIAERESISzWITH CIRCUMFLEXz
WITH TILDEunicodedataname
ValueErrorr   description r   </tmp/pip-unpacked-wheel-0qte9pxs/charset_normalizer/utils.pyis_accentuated   s    r   c                 C   s.   t | }|s| S |d}tt|d dS )N r      )r   decompositionsplitchrint)r   Z
decomposedcodesr   r   r   remove_accent&   s
    

r'   c                 C   s.   t | }t D ]\}}||kr|  S qdS )zK
    Retrieve the Unicode range official name from a single character.
    N)ordr   items)r   Zcharacter_ord
range_nameZ	ord_ranger   r   r   unicode_range1   s
    
r+   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZLATINr   r   r   r   r   is_latin?   s
    r,   c                 C   s2   t | }d|krdS t| }|d kr*dS d|kS )NPTFZPunctuationr   categoryr+   r   character_categorycharacter_ranger   r   r   is_punctuationH   s    
r3   c                 C   sB   t | }d|ksd|krdS t| }|d kr2dS d|ko@|dkS )NSNTFZFormsZLor.   r0   r   r   r   	is_symbolW   s    
r6   c                 C   s$   t | }|d krdS d|kp"d|kS )NFZ	EmoticonsZPictographs)r+   )r   r2   r   r   r   is_emoticonf   s    r7   c                 C   s.   |   s| dkrdS t| }d|kp,|dkS )N>      ｜+<>TZ>   PoPcPd)isspacer   r/   )r   r1   r   r   r   is_separatorp   s    
rA   c                 C   s   |   |  kS N)islowerisupperr   r   r   r   is_case_variablez   s    rF   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFCJKr   r   Zcharacter_namer   r   r   is_cjk   s
    rI   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZHIRAGANAr   rH   r   r   r   is_hiragana   s
    rJ   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZKATAKANAr   rH   r   r   r   is_katakana   s
    rK   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZHANGULr   rH   r   r   r   	is_hangul   s
    rL   c                 C   s.   zt | }W n tk
r$   Y dS X d|kS )NFZTHAIr   rH   r   r   r   is_thai   s
    rM   )r*   r   c                    s   t  fddtD S )Nc                 3   s   | ]}| kV  qd S rB   r   ).0keywordr*   r   r   	<genexpr>   s     z-is_unicode_range_secondary.<locals>.<genexpr>)anyr   rP   r   rP   r   is_unicode_range_secondary   s    rS   c                 C   s(   |   dko&|  dko&| dko&| dkS )NFu   ﻿)r@   isprintablerE   r   r   r   is_unprintable   s    
rV       )sequencesearch_zoner   c                 C   s   t | tstt| }tt| dt|| jddd}t|dkrHdS |D ]N}| 	dd}t
 D ]0\}}||kr|    S ||krh|    S qhqLdS )zW
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    Nasciiignoreerrorsr   -_)
isinstancebytes	TypeErrorlenr   r   mindecodelowerreplacer   r)   )rX   rY   Zseq_lenresultsZspecified_encodingencoding_aliasencoding_ianar   r   r   any_specified_encoding   s"    
rk      )r   r   c                 C   s    | dkpt td| jtS )zQ
    Verify is a specific encoding is a multi byte one based on it IANA name
    >	   	utf_16_leutf_16utf_32utf_8	utf_16_beutf_7	utf_8_sig	utf_32_le	utf_32_beencodings.{})
issubclass	importlibimport_moduleformatr   r   )r   r   r   r   is_multi_byte_encoding   s    
r{   )rX   r   c                 C   sJ   t D ]@}t | }t|tr |g}|D ]}| |r$||f    S q$qdS )z9
    Identify and extract SIG/BOM in given sequence.
    )N    )r   r`   ra   
startswith)rX   iana_encodingZmarksmarkr   r   r   identify_sig_or_bom   s    

r   )r~   r   c                 C   s   | dkS )N>   rn   ro   r   )r~   r   r   r   should_strip_sig_or_bom  s    r   T)cp_namestrictr   c                 C   sL   |   dd} t D ]\}}| ||fkr|  S q|rHtd| | S )Nr^   r_   z Unable to retrieve IANA for '{}')rf   rg   r   r)   r   rz   )r   r   ri   rj   r   r   r   	iana_name  s    
r   )decoded_sequencer   c                 C   s4   t  }| D ] }t|}|d kr q
|| q
t|S rB   )setr+   addlist)r   rangesr   r2   r   r   r   
range_scan  s    r   )iana_name_aiana_name_br   c           	      C   s   t | st |rdS td| j}td|j}|dd}|dd}d}tdD ]*}t|g}||||krX|d7 }qX|d S )	Ng        rv   r[   r\   r      r      )r{   rx   ry   rz   r   rangera   re   )	r   r   Z	decoder_aZ	decoder_bZid_aZid_bZcharacter_match_countiZto_be_decodedr   r   r   cp_similarity*  s     



r   c                 C   s   | t ko|t |  kS )z
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    )r   )r   r   r   r   r   is_cp_similarB  s    
r   Zcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s)r   levelformat_stringr   c                 C   s:   t | }|| t  }|t | || d S rB   )logging	getLoggersetLevelStreamHandlersetFormatter	Formatter
addHandler)r   r   r   loggerhandlerr   r   r   set_logging_handlerM  s
    

r   )
	sequencesrj   offsets
chunk_sizebom_or_sig_availablestrip_sig_or_bomsig_payloadis_multi_byte_decoderdecoded_payloadr   c	                 c   s*  |r6|dkr6|D ]"}	||	|	|  }
|
s, q4|
V  qn|D ]}	|	| }|t | d krXq:| |	|	|  }|r||dkr||| }|j||rdndd}
|r|	dkrt|d}|r|
d | |krt|	|	d d	D ]H}| || }|r|dkr|| }|j|dd}
|
d | |kr qq|
V  q:d S )
NF   r[   r   r\   r   r!      )rc   re   rd   r   )r   rj   r   r   r   r   r   r   r   r   chunkZ	chunk_endZcut_sequenceZchunk_partial_size_chkjr   r   r   cut_sequence_chunksZ  s>    


r   )rW   )T)N)=rx   r   r   codecsr   Zencodings.aliasesr   	functoolsr   rer   typingr   r   r   r	   r
   r   Z_multibytecodecr   Zconstantr   r   r   r   r   r   strboolr   r'   r+   r,   r3   r6   r7   rA   rF   rI   rJ   rK   rL   rM   rc   rS   rV   ra   r%   rk   r{   r   r   r   r   floatr   r   INFOr   r   r   r   r   r   r   <module>   s     

							
  