U
    hr2eH9                  
   @   s@  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl m"Z" d dl m#Z# d dl$m%Z% dZ&eddk	Z'e'r:d dl(m)Z) eddk	Z*e*rTd dl+Z,dZ-e .dZ/dZ0dddd d!d"d#d$d%d&h
Z1d'e2fd(e2fd)e2fd*e3fd+e4fd,e3fd-Z5e#d.Z6e#d/Z7G d0d1 d1Z8G d2d3 d3e9Z:G d4d5 d5e;Z<G d6d7 d7e9Z=G d8d9 d9e9Z>G d:d; d;e9Z?d<d= Z@edpd>d?ZAd@dA ZBdBdC ZCdDdE ZDdFdG ZEedHdI ZFdqdKdLZGdrdNdOZHdsdQdRZIdSdT ZJdUdV ZKdWdX ZLeBdtdYdZZMeBd[d\ ZNddMd e8jOd fd]d^ZPdud`daZQdvdbdcZRddMd e8jOd fdddeZSdwdfdgZTddMd e8jOd dfdhdiZUdjdMd e8jOd fdkdlZVdmdn ZWeXdokr<eYeW  dS )x    N)contextmanager)
QUOTE_NONE)ENOENT)wraps)iglob)BytesIO)environ)extsep)linesep)remove)normcase)normpath)realpath)find_loader)NamedTemporaryFile)sleep)InvalidVersion)parse)Version)ImageZ	tesseractnumpy)ndarraypandaszutf-8z	^[a-z_]+$ZRGBZJPEGZJPEG2000PNGZPBMZPGMZPPMZTIFFZBMPZGIFZWEBPZpage_numZorientationrotateZorientation_confscriptZscript_conf)zPage numberzOrientation in degreesZRotatezOrientation confidenceZScriptzScript confidencez3.05z4.1.0c                   @   s   e Zd ZdZdZdZdZdS )Outputbytesz
data.framedictstringN)__name__
__module____qualname__BYTES	DATAFRAMEDICTSTRING r'   r'   ;/tmp/pip-unpacked-wheel-6g7bqfn1/pytesseract/pytesseract.pyr   E   s   r   c                       s   e Zd Z fddZ  ZS )PandasNotSupportedc                    s   t  d d S )NzMissing pandas packagesuper__init__self	__class__r'   r(   r,   M   s    zPandasNotSupported.__init__r    r!   r"   r,   __classcell__r'   r'   r/   r(   r)   L   s   r)   c                   @   s   e Zd Zdd ZdS )TesseractErrorc                 C   s   || _ || _||f| _d S N)statusmessageargs)r.   r5   r6   r'   r'   r(   r,   R   s    zTesseractError.__init__N)r    r!   r"   r,   r'   r'   r'   r(   r3   Q   s   r3   c                       s   e Zd Z fddZ  ZS )TesseractNotFoundErrorc                    s   t  t d d S )NzQ is not installed or it's not in your PATH. See README file for more information.)r+   r,   tesseract_cmdr-   r/   r'   r(   r,   Y   s    zTesseractNotFoundError.__init__r1   r'   r'   r/   r(   r8   X   s   r8   c                       s   e Zd Z fddZ  ZS )TSVNotSupportedc                    s   t  d d S )Nz4TSV output not supported. Tesseract >= 3.05 requiredr*   r-   r/   r'   r(   r,   a   s    zTSVNotSupported.__init__r1   r'   r'   r/   r(   r:   `   s   r:   c                       s   e Zd Z fddZ  ZS )ALTONotSupportedc                    s   t  d d S )Nz6ALTO output not supported. Tesseract >= 4.1.0 requiredr*   r-   r/   r'   r(   r,   h   s    zALTONotSupported.__init__r1   r'   r'   r/   r(   r;   g   s   r;   c                 C   s`   |    zBz| d W n. tk
r4   td Y n tk
rF   Y nX W 5 |   || _X d S )N   )	terminatekill
returncodewait	TypeErrorr   	Exception)processcoder'   r'   r(   r>   n   s    
r>   c                 c   s   zb|s|  d V  W Ld S z| j|d\}}|V  W n( tjk
r^   t| d tdY nX W 5 | j   | j  | j  X d S )Nr<   )timeoutzTesseract process timeout)	stdinclosestdoutstderrcommunicate
subprocessTimeoutExpiredr>   RuntimeError)procseconds_error_stringr'   r'   r(   timeout_manager{   s    



rS   c                    s    t   fdd_S )Nc                     s   j kr | |_ j S r4   )_result)r7   kwargsfuncwrapperr'   r(   rX      s    
zrun_once.<locals>.wrapper)r   rT   )rW   r'   rV   r(   run_once   s    rY   c                 C   s"   d dd | t D  S )N c                 s   s   | ]
}|V  qd S r4   r'   .0liner'   r'   r(   	<genexpr>   s    zget_errors.<locals>.<genexpr>)joindecodeDEFAULT_ENCODING
splitlinesstrip)rR   r'   r'   r(   
get_errors   s    
rd   c                 C   s\   t | r|  dn| D ]@}zt| W q tk
rT } z|jtkrD W 5 d}~X Y qX qdS )z5Tries to remove temp files by filename wildcard path.*N)r   r   OSErrorerrnor   )	temp_namefilenameer'   r'   r(   cleanup   s    
rk   c                 C   s   t rt| trt| } t| tjs,td| js6dn| j}|tkrLtdd|  krt	t
| jd}|| d| d |} || _| |fS )NzUnsupported image objectr   zUnsupported image format/typeA)   rm   rm   )r   r   )numpy_installed
isinstancer   r   Z	fromarrayrA   formatSUPPORTED_FORMATSZgetbandsnewRGB_MODEsizeZpasteZ
getchannel)image	extensionZ
backgroundr'   r'   r(   prepare   s    
rw   c              	   c   s   ztdddv}t| trD|jttt| fV  W 5 Q R  W Nd S t| \} }|j dt	 | }| j
|| jd |j|fV  W 5 Q R X W 5 t |j X d S )NZtess_F)prefixdelete_input)rp   )rk   namer   ro   strr   r   r   rw   r	   saverp   )ru   frv   Zinput_file_namer'   r'   r(   r}      s    
r}   Tc                 C   sf   t jt jd td}tt drHt  |d< |d  jt jO  _t j|d _| rXt j|d< n
t j	|d< |S )N)rG   rJ   startupinfoenvSTARTUPINFOr   rI   )
rL   PIPEr   hasattrr   ZdwFlagsZSTARTF_USESHOWWINDOWZSW_HIDEZwShowWindowDEVNULL)Zinclude_stdoutrU   r'   r'   r(   subprocess_args   s    

r    c              
   C   s   g }t jds*|dkr*|ddt|f7 }|t| |f7 }|d k	rL|d|f7 }|r^|t|7 }|rt|dkrt|| ztj	|ft
 }W n6 tk
r }	 z|	jtkr nt W 5 d }	~	X Y nX t||}
|jrt|jt|
W 5 Q R X d S )Nwin32r   nicez-n-l>   boxtsvosdxml)sysplatform
startswithr|   r9   shlexsplitappendrL   Popenr   rf   rg   r   r8   rS   r?   r3   rd   )input_filenameoutput_filename_baserv   langconfigr   rE   cmd_argsrO   rj   rR   r'   r'   r(   run_tesseract   s&    	

r   Fc                 C   s   t | \}}|||||||d}	tf |	 |	d  t | }
t|
dL}|rp| W  5 Q R  W  5 Q R  S | tW  5 Q R  W  5 Q R  S Q R X W 5 Q R X d S )N)r   r   rv   r   r   r   rE   r   rb)r}   r   r	   openreadr`   ra   )ru   rv   r   r   r   rE   Zreturn_bytesrh   r   rU   ri   Zoutput_filer'   r'   r(   run_and_get_output  s    


 r   c              
      s   i } fdd|   dD }t|dk r0|S |d}t|}t|d |k r`|d d |dk rp||7 }t|D ]|\}}t ||< |D ]d}	t|	|krq||krztt|	| }
W q t	k
r   |	| }
Y qX n|	| }
|| |
 qqx|S )Nc                    s   g | ]}|  qS r'   r   )r\   rowcell_delimiterr'   r(   
<listcomp>*  s     z file_to_dict.<locals>.<listcomp>
   r   rF   r   )
rc   r   lenpopr   	enumeratelistintfloat
ValueError)r   r   Zstr_col_idxresultZrowsheaderlengthiheadr   valr'   r   r(   file_to_dict(  s.    

r   c                 C   sB   |t kr|  S |tkr>zt|  W dS  tk
r<   Y dS X dS )NTF)r   isdigitr   r   )r   _typer'   r'   r(   is_validK  s    r   c                 C   s   dd dd |  dD D S )Nc                 S   sX   i | ]P}t |d krt|d t|d  d rt|d  d t|d  d |d qS )r   r<   r   )r   r   OSD_KEYS)r\   kvr'   r'   r(   
<dictcomp>Z  s
     zosd_to_dict.<locals>.<dictcomp>c                 s   s   | ]}| d V  qdS ): Nr   r[   r'   r'   r(   r^   \  s     zosd_to_dict.<locals>.<genexpr>r   r   )r   r'   r'   r(   osd_to_dictY  s    r   c                 C   s   t dg}| r|t| 7 }ztj|tjtjd}W n tk
rL   t Y nX |j	dkr^t g }|j
r|j
ttD ] }| }t|rz|| qz|S )Nz--list-langs)rI   rJ   )r   r<   )r9   r   r   rL   runr   STDOUTrf   r8   r?   rI   r`   ra   r
   rc   LANG_PATTERNmatchr   )r   r   r   	languagesr]   r   r'   r'   r(   get_languagesa  s(    


r   c               	   C   s   zt jtdgt jtt jd} W n tk
r8   t Y nX | t	}|
tjdd d^}}|d^}}zt|}|tkstW n( ttfk
r   td| dY nX |S )	z9
    Returns Version object of the Tesseract version
    z	--version)rJ   r   rG   
   NrZ   -zInvalid tesseract version: "")rL   check_outputr9   r   r   r   rf   r8   r`   ra   lstripr   	printable	partitionr   TESSERACT_MIN_VERSIONAssertionErrorr   
SystemExit)outputZraw_versionZstr_versionrQ   versionr'   r'   r(   get_tesseract_version~  s$    

r   c                    sD   | d||||g t j fddt j fddt j fddi|  S )zS
    Returns the result of a Tesseract OCR run on the provided image to string
    txtc                      s   t  dg  S NTr   r'   r7   r'   r(   <lambda>      z!image_to_string.<locals>.<lambda>c                      s   dt   iS )Ntextr   r'   r   r'   r(   r     r   c                      s   t   S r4   r   r'   r   r'   r(   r     r   )r   r#   r%   r&   ru   r   r   r   output_typerE   r'   r   r(   image_to_string  s     
 
 
r   pdfc                 C   s0   |dkrt d| | |||||dg}t| S )zU
    Returns the result of a Tesseract OCR run on the provided image to pdf/hocr
    >   r   hocrzUnsupported extension: T)r   r   )ru   r   r   r   rv   rE   r7   r'   r'   r(   image_to_pdf_or_hocr  s    r   c                 C   s8   t  tk rt d|  }| d||||dg}t| S )zU
    Returns the result of a Tesseract OCR run on the provided image to ALTO XML
    z-c tessedit_create_alto=1 r   T)r   TESSERACT_ALTO_VERSIONr;   rc   r   )ru   r   r   r   rE   r7   r'   r'   r(   image_to_alto_xml  s
    
r   c                    sR   |   d}| d||||g tj fddtj fddtj fddi|  S )zR
    Returns string containing recognized characters and their box boundaries
    z batch.nochop makeboxr   c                      s   t  dg  S r   r   r'   r   r'   r(   r     r   z image_to_boxes.<locals>.<lambda>c                      s   t dt   ddS )Nz char left bottom right top page
rZ   r   r   r   r'   r   r'   r(   r     s   c                      s   t   S r4   r   r'   r   r'   r(   r     r   rc   r   r#   r%   r&   r   r'   r   r(   image_to_boxes  s     
 
 
r   c              	   C   sR   t s
t tdd}z|| W n ttfk
r:   Y nX tjtt	|  f|S )N	)quotingsep)
pandas_installedr)   r   updaterA   r   pdZread_csvr   r   )r7   r   rU   r'   r'   r(   get_pandas_output  s    
r   c              
      sr   t  tk rt d|  }| d||||g tj fddtj fddtj fddtj fddi|  S )zt
    Returns string containing box boundaries, confidences,
    and other information. Requires Tesseract 3.05+
    z-c tessedit_create_tsv=1 r   c                      s   t  dg  S r   r   r'   r   r'   r(   r     r   zimage_to_data.<locals>.<lambda>c                      s   t  dg S r   )r   r'   r7   pandas_configr'   r(   r     s   c                      s   t t  ddS )Nr   rF   r   r'   r   r'   r(   r     r   c                      s   t   S r4   r   r'   r   r'   r(   r     r   )	r   r   r:   rc   r   r#   r$   r%   r&   )ru   r   r   r   r   rE   r   r'   r   r(   image_to_data  s    
 
  
 
r   r   c                    sR   d|   }| d||||g tj fddtj fddtj fddi|  S )zN
    Returns string containing the orientation and script detection (OSD)
    z--psm 0 r   c                      s   t  dg  S r   r   r'   r   r'   r(   r   )  r   zimage_to_osd.<locals>.<lambda>c                      s   t t  S r4   )r   r   r'   r   r'   r(   r   *  r   c                      s   t   S r4   r   r'   r   r'   r(   r   +  r   r   r   r'   r   r(   image_to_osd  s     
 
 
r   c               
   C   s  t tjdkr tjd d  } }nFt tjdkrTtjd dkrTtjd tjd  } }ntdtjd dS z*t| }tt||d W 5 Q R X W n tk
r } z"tt	| d	tjd W Y dS d }~X Y nH t
k
r } z(tt|j d
| tjd W Y dS d }~X Y nX d S )Nr   r<      r      z(Usage: pytesseract [-l lang] input_file
)file)r   r   r   )r   r   argvprintrJ   r   r   r   r8   r|   rf   typer    )ri   r   imgrj   r'   r'   r(   main/  s    r   __main__)N)T)r   r   r   )r   Nr   r   r   F)r   )Nr   r   r   r   )Nr   r   r   )N)Zrer   r   rL   r   
contextlibr   csvr   rg   r   	functoolsr   globr   ior   osr   r	   r
   r   os.pathr   r   r   pkgutilr   tempfiler   timer   Zpackaging.versionr   r   r   ZPILr   r9   rn   r   r   r   r   r   ra   compiler   rs   rq   r   r   r|   r   r   r   r   EnvironmentErrorr)   rN   r3   r8   r:   r;   r>   rS   rY   rd   rk   rw   r}   r   r   r   r   r   r   r   r   r&   r   r   r   r   r   r   r   r   r    exitr'   r'   r'   r(   <module>   s   
	


   
(      
#

     
    



!

