o
    jqP                     @   s
  d Z ddlZddlZddlZddlmZmZ ddlmZ ddl	m
Z
mZmZ ddlZddlmZ ddlmZ dd	lmZ eeeejd
Zeg dk s`g de  k r^g dk rded nedejejdd G dd dZG dd deZG dd deZ dS )zPDF to Docx Converter.    N)Pool	cpu_count)perf_counter)AnyStrIOUnion)Document   Page)Pages.)r	      r   )r	         )r	   r      zJ1.19.0 <= PyMuPDF <= 1.23.8, or PyMuPDF>=1.23.16 is required for pdf2docx.z[%(levelname)s] %(message)s)levelformatc                	   @   sh  e Zd ZdZ	d:dededefddZedd	 Zed
d Z	dd Z
edd Zd;dededefddZd;dededefddZdd Zdd Zd<ddZdd Zd efd!d"Zd#efd$d%Zd#efd&d'Zd:d(ed)ed*ed+efd,d-Zd=d)eeee f dededefd.d/Zd;dededefd0d1Zd)ededefd2d3Zed4d5 Z ed6d7 Z!ed8d9 Z"dS )>	Convertera  The ``PDF`` to ``docx`` converter.
    
    * Read PDF file with ``PyMuPDF`` to get raw layout data page by page, including text,
      image, drawing and its properties, e.g. boundary box, font, size, image width, height.
    * Analyze layout in document level, e.g. page header, footer and margin.
    * Parse page layout to docx structure, e.g. paragraph and its properties like indentation, 
      spacing, text alignment; table and its properties like border, shading, merging. 
    * Finally, generate docx with ``python-docx``.
    Npdf_filepasswordstreamc                 C   sP   || _ t|pd| _|s|std|rtj|d| _nt|| _t | _dS )zInitialize fitz object with given pdf file path.

        Args:
            pdf_file (str): pdf file path.
            stream   (bytes): pdf file in memory.
            password (str): Password for encrypted pdf. Default to None if not encrypted.
         z(Either pdf_file or stream must be given.)r   N)	filename_pdfstrr   
ValueErrorfitzr   	_fitz_docr   _pages)selfr   r   r    r    S/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/pdf2docx/converter.py__init__&   s   zConverter.__init__c                 C      | j S N)r   r   r    r    r!   fitz_docA      zConverter.fitz_docc                 C   r#   r$   )r   r%   r    r    r!   pagesD   r'   zConverter.pagesc                 C   s   | j   d S r$   )r   closer%   r    r    r!   r)   H   s    zConverter.closec                 C   s   i ddddddddddd	d
ddddddddddddddddddddddi ddddd d!d"d!d#dd$d%d&d'd(dd)dd*dd+dd,dd-dd.dd/dd0dS )1zDefault parsing parameters.debugFZocrr   ignore_page_errorTmulti_processingr   Zmin_section_heightg      4@Zconnected_border_toleranceg      ?Zmax_border_widthg      @Zmin_border_clearanceg       @Zfloat_image_ignorable_gapg      @Zpage_margin_factor_topZpage_margin_factor_bottomZshape_min_dimensionZmax_line_spacing_ratiog      ?Zline_overlap_thresholdg?Zline_break_width_ratioZline_break_free_space_ratiog?Zline_separate_thresholdZnew_paragraph_free_space_ratiog333333?Zlines_left_aligned_thresholdg      ?Zlines_right_aligned_thresholdZlines_center_aligned_thresholdZclip_image_res_ratiog      @Zmin_svg_gap_dxg      .@Zmin_svg_gap_dyZ	min_svg_wZ	min_svg_hZextract_stream_tableZparse_lattice_tableZparse_stream_tableZdelete_end_line_hyphenraw_exceptionsZlist_not_tabler    r%   r    r    r!   default_settingsK   s   	
 !zConverter.default_settingsr   startendr(   c                 K   s&   |  |||jdi |jdi |S )a2  Parse pages in three steps:
        * open PDF file with ``PyMuPDF``
        * analyze whole document, e.g. page section, header/footer and margin
        * parse specified pages, e.g. paragraph, image and table

        Args:
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes to parse. Defaults to None.
            kwargs (dict, optional): Configuration parameters. 
        Nr    )
load_pagesparse_documentparse_pages)r   r/   r0   r(   kwargsr    r    r!   parsev   s   zConverter.parsec                 C   s   t | d | jjr#| jstd| j d| j| js#tdt	| j}| j
dd t|D  | ||||}|D ]}d| j
| _q?| S )a  Step 1 of converting process: open PDF file with ``PyMuPDF``, 
        especially for password encrypted file.
        
        Args:
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes to parse. Defaults to None.
        z[1/4] Opening document...zRequire password for r   zIncorrect password.c                 S      g | ]}t |d dqS T)idskip_parsingr
   .0ir    r    r!   
<listcomp>       z(Converter.load_pages.<locals>.<listcomp>F)logginginfo_color_outputr   Z
needs_passr   ConversionExceptionr   Zauthenticatelenr   resetrange_page_indexesr9   )r   r/   r0   r(   numpage_indexesr<   r    r    r!   r1      s   	
zConverter.load_pagesc                 K   s*   t | d | jj| jfi | | S )zjStep 2 of converting process: analyze whole document, e.g. page section,
        header/footer and margin.z[2/4] Analyzing document...)r?   r@   rA   r   r5   r&   )r   r4   r    r    r!   r2      s   zConverter.parse_documentc                 K   s   t | d dd | jD }t|}t|ddD ]L\}}|jd }t d||| z
|jdi | W q tyf } z%|d rB |d sR|d	 rRt 	d
|| n
t
d| d| W Y d}~qd}~ww | S )zKStep 3 of converting process: parse pages, e.g. paragraph, image and table.z[3/4] Parsing pages...c                 S   s   g | ]}|j s|qS r    )r9   r;   pager    r    r!   r=      s    z)Converter.parse_pages.<locals>.<listcomp>r	   r/   (%d/%d) Page %dr-   r*   r+   z,Ignore page %d due to parsing page error: %szError when parsing page : Nr    )r?   r@   rA   r   rC   	enumerater8   r5   	ExceptionerrorrB   )r   r4   r(   	num_pagesr<   rJ   pider    r    r!   r3      s$   
zConverter.parse_pagesc           
      K   s8  t | d ttdd | j}|std|s:| jr6| jdtd   d}t	j
|r5t	| ntdt }t|}t|d	d
D ]M\}}|jsOqG|jd	 }t d||| z|| W qG ty }	 z%|d rp |d s|d rt d||	 n
td| d|	 W Y d}	~	qGd}	~	ww || dS )zStep 4 of converting process: create docx file with converted pages.
        
        Args:
            filename_or_stream (str, file-like): docx file to write.
            kwargs (dict, optional): Configuration parameters.
        z[4/4] Creating pages...c                 S   r#   r$   )	finalized)rJ   r    r    r!   <lambda>   s    z%Converter.make_docx.<locals>.<lambda>z)No parsed pages. Please parse page first.r   z.pdfz.docxz?Please specify a docx file name or a file-like object to write.r	   rK   rL   r-   r*   r+   z+Ignore page %d due to making page error: %szError when make page rM   N)r?   r@   rA   listfilterr   rB   r   rC   ospathexistsremover   rN   rT   r8   	make_docxrO   rP   MakedocxExceptionsave)
r   Zfilename_or_streamr4   Zparsed_pagesZ	docx_filerQ   r<   rJ   rR   rS   r    r    r!   r\      s<   

	zConverter.make_docxc                 C   s(   t j| jt| jdd | jD dS )z"Store parsed pages in dict format.c                 S   s   g | ]	}|j r| qS r    )rT   storerI   r    r    r!   r=      s    z#Converter.store.<locals>.<listcomp>)filenamepage_cntr(   )rX   rY   basenamer   rC   r   r%   r    r    r!   r_      s   zConverter.storedatac                 C   s^   | j s|dd}| j dd t|D  |dg D ]}|dd}| j | | qdS )	z"Restore pages from parsed results.ra   d   c                 S   r6   r7   r
   r:   r    r    r!   r=     r>   z%Converter.restore.<locals>.<listcomp>r(   r8   N)r   getrD   rE   restore)r   rc   rG   Zraw_pageidxr    r    r!   rg      s   zConverter.restorer`   c                 C   sL   t |ddd}|tj|  dd W d   dS 1 sw   Y  dS )z*Write parsed pages to specified JSON file.wzutf-8)encoding   )indentN)openwritejsondumpsr_   )r   r`   fr    r    r!   	serialize  s   "zConverter.serializec                 C   sB   t |d}t|}W d   n1 sw   Y  | | dS )z+Load parsed pages from specified JSON file.rN)rm   ro   loadrg   )r   r`   rq   rc   r    r    r!   deserialize  s   zConverter.deserializer<   docx_filename	debug_pdflayout_filec                 K   sx   t j| j\}}|st j|d| }|st j|d}|dt |d | j|fd|gi| | 	| dS )a  Parse, create and plot single page for debug purpose.
        
        Args:
            i (int): Page index to convert.
            docx_filename (str): docx filename to write to.
            debug_pdf (str): New pdf file storing layout information. Default to add prefix ``debug_``.
            layout_file (str): New json file storing parsed layout data. Default to ``layout.json``.
        Zdebug_zlayout.jsonT)r*   Z	debug_docZdebug_filenamer(   N)
rX   rY   splitr   joinupdater   r   convertrr   )r   r<   rv   rw   rx   r4   rY   r`   r    r    r!   
debug_page  s   zConverter.debug_pagec                 K   s   t  }td| j | j}|| |r|d rtd|d r,| j|||fi | n| j|||fi |j	|fi | tdt  |  dS )a  Convert specified PDF pages to docx file.

        Args:
            docx_filename (str, file-like, optional): docx file to write. Defaults to None.
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes. Defaults to None.
            kwargs (dict, optional): Configuration parameters. Defaults to None.
        
        Refer to :py:meth:`~pdf2docx.converter.Converter.default_settings` for detail of 
        configuration parameters.
        
        .. note::
            Change extension from ``pdf`` to ``docx`` if ``docx_file`` is None.
        
        .. note::
            * ``start`` and ``end`` is counted from zero if ``--zero_based_index=True`` (by default).
            * Start from the first page if ``start`` is omitted.
            * End with the last page if ``end`` is omitted.
        
        .. note::
            ``pages`` has a higher priority than ``start`` and ``end``. ``start`` and ``end`` works only
            if ``pages`` is omitted.

        .. note::
            Multi-processing works only for continuous pages specified by ``start`` and ``end`` only.
        zStart to convert %sr,   zPMulti-processing works for continuous pages specified by "start" and "end" only.zTerminated in %.2fs.N)
r   r?   r@   r   r.   r{   rB   _convert_with_multi_processingr5   r\   )r   rv   r/   r0   r(   r4   t0settingsr    r    r!   r|   8  s   
$zConverter.convertc                 K   sV   | j }|| | j|||fi | g }| jD ]}|jr(||jdi | q|S )a  Extract table contents from specified PDF pages.

        Args:
            start (int, optional): First page to process. Defaults to 0, the first page.
            end (int, optional): Last page to process. Defaults to None, the last page.
            pages (list, optional): Range of page indexes. Defaults to None.
            kwargs (dict, optional): Configuration parameters. Defaults to None.
        
        Returns:
            list: A list of parsed table content.
        Nr    )r.   r{   r5   r   rT   extendextract_tables)r   r/   r0   r(   r4   r   ZtablesrJ   r    r    r!   r   h  s   

zConverter.extract_tablesc           	         s   d rt d t nt  d fddt D }t }|j|d t D ]} d| d}tj|sAq0	| t
| q0j|fi  dS )	zParse and create pages based on page indexes with multi-processing.

        Reference:

            https://pymupdf.readthedocs.io/en/latest/faq.html#multiprocessing
        r   r(   c                    s0   g | ]}| j j d | dfqS )-.json)r   r   r:   cpur0   r4   prefixr   r/   r    r!   r=     s
    z<Converter._convert_with_multi_processing.<locals>.<listcomp>r	   r   r   N)minr   rE   r   map_parse_pages_per_cpurX   rY   rZ   ru   r[   r\   )	r   rv   r/   r0   r4   Zvectorspoolr<   r`   r    r   r!   r~     s   
z(Converter._convert_with_multi_processingc                    s   | \}}}}}}}}t ||}	|	  |pt|	j}t|| t }
t|
| }|
| }|t||k  }|d | t|| d }t|| |
} fddt||D }|	jD ]}d|_qX|D ]}d|	j| _q`|	j	di |j
di || |	  dS )	a  Render a page range of a document.
        
        Args:
            vector (list): A list containing required parameters.
                * 0  : segment number for current process                
                * 1  : count of CPUs
                * 2,3: whole pages range to process
                * 4  : pdf filename
                * 5  : password for encrypted pdf
                * 6  : configuration parameters
                * 7  : json filename storing parsed results
        r	   r   c                    s   g | ]} | qS r    r    r:   Zall_indexesr    r!   r=         z2Converter._parse_pages_per_cpu.<locals>.<listcomp>TFNr    )r   r1   rC   r&   rE   intr   r(   r9   r2   r3   rr   r)   )Zvectorrh   r   srS   Zpdf_filenamer   r4   Zjson_filenameZcvrQ   mnZseg_sizeZseg_fromZseg_torH   rJ   r<   r    r   r!   r     s,   

zConverter._parse_pages_per_cpuc                 C   s@   |rdd |D }|S |p|}t t| t|}t|| }|S )zParsing arguments.c                 S   s   g | ]}t |qS r    )r   )r;   xr    r    r!   r=     r   z+Converter._page_indexes.<locals>.<listcomp>)slicer   rE   )r/   r0   r(   Zpdf_lenZindexesr   r    r    r!   rF     s   zConverter._page_indexesc                 C   s   d|  dS )Nz[1;36mz[0mr    )msgr    r    r!   rA     s   zConverter._color_output)NNN)r   NNr$   )Nr   NN)#__name__
__module____qualname____doc__r   bytesr"   propertyr&   r(   r)   r.   r   rV   r5   r1   r2   r3   r\   r_   dictrg   rr   ru   r}   r   r   r   r|   r   r~   staticmethodr   rF   rA   r    r    r    r!   r      sH    



*	
1	(0
.
r   c                   @      e Zd ZdS )rB   Nr   r   r   r    r    r    r!   rB         rB   c                   @   r   )r]   Nr   r    r    r    r!   r]     r   r]   )!r   ro   r?   rX   multiprocessingr   r   timer   typingr   r   r   r   Zdocxr   Z	page.Pager   Z
page.Pagesr   rV   r   r   ZVersionBindry   v
SystemExitbasicConfigINFOr   rO   rB   r]   r    r    r    r!   <module>   s4    &   E