o
    j                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddlm	Z	 dd	l
mZ dd
lmZmZ ddlmZ G dd deZdS )z+
A wrapper of PyMuPDF Page as page engine.
    N   )RawPage   )ImagesExtractor)Paths)FACTOR_A_HALF)Element)RectType
debug_plot)get_areac                   @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zed
dd Z	dd Z
dS )RawPageFitzz6A wrapper of ``fitz.Page`` to extract source contents.c                 K   s   i }| j s|S | j j^ }}}|||d ||| _| _| jdi |}||d< | jdi |}|d | | jdi |\}}	||d< |d |	 | 	 }
|d |
 t
| j j |S )N)widthheightblocksshapes )page_enginerectupdater   r   _preprocess_text_preprocess_imagesextend_preprocess_shapes_preprocess_hyperlinksr   Zset_rotation_matrixZrotation_matrix)selfsettingsZraw_dict_whtext_blocksZimage_blocksr   Zimages
hyperlinksr   r   Z/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/pdf2docx/page/RawPageFitz.pyextract_raw_dict   s    
zRawPageFitz.extract_raw_dictc                 K   sL  |d }|dkrt d|d}| jjddtjB tjB |d}|dg }z| j }W n ty=   t	
d	 g }Y nw |sB|S |d
krKdd }ndd }tt||}dd }	g }
|D ]E}d}|d D ]5}|d D ]*}|D ]!}t|d |d }||	|d  tkr|d |d krd} nqp|r nql|r nqf|s|
| q^|
S )ak  Extract page text and identify hidden text. 
        
        NOTE: All the coordinates are relative to un-rotated page.

            https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
            https://pymupdf.readthedocs.io/en/latest/functions.html#Page.get_texttrace
            https://pymupdf.readthedocs.io/en/latest/textpage.html
        ocrr   z/OCR feature is planned but not implemented yet.sortZrawdictr   )flagsr$   r   zJIgnore hidden text checking due to UnicodeDecodeError in upstream library.r   c                 S   s   | d dkS Ntype   r   spanr   r   r!   <lambda>X       z.RawPageFitz._preprocess_text.<locals>.<lambda>c                 S   s   | d dkS r&   r   r)   r   r   r!   r+   Z   r,   c                 S   s   | \}}}}|| ||  S )Nr   )bboxZx0Zy0x1y1r   r   r!   	span_area]   s   z/RawPageFitz._preprocess_text.<locals>.span_areaFlinesspansr-   ZfontT)
SystemExitgetr   Zget_textfitzZTEXT_MEDIABOX_CLIPZTEXT_CID_FOR_UNKNOWN_UNICODEZget_texttraceSystemErrorloggingwarninglistfilterr   r   append)r   r   r#   r$   rawr   r2   fZfiltered_spansr0   r   blockZintersectedliner*   Zfilter_spanZintersected_arear   r   r!   r   3   sT   	




zRawPageFitz._preprocess_textc                 K   s$   |d dkrg S t | j|d S )aD  Extract image blocks. Image block extracted by ``page.get_text('rawdict')`` doesn't 
        contain alpha channel data, so it has to get page images by ``page.get_images()`` and 
        then recover them. Note that ``Page.get_images()`` contains each image only once, i.e., 
        ignore duplicated occurrences.
        r#   r   clip_image_res_ratio)r   r   Zextract_images)r   r   r   r   r!   r   w   s   zRawPageFitz._preprocess_imagesc                 K   s6   | j di |}||d |d |d |d |d S )zGIdentify iso-oriented paths and convert vector graphic paths to pixmap.Zmin_svg_gap_dxZmin_svg_gap_dyZ	min_svg_wZ	min_svg_hr@   Nr   )_init_pathsZto_shapes_and_images)r   r   pathsr   r   r!   r      s   zRawPageFitz._preprocess_shapeszSource Pathsc                 K   s   | j  }t| d|S )z:Initialize Paths based on drawings extracted with PyMuPDF.)parent)r   Zget_cdrawingsr   Zrestore)r   r   Z	raw_pathsr   r   r!   rA      s   
zRawPageFitz._init_pathsc                 C   sH   g }| j  D ]}|d dkrq|tjjt|d |d d q|S )ziGet source hyperlink dicts.

        Returns:
            list: A list of source hyperlink dict.
        kindr   fromuri)r'   r-   rF   )r   Z	get_linksr;   r	   Z	HYPERLINKvaluetuple)r   r    linkr   r   r!   r      s   

z"RawPageFitz._preprocess_hyperlinksN)__name__
__module____qualname____doc__r"   r   r   r   r
   rA   r   r   r   r   r!   r      s    D
r   )rM   r5   r7   r   Zimage.ImagesExtractorr   Zshape.Pathsr   Zcommon.constantsr   Zcommon.Elementr   Zcommon.sharer	   r
   Zcommon.algorithmr   r   r   r   r   r!   <module>   s   