o
    jH                     @   sZ   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZ G dd dZdS )	a  Extract images from PDF.

Both raster images and vector graphics are considered:

* Normal images like jpeg or png could be extracted with method ``page.get_text('rawdict')`` 
  and ``Page.get_images()``. Note the process for png images with alpha channel.
* Vector graphics are actually composed of a group of paths, represented by operators like
  ``re``, ``m``, ``l`` and ``c``. They're detected by finding the contours with ``opencv``.
    N   )
Collection)	BlockType)recursive_xy_cutinner_contoursxy_project_profilec                   @   s  e Zd ZdZdejddfddZ	d+d	ejd
ede	fddZ
			d+d	ejd
ede	fddZedefddZd,de	fddZde	de	de	de	fddZedejd	ejfddZedejdefd d!Zedejd"ed
efd#d$Zed%ejd&efd'd(Zedejfd)d*ZdS )-ImagesExtractorzExtract images from PDF.pagereturnNc                 C   s
   || _ dS )zoExtract images from PDF page.

        Args:
            page (fitz.Page): pdf page to extract images.
        N)_page)selfr	    r   _/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/pdf2docx/image/ImagesExtractor.py__init__   s   
zImagesExtractor.__init__F      @bboxrm_imagezoomc                 C   s   | j | jd|d}|du r| jj}n| jjr|| jj }n|}| jj|@ }t||}| jj||d}| jj}|	 D ]
\}	}
|
|	|
 q;|S )a  Clip page pixmap according to ``bbox``.

        Args:
            bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
                Note that ``bbox`` depends on un-rotated page CS, while clipping page is based on
                the final page.
            rm_image (bool): remove images or not.
            zoom (float, optional): Improve resolution by this rate. Defaults to 3.0.

        Returns:
            fitz.Pixmap: The extracted pixmap.
        T)rm_textr   N)Zclipmatrix)_hide_page_text_and_imagesr   rectrotationZrotation_matrixfitzZMatrixZ
get_pixmapparentitemsupdate_stream)r   r   r   r   Zstream_dict	clip_bboxr   pixdocxrefstreamr   r   r   clip_page_to_pixmap   s   
z#ImagesExtractor.clip_page_to_pixmapclip_image_res_ratioc                 C   s   | j |||d}| ||S )a  Clip page pixmap (without text) according to ``bbox`` and convert to source image.

        Args:
            bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
            rm_image (bool): remove images or not.
            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
                Defaults to 3.0.

        Returns:
            list: A list of image raw dict.
        )r   r   r   )r"   _to_raw_dict)r   r   r   r#   r   r   r   r   clip_page_to_dictK   s   z!ImagesExtractor.clip_page_to_dictc                 C   sp   | du rdS z| j | j| j| jf\}}}}W n
 ty    Y dS w t||}tt|d d }t	|d S )zExtract rotation angle (0, 90, 180, 270) from image transform matrix.

        Handles 90-degree multiples only. Based on PyMuPDF Matrix format:
        fitz.Matrix(90) -> (0, 1, -1, 0, ...), Matrix(180) -> (-1, 0, 0, -1, ...).
        Nr   Z   ih  )
abcdAttributeErrormathatan2rounddegreesint)r   r'   r(   r)   r*   Z	angle_radZ	angle_degr   r   r   _get_image_rotationa   s    z#ImagesExtractor._get_image_rotationc              
   C   s  | j j}| j j}t }| j jddD ]p}t|}d|d< |d }| j |}g }z
| j j|dd}W n ttfy=   Y nw | j j	}	t
|D ];\}
}| dkrQqF|	|sWqFd}|
t|k ry||
 }t|ttfryt|dkry|d }| |}||||f qFqd	d
 }||}g }|D ]V}t|dkrt }|D ]	\}}}||O }q| |d|}n1|d \}}}|d dkr| |d|}n| ||}| ||}|pd| }|r| |||d< || q|S )a  Extract normal images with ``Page.get_images()``.

        Args:
            clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
                Defaults to 3.0.

        Returns:
            list: A list of extracted and recovered image raw dict.

        .. note::
            ``Page.get_images()`` contains each image only once, which may less than the
            real count of images in a page.
        Tfullr   )	transform   r      c                 S   s   | d  |d S )Nr   )
intersects)r'   r(   r   r   r   <lambda>   s    z0ImagesExtractor.extract_images.<locals>.<lambda>F    image)r   r   r   r   
get_imageslistZget_image_rects	TypeErrorr+   Zcropbox	enumerateZget_arear8   len
isinstancetupler1   appendgroupr   Rectr%   _recover_pixmapr$   _rotate_image)r   r#   r   r   Zicitemr    ZrectsZrects_with_transformZunrotated_page_bboxir   Zimage_rotationentryZmatrix_tZfungroupsZimagesrE   r   _Zraw_dictr   Ztotal_rotationr   r   r   extract_imagess   sb   	



zImagesExtractor.extract_imagesmin_svg_gap_dxmin_svg_gap_dymin_wmin_hc              	      sT  ddl }| jddd}| |}|||j}||dd|j\}	 t ||d}
 fd	d
|
D }tt	|
|}d}|rt
|
D ])\}\}}}}t|||||ddf  ||||f }|d| | qC|D ]-\}}|\}}}}||||f||fdd |D ]\}}}}||||f||fdd qqo|d| |d |S )a  Find contour of potential vector graphics.

        Args:
            min_svg_gap_dx (float): Merge svg if the horizontal gap is less than this value.
            min_svg_gap_dy (float): Merge svg if the vertical gap is less than this value.
            min_w (float): Ignore contours if the bbox width is less than this value.
            min_h (float): Ignore contours if the bbox height is less than this value.

        Returns:
            list: A list of potential svg region: (external_bbox, inner_bboxes:list).
        r   NT      ?)r   r         )Zmin_dxZmin_dyc                    s   g | ]	}t  |qS r   )r   ).0r   binaryrR   rQ   r   r   
<listcomp>  s    z7ImagesExtractor.detect_svg_contours.<locals>.<listcomp>Fz
sub-image-)rU   r   r   r7   )r   r   rU   img)cv2r"   _pixmap_to_cv_imageZcvtColorZCOLOR_BGR2GRAY	thresholdZTHRESH_BINARY_INVr   r>   zipr@   r   ZimshowZ	rectangleZwaitKey)r   rO   rP   rQ   rR   cvpixmapsrcgrayrM   Zexternal_bboxesZgrouped_inner_bboxesrL   debugrJ   x0y0x1y1Zarrr   Zinner_bboxesZu0Zv0u1Zv1r   rW   r   detect_svg_contours   s4   
0
z#ImagesExtractor.detect_svg_contoursr<   c                 C   sN   	 | j }|du s|tjtjfvrttj| } tjjt|| j	| j
|  dS )zStore Pixmap ``image`` to raw dict.

        Args:
            image (fitz.Pixmap): Pixmap to store.
            bbox (fitz.Rect): Boundary box the pixmap.

        Returns:
            dict: Raw dict of the pixmap.
        N)typer   widthheightr<   )Z
colorspacer   ZcsGRAYcsRGBPixmapr   ZIMAGEvaluerC   rk   rl   tobytes)r<   r   csr   r   r   r$      s   zImagesExtractor._to_raw_dictr`   r   c                 C   s   ddl }ddl}t| }|jdd \}}|d |d }}d}	|||f||	}
||
d }||
d }t|| ||  }t|| ||  }|
d  |d | 7  < |
d  |d | 7  < |||
||f}|	d	|\}}|
 S )
zRotate image represented by image bytes.

        Args:
            pixmap (fitz.Pixmap): Image to rotate.
            rotation (int): Rotation angle.

        Return: image bytes.
        r   Nr   rS   )r   r   )r   r7   )r   r   )r7   r   z.png)r[   numpyr   r\   shapeZgetRotationMatrix2Dabsr0   Z
warpAffineZimencoderp   )r`   r   r_   nprZ   hwrd   re   scaler   cossinWHZrotated_imgrM   Zim_pngr   r   r   rH   M  s    

zImagesExtractor._rotate_imager   c                    s   dd    D }|   dd } fdd} j}i }|D ]-}||}	|r.||	n|	df\}
}|r:||
n|
df\}
}|sD|rN|||
 |	||< q!|S )zHide page text and images.c                 S   s   g | ]\}}}}|qS r   r   )rV   r    nameZinvokerr   r   r   r   rY     s    z>ImagesExtractor._hide_page_text_and_images.<locals>.<listcomp>c                 S   sD   | }d}dD ]}|  }|| v rd}||| d  }q||fS )NF)ZBTZTmZTdz2 TrTz 3 Tr)encodereplace)r!   resfoundkbkr   r   r   	hide_text  s   z=ImagesExtractor._hide_page_text_and_images.<locals>.hide_textc                    sX   | }d}dd  j ddD }|D ]}d| d }|| v r'd}||d}q||fS )	NFc                 S   s   g | ]}|d  qS )   r   )rV   rI   r   r   r   rY     s    zSImagesExtractor._hide_page_text_and_images.<locals>.hide_images.<locals>.<listcomp>Tr2   /z Do    )r=   r~   r   )r!   r   r   Z	img_namesr   r   r	   r   r   hide_images  s   z?ImagesExtractor._hide_page_text_and_images.<locals>.hide_imagesF)Zget_xobjectsextendZget_contentsr   Zxref_streamr   )r	   r   r   Z	xref_listr   r   r   sourcer    ra   r!   Z
found_textZfound_imagesr   r   r   r   z  s   
z*ImagesExtractor._hide_page_text_and_imagesr   rI   c                 C   s   |d }|d }t | |}|dkr>t | |}|jr%t |d}d}|}|j|jkr8|j|jkr8t ||}ntd| d|d  v rMt t j|}|S )a  Restore pixmap with soft mask considered.

        References:

            * https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList
            * https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-handle-stencil-masks
            * https://github.com/pymupdf/PyMuPDF/issues/670

        Args:
            doc (fitz.Document): pdf document.
            item (list): image instance of ``page.get_images()``.

        Returns:
            fitz.Pixmap: Recovered pixmap with soft mask considered.
        r   r7   NzCIgnore image due to inconsistent size of color and mask pixmaps: %sZCMYKr:   )	r   rn   alphark   rl   loggingwarningupperrm   )r   rI   xsr   masktempr   r   r   rG     s$   zImagesExtractor._recover_pixmapc                 C   s0   ddl }ddl}|  }||||j|jS )znConvert fitz Pixmap to opencv image.

        Args:
            pixmap (fitz.Pixmap): PyMuPDF Pixmap.
        r   N)r[   rr   rp   ZimdecodeZ
frombufferZuint8ZIMREAD_COLOR)r`   r_   ru   Zimg_byter   r   r   r\     s   z#ImagesExtractor._pixmap_to_cv_image)NFr   )r   )__name__
__module____qualname____doc__r   ZPager   rF   boolfloatr"   r%   staticmethodr0   r1   rN   ri   rn   r$   rH   r   ZDocumentr>   rG   r\   r   r   r   r   r      sV    	
/
r
;,,81r   )r   r   r,   r   Zcommon.Collectionr   Zcommon.sharer   Zcommon.algorithmr   r   r   r   r   r   r   r   <module>   s    
