o
    jK                     @   sZ   d Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ G dd	 d	e	eZd
S )aC  Document layout depends on Blocks and Shapes.

**Layout** here refers to the content and position of text, image and table. The target is to convert
source blocks and shapes to a *flow layout* that can be re-created as docx elements like paragraph and
table. In addition to ``Section`` and ``Column``, ``TableBlock`` is used to maintain the page layout .
So, detecting and parsing table block is the principle steps.

The prerequisite work is done before this step:

1. Clean up source blocks and shapes in Page level, e.g. convert source blocks to ``Line`` level,
   because the block structure determined by ``PyMuPDF`` might be not reasonable.
#. Parse structure in document level, e.g. page header/footer.
#. Parse Section and Column layout in Page level.

The page layout parsing idea:

1. Parse table layout in Column level.
    (a) Detect explicit tables first based on shapes.
    (#) Then, detect stream tables based on original text blocks and parsed explicit tables.
    (#) Move table contained blocks (lines or explicit table) to associated cell-layout.
#. Parse paragraph in Column level.
    (a) Detect text blocks by combining related lines.
    (#) Parse paragraph style, e.g. text format, alignment
#. Calculate vertical spacing based on parsed tables and paragraphs.
#. Repeat above steps for cell-layout in parsed table level.
    )ABCabstractmethod   )Line)	constants)Element)Shapesc                       s   e Zd ZdZd fdd	Zeedd Z fddZd	e	fd
dZ
defddZdefddZdd Zdd Zdd Zdd Z  ZS )Layoutz(Blocks and shapes structure and formats.Nc                    s\   ddl m } ddlm} |rd|ini }t | || d| _t| d| _|| d| _dS )z Initialize layout. Note that layout bbox must be set explicitly,
        rather than calculated automatically from contained blocks and shapes.   )Blocksr   )TablesConstructorbbox)parentN)	r   Ztable.TablesConstructorr   super__init__blocksr   shapes_table_parser)selfr   r   r   raw	__class__ W/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/pdf2docx/layout/Layout.pyr   &   s   zLayout.__init__c                 C   s   dS )zWorking bbox of current Layout.Nr   )r   r   r   r   working_bbox9   s    zLayout.working_bboxc                    s*   t   }|| j | j d |S )z#Store parsed layout in dict format.)r   r   )r   storeupdater   r   )r   resr   r   r   r   ?   s   
zLayout.storedatac                 C   s>   |  |dd | j|dg  | j|dg  | S )z#Restore Layout from parsed results.r   )r   r   r   r   r   r   )Zupdate_bboxgetr   restorer   )r   r   r   r   r   r    I   s   zLayout.restorer   c                 C   s   |D ]}|  | qdS )zAdd blocks (line or table block) to this layout.

        Args:
            blocks (list): a list of text line or table block to add.

        .. note::
            If a text line is partly contained, it must deep into span -> char.
        N)_assign_block)r   r   blockr   r   r   assign_blocksQ   s   	zLayout.assign_blocksr   c                 C   s(   |D ]}| j |jr| j| qdS )zlAdd shapes to this cell.

        Args:
            shapes (list): a list of Shape instance to add.
        N)r   
intersectsr   r   append)r   r   shaper   r   r   assign_shapes]   s   zLayout.assign_shapesc                 K   sT   | j sdS | jdi | | jdi | tdd | j D ]
}|jdi | qdS )z]Parse layout.

        Args:
            settings (dict): Layout parsing parameters.
        Nc                 S   s   | j S N)Zis_table_block)er   r   r   <lambda>w   s    zLayout.parse.<locals>.<lambda>r   )r   _parse_table_parse_paragraphfilterparse)r   settingsr"   r   r   r   r.   h   s   
zLayout.parsec                 C   sX   | j |tjdr| j| dS t|tr(| j|jr*| j|| j dS dS dS )z/Add block (line or table block) to this layout.)	thresholdN)	containsr   ZFACTOR_MAJORr   r%   
isinstancer   r   r$   )r   r"   r   r   r   r!   {   s
   zLayout._assign_blockc                 K   sP   |d r| j |d |d |d  |d r&| j |d |d |d  dS dS )a  Parse table layout:

        * detect explicit tables first based on shapes,
        * then stream tables based on original text blocks and parsed explicit tables;
        * move table contained blocks (text block or explicit table) to associated cell layout.
        Zparse_lattice_tableZconnected_border_toleranceZmin_border_clearanceZmax_border_widthZparse_stream_tableline_separate_thresholdN)r   Zlattice_tablesZstream_tablesr   r/   r   r   r   r+      s   zLayout._parse_tablec              	   K   sd   | j |d |d |d  | j | jj|d  | j |d |d |d |d |d |d	  d
S )zlCreate text block based on lines, and parse text format, e.g. text highlight,
        paragraph indentation Zmax_line_spacing_ratioZline_break_free_space_ratioZnew_paragraph_free_space_ratioZdelete_end_line_hyphenr3   Zline_break_width_ratioZlines_left_aligned_thresholdZlines_right_aligned_thresholdZlines_center_aligned_thresholdN)r   Zparse_blockZparse_text_formatr   Ztext_style_shapesZparse_spacingr4   r   r   r   r,      s"   zLayout._parse_paragraphr(   )__name__
__module____qualname____doc__r   propertyr   r   r   dictr    listr#   r'   r.   r!   r+   r,   __classcell__r   r   r   r   r	   #   s    
r	   N)r8   abcr   r   Z	text.Liner   commonr   Zcommon.Elementr   Zshape.Shapesr   r	   r   r   r   r   <module>   s    