o
    j.H                     @   s   d Z ddlmZmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ dd	lmZ dd
l	mZmZ ddlmZ ddlmZ G dd deZdS )aI  Text block objects based on PDF raw dict extracted with ``PyMuPDF``.

Data structure based on this `link <https://pymupdf.readthedocs.io/en/latest/textpage.html>`_::

    {
        # raw dict
        # --------------------------------
        'type': 0,
        'bbox': (x0,y0,x1,y1),
        'lines': [ lines ]

        # introduced dict
        # --------------------------------
        'before_space': bs,
        'after_space': as,
        'line_space': ls,

        'alignment': 0,
        'left_space': 10.0,
        'right_space': 0.0,

        'tab_stops': [15.4, 35.0]
    }
    )PtInches)WD_ALIGN_PARAGRAPH   )Lines   )	ImageSpan)RectTypeTextAlignmentlower_round)Block)rgb_component_from_namer   )	constants)docxc                       s   e Zd ZdZd+def fddZedd Zedd	 Zed
d Z	edd Z
edd Zedd Z fddZdd Z fddZdd Zdedededededefd d!Zd"d# Zd$d% Zd&d' Zd(ededededef
d)d*Z  ZS ),	TextBlockzText block.Nrawc                    sL   |pi }d|v r| d t | t| d|dg | _|   d S )Nbbox)parentlines)popsuper__init__r   Zrestoregetr   Zset_text_block)selfr   	__class__ X/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/pdf2docx/text/TextBlock.pyr   )   s
   zTextBlock.__init__c                 C      dd | j D }d|S )zJText content in block. Note image is counted as a placeholder ``<image>``.c                 S      g | ]}|j qS r   )text.0liner   r   r   
<listcomp>:       z"TextBlock.text.<locals>.<listcomp> r   joinr   Z
lines_textr   r   r   r    7      
zTextBlock.textc                 C   r   )z5Raw text content in block without considering images.c                 S   r   r   )raw_textr!   r   r   r   r$   @   r%   z&TextBlock.raw_text.<locals>.<listcomp>r&   r'   r)   r   r   r   r+   =   r*   zTextBlock.raw_textc                 C      t dd | jD S )zZIf this block contains only white space or not. If True, this block is safe to be removed.c                 s       | ]}|j V  qd S N)white_space_onlyr!   r   r   r   	<genexpr>F       z-TextBlock.white_space_only.<locals>.<genexpr>)allr   r   r   r   r   r/   C   s   zTextBlock.white_space_onlyc                 C   s   | j jS )zsAll lines contained in text block must have same text direction. 
        Otherwise, set normal direction.
        )r   text_directionr3   r   r   r   r4   H   s   zTextBlock.text_directionc                    sn   | j rdnd | j }t|}|dkrdS | j d  | j   } fdd}tt||}|| |d  S )z4Average distance between adjacent two physical rows.r   r   Nr   c                       t  fdd| D S )Nc                 3   s,    | ]}t |j d   |j   V  qdS )r   N)absr   r!   idxr   r   r0   [   s   * z>TextBlock.average_row_gap.<locals>.<lambda>.<locals>.<genexpr>maxrowr7   r   r   <lambda>[       z+TextBlock.average_row_gap.<locals>.<lambda>)is_horizontal_textr   group_by_physical_rowslenr   summap)r   rowsnumblock_heightZf_max_row_heightZsum_row_heightr   r7   r   average_row_gapO   s   
zTextBlock.average_row_gapc                 C   s   t | j S )zCount of physical rows.)rA   r   r@   r3   r   r   r   	row_count`   s   zTextBlock.row_countc                    s"   t   }|d| j i |S )Nr   )r   storeupdater   )r   resr   r   r   rI   f   s
   

zTextBlock.storec                 C   s:   t |tttfr|D ]}| j| q
dS | j| dS )zAdd line or lines to TextBlock.N)
isinstancer   listtupler   append)r   Zline_or_linesr#   r   r   r   addn   s
   zTextBlock.addc                    sb   t d}t j||dd | jD ]}t d}|j||d |jD ]}t d}|j||d q qdS )	ztPlot block/line/span area for debug purpose.
        
        Args:
            page (fitz.Page): pdf page.
        bluez[3.0 3.0] 0)strokedashesred)rR   r&   )colorN)r   r   plotr   spans)r   pagerQ   r#   rT   spancr   r   r   rV   w   s   

zTextBlock.plotc                 C   sF   d}|D ]}| tjs|jrq| j|jsq| j|r d}q|S )zParse text format with style represented by rectangles.
        
        Args:
            shapes (Shapes): Shapes representing potential styles applied on blocks.
        FT)Zequal_to_typer	   Z	HYPERLINKZis_determinedr   Z
intersectsr   parse_text_format)r   Zshapesflagshaper   r   r   r[      s   zTextBlock.parse_text_formatline_separate_thresholdline_break_width_ratioline_break_free_space_ratiolines_left_aligned_thresholdlines_right_aligned_thresholdlines_center_aligned_thresholdc                 C   s   | j rdnd\}}	}
| |||	|
f||||| _| jtjkr(tj| _| j| | j}|dkr9| jtjkr9d| _	n|dkrG| jtj
krGd| _n|dkrW| jtjkrWd| _d| _	| j||| dS )a   Set horizontal spacing based on lines layout and page bbox.
        
        * The general spacing is determined by paragraph alignment and indentation.
        * The detailed spacing of block lines is determined by tab stops.

        Multiple alignment modes may exist in block (due to improper organized lines
        from ``PyMuPDF``), e.g. some lines align left, and others right. In this case,
        **LEFT** alignment is set, and use ``TAB`` to position each line.
        )r   r   g      ?)   r   g      r   r   N)r?   _parse_alignment	alignmentr
   NONELEFTr   Zparse_tab_stoprH   right_spaceRIGHT
left_spaceCENTERZparse_line_break)r   r   r^   r_   r`   ra   rb   rc   idx0idx1frH   r   r   r   parse_horizontal_spacing   s.   z"TextBlock.parse_horizontal_spacingc                    s   | j D ]}tdd |jD rtj| _ dS q| jrdnd}| j|d  | j|  }| j  }dd   fd	dt	fd
d|D }|| }t
|dkrTt|tj}|| _dS )al  Calculate relative line spacing, e.g. `spacing = 1.02`.  Relative line spacing is based on standard 
        single line height, which is font-related. 

        .. note::
            The line spacing could be updated automatically when changing the font size, while the layout might
            be broken in exact spacing mode, e.g. overlapping of lines.
        c                 s   s    | ]
}t |tr|V  qd S r.   )rL   r   r"   rY   r   r   r   r0      s    z8TextBlock.parse_relative_line_spacing.<locals>.<genexpr>Nr   r   r   c                 S   r,   )Nc                 s   r-   r.   )line_heightrq   r   r   r   r0      r1   JTextBlock.parse_relative_line_spacing.<locals>.<lambda>.<locals>.<genexpr>)r:   rW   )r#   r   r   r   r=      s    z7TextBlock.parse_relative_line_spacing.<locals>.<lambda>c                    r5   )Nc                 3       | ]} |V  qd S r.   r   r!   fun_max_line_heightr   r   r0          rs   r9   r;   ru   r   r   r=      r>   c                 3   rt   r.   r   )r"   r<   )fun_max_row_heightr   r   r0      rw   )r   rM   rW   r   ZDEFAULT_LINE_SPACING
line_spacer?   r   r@   rB   rA   r:   )r   r#   r8   rF   rD   Zstandard_heightry   r   )rv   rx   r   parse_relative_line_spacing   s   
	

z%TextBlock.parse_relative_line_spacingc                 C   s   | j rdnd}| jd j}||d  ||  }| j|d  | j|  }| j}|dkr3|| |d  }n|}|| _|  j|| 7  _| jdk rU|  j| j| 7  _d| _dS dS )a0  Calculate exact line spacing, e.g. `spacing = Pt(12)`. 

        The layout of pdf text block: line-space-line-space-line, excepting space before first line, 
        i.e. space-line-space-line, when creating paragraph in docx. So, an average line height is 
        ``space+line``. Then, the height of first line can be adjusted by updating paragraph before-spacing.

        .. note::
            Compared with the relative spacing mode, it has a more precise layout, but less flexible editing
            ability, especially changing the font size.
        r   r   r           N)r?   r   r   rH   ry   before_space)r   r8   r   Zfirst_line_heightrF   country   r   r   r   parse_exact_line_spacing   s   

z"TextBlock.parse_exact_line_spacingc           	      C   s  t |}tt| jdd}tt| jdd}t||_t||_| j	dkr0tt| j
d|_nt| j
d|_| j}| jdk rD|| j8 }t||_t| j|_t| j|_| jtjkrtj|_| jD ]}|jt| j|  qbt| jtj d}t||_nB| jtjkrtj|_t|tj d}t||_n*| jtjkrtj|_t|tj d}t||_t| jtj d}t||_ntj|_| jD ]}| | q|S )a  Create paragraph for a text block.

        Refer to ``python-docx`` doc for details on text format:

        * https://python-docx.readthedocs.io/en/latest/user/text.html
        * https://python-docx.readthedocs.io/en/latest/api/enum/WdAlignParagraph.html#wdparagraphalignment
        
        Args:
            p (Paragraph): ``python-docx`` paragraph instance.

        .. note::
            The left position of paragraph is set by paragraph indent, rather than ``TAB`` stop.
        r   r{   r   r   )!r   Zreset_paragraph_formatr:   roundr|   Zafter_spacer   Zspace_beforeZspace_afterZline_space_typery   Zline_spacingrk   first_line_spaceZleft_indentri   Zright_indentZfirst_line_indentrf   r
   rh   r   Z	tab_stopsZadd_tab_stopr   r   ZITPr   rj   rl   JUSTIFYr   	make_docx)	r   ppfZbefore_spacingZafter_spacingrk   posdr#   r   r   r   r      sB   








zTextBlock.make_docxtext_direction_paramc                    sL  |\t | j |   dt | | j   d}t | d dtdt|d}t| |   | _|| _| j }|D ]$tdkrTqKfddt	dtD }	t
|	rotj  S qK fdd}
t|dkr|
 S fdd|D }fd	d|D }d
d t||D }t|dkr|dd |dd }}tt|t| |k}tt|t| |k}tt|t| k}|r|rt|dkrtjn|
 }n|rtj}n|rtj}n|rt|dkrtjntj}ntj}|tjks|tjkr$|d d j |d d j  | _|S )a  Detect text alignment mode based on layout of internal lines. It can't decide when only
        one line, in such case, the alignment mode is determined by externally check.
        
        Args:
            text_direction_param (tuple): ``(x0_index, x1_index, direction_factor)``, 
                e.g. ``(0, 2, 1)`` for horizontal text, while ``(3, 1, -1)`` for vertical text.
        r          @r{   c                    s4   g | ]}| j  |d   j     kqS )r   r   )r"   i)ro   rm   rn   r^   r<   r   r   r$     s    ,z.TextBlock._parse_alignment.<locals>.<listcomp>c                      s*   t k r	tjS d  krtjS tjS )Ng      ?)r6   r
   rl   rh   rj   r   )Wd_centerd_leftrc   r   r   external_alignment  s
   z6TextBlock._parse_alignment.<locals>.external_alignmentc                       g | ]	}|d  j   qS )r   r   r"   r   )rm   r   r   r$         c                    r   )r   r   )rn   r   r   r$     r   c                 S   s   g | ]
\}}|| d  qS )r   r   )r"   Zx0x1r   r   r   r$     s    rd   Nr   r   )r   r   r:   r6   rk   ri   r   r@   rA   rangeanyr
   rg   zipminr   rl   rh   rj   r   )r   r   r   r^   ra   rb   rc   Zd_rightrD   disr   ZX0ZX1XZleft_alignedZright_alignedZcenter_alignedrf   r   )	r   r   r   ro   rm   rn   r^   rc   r<   r   re   t  sN   





&&zTextBlock._parse_alignmentr.   )__name__
__module____qualname____doc__dictr   propertyr    r+   r/   r4   rG   rH   rI   rP   rV   r[   floatrp   rz   r~   r   rN   re   __classcell__r   r   r   r   r   '   s\    





	
5&Tr   N)r   Zdocx.sharedr   r   Zdocx.enum.textr   r   Zimage.ImageSpanr   Zcommon.sharer	   r
   r   Zcommon.Blockr   r   commonr   r   r   r   r   r   r   <module>   s   