o
    0j-                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZmZmZ ddlmZmZmZ d	d
 Z	dadeeeef  deeeef  dedefddZde
eee
 f dede
eee
 f fddZdd Z dd Z!dd Z"dbddZ#dcd d!Z$d"d# Z%d$d% Z&d&d' Z'G d(d) d)eZ(G d*d+ d+eZ)d,Z*d-Z+d.Z,d/Z-d0Z.d1Z/d2Z0ej1e0 d3e0 d4ej2d5Z3d6efd7d8Z4d9d: Z5d;e)fd<d=Z6d>edefd?d@Z7dAefdBdCZ8d6edeedf fdDdEZ9	Gddd6edHedIedeeeeef df fdJdKZ:	L	L	L	MdedNedOedPedHedQedefdRdSZ;dTdU Z<e1dVej=Z>e1dWej=Z?e1dXZ@e1dXZAdYejBde
eef fdZd[ZCd\ed]ed^edeee
eef f fd_d`ZDdS )f    N)Counter)deepcopy)AnyDictListTupleUnion)Image)	BaseModelcomputed_fieldmodel_validator   )calculate_bbox_areacalculate_overlap_ratio"calculate_projection_overlap_ratioc                 C   s   | j s| d} | S )Nr   )Zis_validbuffer)Zpoly r   o/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/inference/pipelines/paddleocr_vl/uilts.py
make_valid"   s   
r   unionpolygon1polygon2modereturnc           
      C   s   zddl m} W n ty   tdw || }||}t|}t|}||j}||j}|dkr8|| S |dkrGt|j|j}|| S |dkrVt|j|j}	||	 S t	d| )a  
    Calculate the overlap ratio between two polygons.

    Args:
        polygon1 (List[Tuple[int, int]]): First polygon represented as a list of points.
        polygon2 (List[Tuple[int, int]]): Second polygon represented as a list of points.
        mode (str, optional): Overlap calculation mode. Defaults to "union".

    Returns:
        float: Overlap ratio value between 0 and 1.
    r   )PolygonzPlease install Shapely library.r   smallZlargezUnknown mode: )
Zshapely.geometryr   ImportErrorr   intersectionZarear   minmax
ValueError)
r   r   r   r   Zpoly1Zpoly2r   r   Z
small_areaZ
large_arear   r   r   calculate_polygon_overlap_ratio(   s(   r!   layout_det_reslayout_shape_modec                    s  t | }dd |d D }t  tt|D ]}|| d \}}}}|| || }	}
|	dk s3|
dk r8 | t|d t|D ]}| v sK| v rLqAt|| d || d d}|| d d	ksj|| d d	kr|d
kr|| d d	kr{ | || d d	kr | qA|dkr|dkrd|| v rt|| d || d d}|dk rqAt|| d }t|| d }|| d || d h}|h d@ rt|dkrd|vs|h dkrqA||kr | qA | qAq fddt|D |d< |S )a.  
    Remove overlapping boxes from layout detection results based on a given overlap ratio.

    Args:
        layout_det_res (Dict[str, List[Dict]]): Layout detection result dict containing a 'boxes' list.

    Returns:
        Dict[str, List[Dict]]: Filtered dict with overlapping boxes removed.
    c                 S   s   g | ]
}|d  dkr|qS )label	referencer   ).0boxr   r   r   
<listcomp>[       z(filter_overlap_boxes.<locals>.<listcomp>boxes
coordinate      r   r$   Zinline_formula      ?gffffff?rectZpolygon_points>   imagetablechartZsealr1   c                    s   g | ]
\}}| vr|qS r   r   )r&   idxr'   Zdropped_indexesr   r   r(      r)   )	r   setrangelenaddr   r!   r   	enumerate)r"   r#   Zlayout_det_res_filteredr*   ix1y1x2y2whjZoverlap_ratioZpoly_overlap_ratioZ
box_area_iZ
box_area_jlabelsr   r4   r   filter_overlap_boxesN   s\   




&
rC   c                 C   s   t | tjr| S t| S )z
    Convert the input to a PIL Image.

    Args:
        img (PIL.Image or numpy.ndarray): Input image.

    Returns:
        PIL.Image: PIL Image object.
    )
isinstancer	   Z	fromarrayimgr   r   r   to_pil_image   s   

rG   c                 C   s   t | tjrt| S | S )z
    Convert the input to a numpy array.

    Args:
        img (PIL.Image or numpy.ndarray): Input image.

    Returns:
        numpy.ndarray: Numpy array image.
    )rD   r	   nparrayrE   r   r   r   to_np_array   s   

rJ   c                 C   s4   dd | D }dd | D }t |}t|}||fS )z
    Calculate width (max of all) and height (sum) for a vertical merge of images.

    Args:
        images (List[PIL.Image or np.ndarray]): List of images.

    Returns:
        Tuple[int, int]: (width, height) of merged image.
    c                 S      g | ]}t |jqS r   )rG   widthr&   rF   r   r   r   r(          z"calc_merged_wh.<locals>.<listcomp>c                 S   rK   r   )rG   heightrM   r   r   r   r(      rN   )r   sum)imageswidthsZheightsr?   r@   r   r   r   calc_merged_wh   s
   
rS   centerautoc                 C   s6  | sdS t | dkrt| d S t|tr|gt | d  }t |t | d kr,tdt| d }tdt | D ]]}t| | }||d  }t|j|j}|j	|j	 }t
d||fd}	|dkrp||j d }
||j d }n|d	kr||j }
||j }nd }
}|	||
df |	|||j	f |	}q9t|S )
a+  
    Merge images vertically with given alignment.

    Args:
        images (List[PIL.Image or np.ndarray]): List of images to merge.
        aligns (str or List[str]): Alignment(s) for each merge step ('center', 'right', 'left').

    Returns:
        np.ndarray: Merged image as numpy array.
    Nr-   r   z,The length of aligns must be len(images) - 1RGB   rX   rX   rT   r   right)r7   rJ   rD   strr    rG   r6   r   rL   rO   r	   newZpaste)rQ   alignsr#   Zmergedr:   Zimg2alignr?   r@   Znew_imgr;   r=   r   r   r   merge_images   s4   

r^   c           )         s  g }i }t  D ]\}}|d v r|||< q|||f qg }g }g }	g }
dd fdd}fdd}t |D ]\}\}}|sL|g}|g}	g }
q;||d  \}}|d	 }|d }|d	 }|d }t||d
}|dko|dko||ko|d |d ko|d |d k o|d |d  t|d |d  |d |d  d k }|dko|dv o||ko|d |d kot|d |d  t|d |d  |d |d  d k o|d |d |d |d A o||| }|rd}n
|r|||}nd}|s|r|| |	| |
| q;||	|
f |g}|g}	g }
q;|r(||	|
f g }|D ]\}}t|t|}}|||||f q,g }t d}|t k rQd}|D ]\}}}}||kr)tfdd|D r)d} fdd|D }|r}|ng } t	|\}!}"|!dkr|"|! nt
d}#|#dkrt |D ]!\}$}% |%  } |% d |d< d|d< || |% qn<t|| |}&t |D ]1\}$}% |%  }|$dkr|&nd|d< |$dkr| nd|d< |d |d< || |% qg }'t|d |D ]}(|(|v r|'|( q|'D ]}(|||(  |( q|d } nqV|r0qK||v rF|vrF|||  | |d7 }|t k sR|S )aK  
    Merge blocks based on alignment and overlap logic, except for those with labels in non_merge_labels.

    Args:
        blocks (List[Dict]): List of block dicts.
        non_merge_labels (List[str]): Block labels that should not be merged.

    Returns:
        List[Dict]: List of processed (and possibly merged) blocks.
    r$   c                 S   s   t | | dkS )N   abs)Za1Za2r   r   r   
is_aligned  s   z merge_blocks.<locals>.is_alignedc                    s0    | d |d rdS  | d |d rdS dS )Nr   leftr   rY   rT   r   )
block_bbox	prev_bbox)rb   r   r   get_alignment  s
   z#merge_blocks.<locals>.get_alignmentc                    s   || d }||  d }t |d |d }t |d |d }t|d |d }t|d |d }||||g}	t|D ]\}
}|
| |fv sJ|d  vrKq:|d }t|	|dkrY dS q:dS )	Nr'   r   r-   r      r$   TF)r   r   r9   r   )	block_idxprev_idxblocksre   rd   r;   r<   r=   r>   Zmin_boxr3   Zother_blockZ
other_bbox)non_merge_labelsr   r   overlapwith_other_box  s    z+merge_blocks.<locals>.overlapwith_other_boxr-   r'   
horizontalr   textr   rg   g333333?)rn   r.   rT   NFc                 3   s    | ]}| vV  qd S Nr   r&   r:   )used_indicesr   r   	<genexpr>a      zmerge_blocks.<locals>.<genexpr>Tc                    s   g | ]} | d  qS rE   r   rp   )rj   r   r   r(   c      z merge_blocks.<locals>.<listcomp>infrF   merge_alignsZgroup_id)r9   appendr   r   ra   r   r5   r7   allrS   floatcopyr8   r^   r6   ))rj   rk   r#   Zblocks_to_mergeZnon_merge_blocksr3   blockZmerged_groupsZcurrent_groupZcurrent_indicesZcurrent_alignsrf   rl   r:   ri   Z
prev_blockre   Z
prev_labelrd   Zblock_labelZiou_hZis_crossZis_updown_alignZ
align_modeZgroup_rangesZgroup_indicesr\   startendZresult_blocksZgroup_foundZimgsrv   r?   r@   Zaspect_ratiorA   rh   Z
merged_imgZinsert_listZn_idxr   )rj   rb   rk   rq   r   merge_blocks   s   
$
$




"





(r~   c              
      s   ddl  d fdd	}dd |D \}}}}|| }|| }	|  }
 j|
||f||fdd	d
  j}d}|||t||	dd\}}}tdt|| }||| d  }||	| d  } j|
|||f||d| j	d |
S )aD  
    Fill a rectangular area in the image with a white background and write the given token string.

    Args:
        image (np.ndarray): Image to paint on.
        box (tuple): (x1, y1, x2, y2) coordinates of rectangle.
        token_str (str): Token string to write.

    Returns:
        np.ndarray: Modified image.
    r   N?c                    sz   d\}}|}|| dkr8|| d } j | ||dd\\}}	}
||| k r0|	|| k r0|}|}n|}|| dks|||	fS )N)g?
   g{Gz?r   r-   )	thickness)ZgetTextSize)rn   ZfontFaceZsquare_size
fill_ratiorc   rY   Zoptimal_scalemidr?   r@   _cv2r   r   get_optimal_font_scale  s   
z+paint_token.<locals>.get_optimal_font_scalec                 S   s   g | ]}t |qS r   )int)r&   vr   r   r   r(     s    zpaint_token.<locals>.<listcomp>rW   )colorr      )r   r-   r   )r   r   r   )ZlineType)r   )
r   rz   Z	rectangleZFONT_HERSHEY_SIMPLEXr   r   mathfloorZputTextZLINE_AA)r0   r'   	token_strr   r;   r<   r=   r>   Zbox_wZbox_hrF   ZfontZthickness_scale_ratioZ
font_scaleZtext_wZtext_hZfont_thicknessZtext_xZtext_yr   r   r   paint_token  s4   
r   c                    s  dd }ddl }|d i }|\}}}}	g  |t|}
||
 t|D ]Q\}}|d \}}}}||krw||krw||krw||	krw | t|| || dk rSq&|| || || || g}dt|
|  d	 }t| ||} |d
 ||< q& fddt|D }| ||fS )a  
    Replace figures in a table area with tokens, return new image and token map.

    Args:
        table_block_img (np.ndarray): Table image.
        table_box (list): Table bounding box [x_min, y_min, x_max, y_max].
        figures (List[Dict]): List of figure dicts (must contain 'coordinate', 'path').

    Returns:
        Tuple[np.ndarray, Dict[str, str], List[str]]:
            - New table image,
            - Token-to-img HTML map,
            - List of figure paths dropped.
    c                 S   sN   h d}g }d}t || k r%tt||@ s|| |d7 }t || k s|S )N>   109r   r-   )r7   r5   rZ   rw   )numZexclude_digitsseqr:   r   r   r   gen_random_map  s   
z0tokenize_figure_of_table.<locals>.gen_random_mapr   Ni   r+      [F]pathc                    s    g | ]\}}| v r|d  qS )r   r   )r&   r:   fZ
drop_idxesr   r   r(          z,tokenize_figure_of_table.<locals>.<listcomp>)	randomseedr7   shuffler9   rw   r   rZ   r   )Ztable_block_imgZ	table_boxZfiguresr   r   Z	token_mapZtable_x_minZtable_y_minZtable_x_maxZtable_y_maxZ
random_mapZ	figure_idZfigureZfigure_x_minZfigure_y_minZfigure_x_maxZfigure_y_maxZdraw_boxr   Zdrop_figuresr   r   r   tokenize_figure_of_table  s8   




r   c                    s     fdd}d}t ||| S )z
    Replace tokens in a string with their HTML image equivalents.

    Args:
        table_res_str (str): Table string with tokens.
        figure_token_map (dict): Mapping from tokens to HTML img tags.

    Returns:
        str: Untokenized string.
    c                    s   |  d}d| d} ||  d}|d }|d u r#|  dS g }|d|dddd	 d|}|jdkrJ|j}|d
| d
 7 }|S )Nr-   r   r   r   z<img src="{}" alt="Image"" />z-
 
 

)groupgetrw   formatreplacejoincontent)matchZtoken_idtokenZimg_pathZ	img_blockZimg_tagsZ
image_infoZocr_contentfigure_token_mapimage_path_to_obj_mapr   r   repl  s"   



z(untokenize_figure_of_table.<locals>.replz
\[F(\d+)\])resub)Ztable_res_strr   r   r   patternr   r   r   untokenize_figure_of_table  s   r   c                   @   s   e Zd ZU dZdZeed< dZeed< eed< eed< eed< eed< eed	< d
Z	e
ed< d
Ze
ed< d
Ze
ed< eddededefddZdS )	TableCella  
    TableCell represents a single cell in a table.

    Attributes:
        row_span (int): Number of rows spanned.
        col_span (int): Number of columns spanned.
        start_row_offset_idx (int): Start row index.
        end_row_offset_idx (int): End row index (exclusive).
        start_col_offset_idx (int): Start column index.
        end_col_offset_idx (int): End column index (exclusive).
        text (str): Cell text content.
        column_header (bool): Whether this cell is a column header.
        row_header (bool): Whether this cell is a row header.
        row_section (bool): Whether this cell is a row section.
    r-   row_spancol_spanstart_row_offset_idxend_row_offset_idxstart_col_offset_idxend_col_offset_idxrn   Fcolumn_header
row_headerrow_sectionbefore)r   datar   c                 C   sl   t |tr4d|v r|S |d dd}t|s0|dd}|r,|D ]
}||d d 7 }q!| }||d< |S )z
        Create TableCell from dict, extracting 'text' property correctly.

        Args:
            data (Any): Input data.

        Returns:
            Any: TableCell-compatible dict.
        rn   Zbboxr   r   Ztext_cell_bboxesNr   )rD   r   r   r7   popstrip)clsr   rn   Z
text_cellselr   r   r   from_dict_formatC  s   
zTableCell.from_dict_formatN)__name__
__module____qualname____doc__r   r   __annotations__r   rZ   r   boolr   r   r   classmethodr   r   r   r   r   r   r   '  s   
 r   c                   @   sX   e Zd ZU dZg Zee ed< dZe	ed< dZ
e	ed< eedeee  fddZd	S )
	TableDataz
    TableData holds a table's cells, row and column counts, and provides a grid property.

    Attributes:
        table_cells (List[TableCell]): List of table cells.
        num_rows (int): Number of rows.
        num_cols (int): Number of columns.
    table_cellsr   num_rowsnum_colsr   c                    s|    fddt  jD } jD ],}t t|j jt|j jD ]}t t|j jt|j jD ]}||| |< q1q q|S )z
        Returns a 2D grid of TableCell objects for the table.

        Returns:
            List[List[TableCell]]: Table as 2D grid.
        c                    s$   g | ]  fd dt jD qS )c              	      s&   g | ]}t d   d ||d dqS )r   r-   )rn   r   r   r   r   )r   r&   rA   r:   r   r   r(   u  s    z-TableData.grid.<locals>.<listcomp>.<listcomp>)r6   r   )r&   selfr   r   r(   t  s    
z"TableData.grid.<locals>.<listcomp>)	r6   r   r   r   r   r   r   r   r   )r   
table_datacellr:   rA   r   r   r   gridk  s    
	
	zTableData.gridN)r   r   r   r   r   r   r   r   r   r   r   r   propertyr   r   r   r   r   r   ]  s   
 	r   z<nl>z<fcel>z<ecel>z<lcel>z<ucel>z<xcel>z+(?:<fcel>|<ecel>|<nl>|<lcel>|<ucel>|<xcel>)z.*?(?=z|$))flagssc              	   C   sL   dd ttttttg d }t|| }t	|| }dd |D }||fS )z
    Extract OTSL tags and text parts from the input string.

    Args:
        s (str): OTSL string.

    Returns:
        Tuple[List[str], List[str]]: (tokens, text_parts)
    (|)c                 S   s   g | ]}|  r|qS r   r   )r&   r   r   r   r   r(     rt   z0otsl_extract_tokens_and_text.<locals>.<listcomp>)
r   OTSL_NL	OTSL_FCEL	OTSL_ECEL	OTSL_LCEL	OTSL_UCEL	OTSL_XCELr   findallsplit)r   r   tokensZ
text_partsr   r   r   otsl_extract_tokens_and_text  s   r   c                    s~  t  dd t| fddD }g }d}d}|rtdd |D }|D ]}t||k r7|t t||k s,q$g }d}	|D ]N}|D ]4}
||
 |	t| k rv| |	 |
krv|	d7 }	|	t| k rv| |	 t tttt	t
fvrv|| |	  |	d7 }	qB|t  |	t| k r| |	 t kr|	d7 }	q>|} d	d
 }dd }t| D ]\}}d}|ttfv r!d}d}d}|tkr| |d  }d}|| t| k r| ||  nd}d}|d t|k r|t||d  k r||d  | }|tt
fv r||||d |tt
g7 }|t	t
fv r|||||d t	t
g7 }|t| ||||| ||| d |tttt	t
fv r/|d7 }|t kr:|d7 }d}q||fS )a  
    Parse OTSL text and tags into TableCell objects and tag structure.

    Args:
        texts (List[str]): List of tokens and text.
        tokens (List[str]): List of OTSL tags.

    Returns:
        Tuple[List[TableCell], List[List[str]]]: (table_cells, split_row_tokens)
    c                 S   s   g | ]
\}}|st |qS r   )list)r&   xyr   r   r   r(     s    z$otsl_parse_texts.<locals>.<listcomp>c                    s   |  kS ro   r   )zZ
split_wordr   r   <lambda>  s    z"otsl_parse_texts.<locals>.<lambda>r   c                 s       | ]}t |V  qd S ro   r7   r&   rowr   r   r   rr     rs   z#otsl_parse_texts.<locals>.<genexpr>r-   c                 S   sP   d}|}| | | |v r&|d7 }|d7 }|t | | kr|S | | | |v s|S Nr   r-   r   )r   c_idxr_idxwhich_tokensspanZ
c_idx_iterr   r   r   count_right  s   z%otsl_parse_texts.<locals>.count_rightc                 S   sL   d}|}| | | |v r$|d7 }|d7 }|t | kr|S | | | |v s|S r   r   )r   r   r   r   r   Z
r_idx_iterr   r   r   
count_down  s   z$otsl_parse_texts.<locals>.count_downr   r   )rn   r   r   r   r   r   r   )r   	itertoolsgroupbyr   r7   rw   r   r   r   r   r   r9   r   r   )textsr   split_row_tokensr   r   r   Zmax_colsr   Z	new_textsZtext_idxr   r   r   r:   rn   Z	cell_textr   r   Zright_offsetZnext_right_cellZnext_bottom_cellr   r   r   otsl_parse_texts  s   





r   r   c              
   C   s  | j }| j}t| jdkrdS d}| j}t|D ]f}|d7 }t|D ]W}|| | }|j|j}}	|j|j	}
}|	|ks@||krAq"t
|j }|jrNdnd}| }|dkr_|d| d7 }|
dkrk|d	|
 d7 }|d
| d| d| d7 }q"|d7 }qd| d}|S )z
    Export TableData to HTML table.

    Args:
        table_data (TableData): TableData object.

    Returns:
        str: HTML string.
    r   r   z<tr>thtdr-   z
 rowspan=""z
 colspan="<>z</z</tr>z<table>z</table>)r   r   r7   r   r   r6   r   r   r   r   htmlescapern   r   r   )r   ZnrowsZncolsbodyr   r:   rA   r   ZrowspanZrowstartZcolspanZcolstartr   ZcelltagZopening_tagr   r   r   export_to_html$  s2   

r  otsl_strc                    s  t | tsJ |  } t| vr| t S | t}g }|D ].}|s!qt|}|s)qt|}d}t|D ]\}}|	t
r@|d }q3||||d q|sOtS |rZtdd |D nd}	|rgtdd |D nd}
|	}t|	|
}td}|}t||d D ] t fdd|D }||k r|} }q}g }|D ](}|d	 }t|}||kr|d
| }ntg||  }|| }|d| qt|t S )z
    Pad OTSL string to a square (rectangular) format, ensuring each row has equal number of cells.

    Args:
        otsl_str (str): OTSL string.

    Returns:
        str: Padded OTSL string.
    r   r-   )	raw_cells	total_lenmin_lenc                 s       | ]}|d  V  qdS )r  Nr   r   r   r   r   rr   i  rs   z%otsl_pad_to_sqr_v2.<locals>.<genexpr>c                 s   r  r  Nr   r   r   r   r   rr   j  rs   ru   c                 3   s     | ]}t |d    V  qdS r  r`   r   rL   r   r   rr   q  s    r
  Nr   )rD   rZ   r   r   r   OTSL_FIND_PATTERNr   r7   r9   
startswithr   rw   r   ry   r6   rP   r   r   )r	  linesZrow_dataliner
  r  r  r:   Zcell_strZglobal_min_widthZmax_total_lenZsearch_startZ
search_endZmin_total_costZoptimal_widthZcurrent_total_costZrepaired_linesr   cellsZcurrent_lenZ	new_cellspaddingr   r  r   otsl_pad_to_sqr_v2I  sZ   





r  otsl_contentc                 C   sR   t | } t| \}}t||\}}tt||r tdd |D nd|d}t|S )z
    Convert OTSL-v1.0 string to HTML. Only 6 tags allowed: <fcel>, <ecel>, <nl>, <lcel>, <ucel>, <xcel>.

    Args:
        otsl_content (str): OTSL string.

    Returns:
        str: HTML table.
    c                 s   r   ro   r   r   r   r   r   rr     rs   z'convert_otsl_to_html.<locals>.<genexpr>r   )r   r   r   )r  r   r   r   r7   r   r  )r  r   Zmixed_textsr   r   r   r   r   r   convert_otsl_to_html  s   
r  c                 C   sT   t | }td|d d D ]}|| dkr'| d| }|||  | kr'|  S qdS )z
    Find the shortest substring that repeats to form the entire string.

    Args:
        s (str): Input string.

    Returns:
        str or None: Shortest repeating substring, or None if not found.
    r-   r   r   N)r7   r6   )r   nr:   Z	substringr   r   r   !find_shortest_repeating_substring  s   
r     r_   r  min_repeatsc                 C   s   t t| | |d dD ]<}| | d }| || rHd}| }||r5|d|  }|d7 }||s%t| ||  }| d| ||f  S qdS )a!  
    Detect if string ends with a repeating phrase.

    Args:
        s (str): Input string.
        min_len (int): Minimum length of unit.
        min_repeats (int): Minimum repeat count.

    Returns:
        Tuple[str, str, int] or None: (prefix, unit, count) if found, else None.
    r-   r   Nr   )r6   r7   endswith)r   r  r  r:   unitcountZtemp_sstart_indexr   r   r   find_repeating_suffix  s   

r!  r     r   line_thresholdchar_threshold	min_countc                 C   s  t | |k r| S |  }|s| S d|vr6t |dkr6t|ddd}|r6|\}}}	t ||	 t |d kr6|S d|vrTt ||krTt|}|rTt |t | }	|	|krT|S dd | dD }
|
sb| S t |
}||k rl| S t|
}|d	d
 \}}	|	|kr|	| dkr|S | S )a  
    Detect and truncate character-level, phrase-level, or line-level repetition in content.

    Args:
        content (str): Input text.
        line_threshold (int): Min lines for line-level truncation.
        char_threshold (int): Min repeats for char-level truncation.
        min_len (int): Min length for char-level check.

    Returns:
        Union[str, str]: (truncated_content, info_string)
    r   d   r  r_   )r  r  r.   c                 S   s   g | ]
}|  r|  qS r   r   )r&   r  r   r   r   r(     s    z/truncate_repetitive_content.<locals>.<listcomp>r-   r   g?)r7   r   r!  r  r   r   most_common)r   r#  r$  r  r%  Zstripped_contentZsuffix_matchprefixZrepeating_unitr  r  Ztotal_linesZline_countsZmost_common_liner   r   r   truncate_repetitive_content  s8   
r)  c                 C   s   dd l }t| jdkr|| |j}n|  }|jtjkr#|	tj}|
 }| }||kr1| S || ||  d }|	tj}||dd|j\}}||}|d u rW| S ||\}	}
}}| |
|
| |	|	| f }|S )Nr   rg   rX      )r   r7   shapeZcvtColorZCOLOR_BGR2GRAYrz   ZdtyperH   Zuint8Zastyper   r   	thresholdZTHRESH_BINARY_INVZfindNonZeroZboundingRect)rF   r   grayZmax_valZmin_valr   r   binaryZcoordsr   r   r?   r@   Zcroppedr   r   r   crop_margin  s&   
r/  z#<\|TEXT_START\|>(.*?)<\|TEXT_END\|>z!<\|LOC_BEGIN\|>(.*?)<\|LOC_END\|>z<\|LOC_(\d+)\|>r0   c                 C   s   | j dd \}}|dk r?|dk r?t| } |d |d }}ztjj}W n ty0   tj}Y nw | ||f|} t| }|S | }|S )zG
    Post-process the input image to extract location information.
    Nr   i  )r+  rG   r	   Z
ResamplingZLANCZOSAttributeErrorresizerJ   )r0   r@   r?   Z	process_wZ	process_hZresample_filterZinference_imgr   r   r   pre_process_for_spotting%  s   
r2  	input_strr?   r@   c                    s  t | tsJ t| }t| }g }g }tt|t|}t|D ]B}||  }	t	|| }
t|
dk r8q"t
tt|
dd fddtdddD } fdd|D }|| ||	 q"|ri|st
t| }d}d}|d t|k r|||d  }d	d |D fd
dtdddD } fdd|D }| ||d   }| }	||	 || |d  }|d7 }|d t|k s|d|}||d}||fS )zL
    Post-process the input string to extract text and location blocks.
    r  Nc                        g | ]} |  |d   fqS r-   r   r   valsr   r   r(   R  r   z-post_process_for_spotting.<locals>.<listcomp>r   r   c                    ,   g | ]}|d  d  |d d   fqS r   g     @@r-   r   r&   pr@   r?   r   r   r(   S     ,    c                 S   s   g | ]	}t |d qS r5  )r   r   )r&   mr   r   r   r(   ^  s    c                    r4  r5  r   r   r6  r   r   r(   _  r   c                    r8  r9  r   r:  r<  r   r   r(   `  r=  r   r   )	rec_polys	rec_texts)rD   rZ   ANNOT_TEXT_REr   LOC_BLOCK_REr   r7   r6   r   LOC_ITEM_REr   mapr   rw   LOC_TOKEN_REfinditerr|   r}   r   )r3  r?   r@   r   Z
loc_blocksr@  rA  r  r:   txtZ	loc_itemsZptsmatchesZlast_endr   Z	text_spanZ
result_strZspotting_resr   )r@   r7  r?   r   post_process_for_spotting:  sF   






rJ  )r   )rT   rU   )rU   )r  r_   )r   r   r   r"  )Er  r   r   r   collectionsr   rz   r   typingr   r   r   r   r   numpyrH   ZPILr	   Zpydanticr
   r   r   Zlayout_parsing.utilsr   r   r   r   r   rZ   ry   r!   rC   rG   rJ   rS   r^   r~   r   r   r   r   r   r   r   r   r   r   r   ZNON_CAPTURING_TAG_GROUPcompileDOTALLr  r   r   r  r  r  r  r!  r)  r/  SrB  rC  rD  rF  Zndarrayr2  rJ  r   r   r   r   <module>   s   	
&
C

) !><$62s%:

9 

