o
    0j1"                     @   sx  d dl Z d dlmZ d dlZe jde jde jde jde de de jde jdd	Zd
efddZ	i ddddddddddddddddddddddddddddddddddi ddd dd!dd"dd#dd$dd%dd&dd'dd(dd)dd*dd+dd,dd-dd.dd/ddddd0Z
d1d2 Zd<d4d5Zd6d7 Zd8d9 Zd:d; ZdS )=    N)Counteru   ^\s*([IVX]+)(?:[\.．\)\s]|$))flagsu   ^\s*([A-Z])(?:[\.．\)\s])u3   ^\s*(\d+(?:\.\d+)*)(?![）)])(?:[\.]?\s*|(?=[A-Z]))u&   ^\s*(?:[\(（])?(\d+(?:\.\d+)*)[\)）]u~   ^\s*(?:第|[（\(])?([一二三四五六七八九十]{1,2})(?:[章节篇卷部条题讲课回）\)]|(?![a-zA-Z\u4e00-\u9fa5])))ROMANLETTERNUM_LISTNUM_LIST_WITH_BRACKETCHINESE_NUMcontentc                 C   s   t |  }td |rdS td |rdS td |r!dS td |r*dS td	 |rFtd	 |d
} | dd
 }d	|fS dS )z7
    Extract numbering type and its semantic level
    r   )ZNUM_LIST_BRACKET   r   )r      r   )r   r   r   )r      r   r   .)N)strstripSYMBOL_PATTERNSmatchgroupcount)r	   txtlevel r   w/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/inference/pipelines/layout_parsing/title_level.pyget_symbol_and_level-   s   r   ZABSTRACTr   ZSUMMARYRESUMEu   绪论u   引言ZCONTENTSZ
REFERENCESZ	REFERENCEu   参考文献ZAPPENDIXZ
APPENDICESu   附录ZACKNOWLEDGMENTSZINTRODUCTIONZBACKGROUNDANDRELATEDWORKZ
BACKGROUNDZRELATEDWORKZTHEORETICALMODELSZDATAZMETHODZMETHODSZMETHODOLOGYZTOPICANALYSISZRESULTZRESULTSZ
DISCUSSIONZCONCLUSIONSZ
CONCLUSIONZLIMITATIONSu   研究背景u   相关工作u   研究方法u   实验结果u   讨论)u   结论u   致谢u   目录c           
      C   s   ddl }| jdkrdS t| jd }t| jd }t|| jd }t|| jd }|| || }}|| }| j dd }	|dkrNt||	 S t||	 S )	zX
    Calculate the average height of the dominant text lines within a layout block.
    r   N	doc_titler   r      
g      ?)mathlabelintZbboxceilr	   r   r   )
blockr   x1y1Zx2y2hwZaspect_ratioZ	lines_numr   r   r   get_title_heightq   s   
r(   r
   c                    s   ddl m} dd | D }tt|}t|dkri S t|t|}t|dd}||ddd	}|	| |j
d}t| }	d
d t|	D }
i }|D ]  fdd|D }tt|}|
| | < qP|S )zC
    Cluster heading heights to infer level based on font size
    r   )KMeansc                 S   s   g | ]}|d  qS )heightr   ).0er   r   r   
<listcomp>   s    z*cluster_global_heights.<locals>.<listcomp>r   r   *   auto)Z
n_clustersZrandom_stateZn_initc                 S   s   i | ]\}}t ||d  qS )r   )r    )r+   Znew_idxoldr   r   r   
<dictcomp>   s    z*cluster_global_heights.<locals>.<dictcomp>c                    s   g | ]}t  | qS r   )abs)r+   cr&   r   r   r-      s    )Zsklearn.clusterr)   sortedsetlenminnparrayZreshapefitZcluster_centers_Zargsort	enumerater    Zargmin)entriesZ
k_clustersr)   ZheightsZuniqkXkmZcentersorderZold2newmappingdistsr0   r   r4   r   cluster_global_heights   s$   
rD   c                 C   sJ   i }d}t | D ]\}}|| \}}|dkr"||vr"|||< |d7 }q|S )z@
    Assign a global ordering to different numbering styles
    r   r   )r<   )r=   title_symbol_levelseqZcounteridxr,   symbolr   r   r   r   compute_global_symbol_seq   s   rI   c                 C   s  i }t | D ]\}}t|d \}}|||d< |d< ||f||< qt| }t| |}d}g }	g }
t | D ]\}}|ddkrAq5|| \}}|dkrNd}nt|d   d	ddt
v rdd	}nd
}||d  }|dkr|}|dkr|dkr||||  }n|}||}n||}|||g}t|d}|d d dkr|d d }n|}n|d	krt
t|d   d	dd }n|}t||d< |	|d  |
|d  q5| S )z.
    Compute final level for each heading
    r	   rH   r   r   Zsemanticu   ：:   Zspecial_wordZclusterr*   r   r   )r<   r   rD   rI   getr   upperr   rstripreplaceSPECIAL_KEYWORDSr   most_commonr    append)r=   rE   rG   r,   rH   r   Zcluster_mapZ
global_seqZfirst_num_levelcontentsZlevelsZbucketZcluster_levelZsemantic_levelZrelative_order_levelZvotesrQ   Zfinal_levelr   r   r   compute_levels_for_entries   s\   
 

 rT   c           
      C   s   g }t | D ]\}}|D ]}t|d| || qqg }|D ]&}|jdkrE|j}t|}|du r2q|jdkr9dnd}|||||d qt|}|D ]}	|	d jdkr]t|	d dd |	d }|	d	 |_qL| S )
z;
    Write computed levels back to the parsing results
    
page_indexZparagraph_titleNr   r   )origin_blockr	   r*   r   rV   title_levelr   )r<   setattrrR   r   r	   r(   rT   rW   )
Zblocks_by_pageZparsing_res_listrU   Zone_page_blocksr"   r=   r	   r*   Z
init_levelr,   r   r   r   assign_levels_to_parsing_res  s:   
	rY   )r
   )recollectionsr   numpyr9   compileIr   r   r   rP   r(   rD   rI   rT   rY   r   r   r   r   <module>   s   	
 !"#)
"Q