o
    0jR0                     @   sD   d dl Z dd Zdd Zdd Zdd	 Zd
d Zdd Zdd ZdS )    Nc                 C   s   | | dkrdS |dkr| |d  nd}|d t | k r"| |d  nd}| s,| r.dS | s6| r8dS |dv r>dS dS )zH
    Check if the given character is a sentence ending punctuation.
    .Fr       )r    	
"'u   ”u   ’)u   】u   」u   》T)lenisdigitisalpha)textiprevnext r   t/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/inference/pipelines/pp_doctranslation/utils.py_is_sentence_dot   s    r   c                 C   s<  t | d }g d}t|t | D ]}| | |v rP|d }|t | k r9| | dv r9|d7 }|t | k r9| | dv s)|t | k rOt | d| |krO||f  S q| | dkrt| |r|d }|t | k r{| | dv r{|d7 }|t | k r{| | dv sk|t | k rt | d| |kr||f  S qt|ddD ]w}| | |v r|d }|t | k r| | dv r|d7 }|t | k r| | dv st | d| |kr||f  S q| | dkrt| |r|d }|t | k r| | dv r|d7 }|t | k r| | dv st | d| |kr||f  S qt|t | t|t | fS )	z
    Find the position to split the text into two chunks.

    Args:
        text (str): The original text to be split.
        chunk_size (int): The maximum size of each chunk.

    Returns:
        int: The index where the text should be split.
       )r   u   。;u   ；!u   ！?u   ？r   z 	
Nr   r   )r   ranger   min)r   
chunk_sizecenterZsplit_charsr   jr   r   r   _find_split_pos)   sJ     r   c           
      C   sz   |   } t| |kr|| S t| |\}}| d| }| |d }| || }|r/t|||}|r7t|||}	|| |	 S )af  
    Split the text recursively and translate each chunk.

    Args:
        text (str): The original text to be split.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None
    N)stripr   r   split_text_recursive)
r   r   translate_funcZ	split_posZend_whitespaceleftright
whitespaceZ	left_textZ
right_textr   r   r   r!   Z   s   r!   c                 C   s   |   d}|d ds|d drB|d }|d ds'|d dr+|d nd}|r8d|dd nd|dd }nd}d}| }t|||}d	d
 |dD }	d|	}
|ri| d|
 d| n|
}|| dS )a{  
    Translate a code block and append the result to the results list.

    Args:
        code_block (str): The code block to be translated.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None
    r   r   ```~~~r   r   r   Nc                 S   s,   g | ]}|  d s|  ds|qS )r&   r'   )r    
startswith).0liner   r   r   
<listcomp>   s    z(translate_code_block.<locals>.<listcomp>)r    splitr(   joinr!   append)Z
code_blockr   r"   resultslinesheaderfooterZcode_contentZtranslated_code_linesZfiltered_code_linesZtranslated_coderesultr   r   r   translate_code_blockx   s,   
*
r4   c           &      C   s  ddl }ddlm} | ddk r3| ddk r3| d| dkr3t| |k r3|| }|| dS || d}t }g }	g }
|jddd	D ]*}|d
dg}|rpt	||vrp|
  }|ri|	| |
| |t	| qF|}d}|t|	k rg }g }d}|t|	k r|t|
|  |kr||	|  ||
|  |t|
| 7 }|d7 }|t|	k r|t|
|  |ksd}||}||}||}t||D ]\}}|  ||d}|jD ]
}||| qq|t|	k s{g }|jddd	D ]}|d
dgs| r|| qd}t|}||k rg }g }td}||k rs||  }t||krHt|||} || |  |d7 }q#d| d}!|t|! |krXn|||  ||! |t|!7 }|d7 }||k s(|s||d   }d| d}!||d  g}|!g}|rdd| d }||}||d}"|"d}#t||#D ]\}$}%|$|%
  q||k s|t| dS )a{  
    Translate a HTML block and append the result to the results list.

    Args:
        html_block (str): The HTML block to be translated.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None
    r   NBeautifulSoup<   >html.parserT)string	recursivetdthr   Z__TD__z	<ol></ol>z<li>z</li>z<ol>r   z</ol>Zli)copybs4r6   countr   r.   setZfind_allZfind_parentidZdecode_contentsr    addr-   r,   zipclearcontentsdeepcopyr!   Zreplace_withstr)&Z
html_blockr   r"   r/   r?   r6   Z
translatedsoupZtd_seenZtd_batch_nodesZtd_batch_textsnodeZ	parent_tdZtd_textZ
batch_sizer   Zbatch_nodesZbatch_textsZcurrent_lengthplaceholderZ
batch_textZtranslated_batchZtranslated_linesZtd_noder*   fragchildZ
text_nodesidxtotalZli_textsZ	node_textZtranslated_textZli_strZ
trans_soupZtranslated_lisZ	orig_nodeZli_tagr   r   r   translate_html_block   s   















#rQ   c                 C   s  ddl m} || d}g }g }d}d}d}|jD ].}t|dr@|jdur@t|}	||}
||	 ||
 ||
7 }|d7 }q|t|7 }qg }t||}d}t	|D ]4\}}|\}}|t
|k r|| |v r||| || }|d7 }d	|f||< |t
|k r|| |v shqT|S )
z.
    Split the original text into chunks.
    r   r5   r:   z<<HTML_BLOCK_{}>>r   nameNr   html)r@   r6   rG   hasattrrR   rI   formatr.   split_and_append_text	enumerater   replace)r   r6   rJ   Zhtml_blocksZhtml_placeholdersZplaceholder_fmtZtext_after_placeholderindexelemZhtml_strrL   Zsplited_blockZcurrent_indexrO   block_contentr   r   r   split_original_texts  s@   






r^   c                 C   s   |  rltdtj}d}||D ]5}| |kr:|||  }td|}|D ]}|  r9| d|  f q*| d| f |	 }q|t
|k rl||d }td|}|D ]}|  rk| d|  f q\| S )a  
    Split the text and append the result to the result list.

    Args:
        result (list): The current result list.
        text_content (str): The text content to be processed.

    Returns:
        list: The updated result list after processing the text content.
    z(```.*?\n.*?```|~~~.*?\n.*?~~~)r   z\n{2,}r   codeN)r    recompileDOTALLfinditerstartr,   r.   groupendr   )r3   Ztext_contentZcode_patternZlast_posmZnon_codeZ
paragraphspr   r   r   rV   H  s*   
rV   )r`   r   r   r!   r4   rQ   r^   rV   r   r   r   r   <module>   s   1+w.