o
    *j                     @   s:   d dl Z d dlZd dlZd dlZG dd dZdd ZdS )    Nc                   @   s    e Zd Zdd Zedd ZdS )NLTKSegmenterc                 C   s
   t   d S N)download_nltk)self r   q/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/mglm/data_utils/extraction.pyZ__init   s   
zNLTKSegmenter.__initc                 C   s   t j| S r   )nltktokenizeZsent_tokenize)articler   r   r   segment_string   s   zNLTKSegmenter.segment_stringN)__name__
__module____qualname__Z_NLTKSegmenter__initstaticmethodr   r   r   r   r   r   
   s    r   c                  C   sT  t d d} d}t }t|d}tjtj| dddD ]}tjtj|dd	dD ]}t| g }d}t|d
ddd}|D ]}	|		 }	d|	v rOd	}qBd|	v rg g }
}d g }}|dd  D ]R}t
|dkr|rt
|dksyt
|dkr|
| || n|d | d g }}|
|d  ||dd   qct
|dkr|r||d  qc|d }qc|rt
|dkst
|dkr|
| || n|d | dd |D }|
|d}|t| |d d}g }qB|r|	r||	}|| qBW d    n	1 sw   Y  q-qW d    d S 1 s#w   Y  d S )NZpunktzdata/extractedzformatted/wiki-key.txtw*F)	recursivezwiki_*Tr
zutf-8)modenewlineencodingz<doc id=z</doc>   r   c                 S   s   g | ]}d  |qS ) )join).0contentr   r   r   
<listcomp>B   s    
z!download_nltk.<locals>.<listcomp>)keyr   )r   downloadr   openglobospathr   printrstriplenappendwritejsondumpsr   )Z	wiki_pathZoutput_pathZ	segmenteroutputdirnamefilenameZarticle_linesZarticle_openfilelineZkey_sentencescontentsr   r   Z	sentencesr
   r   r   r   r      s   










$r   )r"   r#   r*   r   r   r   r   r   r   r   <module>   s   
