o
    *jY                     @   s   d dl mZ d dlmZmZ d dlZd dlZd dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d
dlmZ ejejejdG dd deZdS )    N)AnyDict)MosesDetokenizerMosesPunctNormalizerMosesTokenizer)	apply_bpe)Preprocessors)Preprocessor)PREPROCESSORS)Config)Fields	ModelFile   )	TextClean)module_namec                       sH   e Zd ZdZ	ddedef fddZdedeeef fd	d
Z	  Z
S )CanmtTranslationPreprocessorz3The preprocessor used in text correction task.
    N	model_dir
max_lengthc                    s  ddl m} 	 t j|i | tt|tj	| _
|t|d| _|t|d| _| j | _|d ur=|d nd| _| j
d d | _| j
d d	 | _t | _| jd
kr]t| _nt| jd| _t| jd| _t|| j
d d d | _tt| j| _d S )Nr   )
Dictionaryzdict.src.txtzdict.tgt.txtr      Zpreprocessorsrc_langtgt_langzh)langZsrc_bpefile)Zfairseq.datar   super__init__r   	from_fileospjoinr   ZCONFIGURATIONcfgload	vocab_srcZ	vocab_tgtpadpadding_valuer   r   r   r   tcjiebatokr   Zpunct_normalizerr   Zsrc_bpe_pathr   ZBPEopenbpe)selfr   r   argskwargsr   	__class__ o/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/preprocessors/nlp/canmt_translation.pyr      s*   
z%CanmtTranslationPreprocessor.__init__inputreturnc                    s8   j dkr j|} j|}dt|}n fdd|D } fdd|D } j|	 
 }ddd |D } jj|ddd	}tj|d
d}| d }t j|}tj jg||  |jd}	tjt||	gdd}
tjt|	|gdd}tjt||	gdd}t|g}||||
d}|S )u  process the raw input data

        Args:
            data (str): a sentence
                Example:
                    '随着中国经济突飞猛近，建造工业与日俱增'
        Returns:
            Dict[str, Any]: the preprocessed data
            Example:
            {'net_input':
                {'src_tokens':tensor([1,2,3,4]),
                'src_lengths': tensor([4])}
            }
        r    c                    s   g | ]} j |qS r/   )Z_punct_normalizer	normalize.0itemr*   r/   r0   
<listcomp>N   s    z9CanmtTranslationPreprocessor.__call__.<locals>.<listcomp>c                    s   g | ]} j j|d d dqS )T)Z
return_strZaggressive_dash_splits)r'   tokenizer5   r8   r/   r0   r9   O   s    c                 S   s   g | ]}|qS r/   r/   )r6   xr/   r/   r0   r9   V   s    TF)Z
append_eosZadd_if_not_existr   )Zshiftsr   )dtype)dim)Z
src_tokensZsrc_lengthsZprev_src_tokenssources)r   r%   cleanr'   cutr   listr)   Zprocess_linestripsplitr"   Zencode_linetorchZrollsizeminr   Ztensorr$   r<   Z	unsqueezecat)r*   r1   Z	input_tokZ	input_bpetextZinputsZprev_inputslengthsmax_lenpaddingr>   outr/   r8   r0   __call__:   s@   

z%CanmtTranslationPreprocessor.__call__)N)__name__
__module____qualname____doc__strintr   r   r   rM   __classcell__r/   r/   r-   r0   r      s    "!r   )Zos.pathpathr   typingr   r   r&   rD   Z
sacremosesr   r   r   Zsubword_nmtr   Zmodelscope.metainfor   Zmodelscope.preprocessors.baser	   Z modelscope.preprocessors.builderr
   Zmodelscope.utils.configr   Zmodelscope.utils.constantr   r   Z
text_cleanr   Zregister_moduleZnlpZcanmt_translationr   r/   r/   r/   r0   <module>   s    