o
    "j                     @   s   d Z ddlZddlZddlmZ g ZdZdZdZ	dZ
dZd	Zd
ZdZdZdZdd Zdd Zeddddddd Zeddddddd Zeddddddd Zedddddd"ddZedddddd d! ZdS )#a  
WMT14 dataset.
The original WMT14 dataset is too large and a small set of data for set is
provided. This module will download dataset from
http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse training set and test set into paddle reader creators.

    N)
deprecatedzJhttp://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgzZ 7d7897317ddd8ba0ae5c5fa7248d3ff5z/http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgzZ 0791583d57d5beb693b9414c5b36798cz1http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgzZ 0cb4a5366189b6acba876491c8724fa3z<s>z<e>z<unk>   c                 C   s   dd }t j| dd>}dd |D }t|dksJ |||d |}d	d |D }t|dks4J |||d |}||fW  d    S 1 sLw   Y  d S )
Nc                 S   s8   i }t | D ]\}}||k r|||  < q |S |S )N)	enumeratestripdecode)fdsizeZout_dict
line_countline r   U/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/dataset/wmt14.py	__to_dict2   s   z!__read_to_dict.<locals>.__to_dictrmodec                 S      g | ]}|j d r|j qS )zsrc.dictnameendswith.0Z	each_itemr   r   r   
<listcomp><       
z"__read_to_dict.<locals>.<listcomp>   r   c                 S   r   )ztrg.dictr   r   r   r   r   r   C   r   )tarfileopenlenextractfile)tar_file	dict_sizer   fnamessrc_dicttrg_dictr   r   r   __read_to_dict1   s   	$r$   c                    s    fdd}|S )Nc                  3   s&   t \ tjddz} fdd| D }|D ]e}| |D ]]}| }| d}t|dkr6q"|d }| } fddtg| t	g D }|d	 }| }	fd
d|	D }
t|dksjt|
dkrkq"|
t	 g }t g|
 }
||
|fV  q"qW d    d S 1 sw   Y  d S )Nr   r   c                    s   g | ]}|j  r|j qS r   r   r   )	file_namer   r   r   Q   r   z2reader_creator.<locals>.reader.<locals>.<listcomp>	r   r   c                       g | ]}  |tqS r   getUNK_IDXr   w)r"   r   r   r   ^   s    
r   c                    r'   r   r(   r+   )r#   r   r   r   e   s    P   )
r$   r   r   r   r   r   splitr   STARTEND)r    r!   r   r
   Z
line_splitZsrc_seqZ	src_wordsZsrc_idsZtrg_seqZ	trg_wordsZtrg_idsZtrg_ids_nextr   r%   r   )r"   r#   r   readerN   s8   

"zreader_creator.<locals>.readerr   )r   r%   r   r2   r   r1   r   reader_creatorM   s   !r3   z2.0.0zpaddle.text.datasets.WMT14r   z>Please use new dataset API which supports paddle.io.DataLoader)ZsinceZ	update_tolevelreasonc                 C      t tjjtdtd| S )a  
    WMT14 training set creator.

    It returns a reader creator, each sample in the reader is source language
    word ID sequence, target language word ID sequence and next word ID
    sequence.

    :return: Training reader creator
    :rtype: callable
    wmt14ztrain/trainr3   paddledatasetcommondownload	URL_TRAIN	MD5_TRAINr   r   r   r   trainr   
   r@   c                 C   r6   )z
    WMT14 test set creator.

    It returns a reader creator, each sample in the reader is source language
    word ID sequence, target language word ID sequence and next word ID
    sequence.

    :return: Test reader creator
    :rtype: callable
    r7   z	test/testr8   r?   r   r   r   test   rA   rB   c                 C   r6   )Nr7   zgen/genr8   r?   r   r   r   gen   s
   rC   Tc                 C   sP   t jjtdt}t|| \}}|r$dd | D }dd | D }||fS )Nr7   c                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>       zget_dict.<locals>.<dictcomp>c                 S   rD   r   r   rE   r   r   r   rH      rI   )r9   r:   r;   r<   r=   r>   r$   items)r   reverser   r"   r#   r   r   r   get_dict   s   	rL   c                   C   s(   t jjtdt t jjtdt d S )Nr7   )r9   r:   r;   r<   r=   r>   	URL_MODEL	MD5_MODELr   r   r   r   fetch   s   rO   )T)__doc__r   Zpaddle.dataset.commonr9   Zpaddle.utilsr   __all__ZURL_DEV_TESTZMD5_DEV_TESTr=   r>   rM   rN   r/   r0   ZUNKr*   r$   r3   r@   rB   rC   rL   rO   r   r   r   r   <module>   sj   	%


