o
    #j                     @   s`   d dl Z d dlZd dlmZ d dlmZ g ZdZdZ	dZ
dZdZd	Zd
ZdZG dd deZdS )    N)_check_exists_and_download)DatasetzJhttp://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgzZ 7d7897317ddd8ba0ae5c5fa7248d3ff5z/http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgzZ 0791583d57d5beb693b9414c5b36798cz<s>z<e>z<unk>   c                   @   s>   e Zd ZdZ	dddZdd	 Zd
d Zdd ZdddZdS )WMT14a  
    Implementation of `WMT14 <http://www.statmt.org/wmt14/>`_ test dataset.
    The original WMT14 dataset is too large and a small set of data for set is
    provided. This module will download dataset from
    http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz .

    Args:
        data_file(str): path to data tar file, can be set None if
            :attr:`download` is True. Default None
        mode(str): 'train', 'test' or 'gen'. Default 'train'
        dict_size(int): word dictionary size. Default -1.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True

    Returns:
        Dataset: Instance of WMT14 dataset
            - src_ids (np.array) - The sequence of token ids of source language.
            - trg_ids (np.array) - The sequence of token ids of target language.
            - trg_ids_next (np.array) - The next sequence of token ids of target language.
    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.text.datasets import WMT14

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, src_ids, trg_ids, trg_ids_next):
            ...         return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)

            >>> wmt14 = WMT14(mode='train', dict_size=50)

            >>> for i in range(10):
            ...     src_ids, trg_ids, trg_ids_next = wmt14[i]
            ...     src_ids = paddle.to_tensor(src_ids)
            ...     trg_ids = paddle.to_tensor(trg_ids)
            ...     trg_ids_next = paddle.to_tensor(trg_ids_next)
            ...
            ...     model = SimpleNet()
            ...     src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
            ...     print(src_ids.item(), trg_ids.item(), trg_ids_next.item())
            91 38 39
            123 81 82
            556 229 230
            182 26 27
            447 242 243
            116 110 111
            403 288 289
            258 221 222
            136 34 35
            281 136 137

    NtrainTc                 C   st   |  dv sJ d| |  | _|| _| jd u r)|s J dt|ttd|| _|dks1J d|| _|   d S )N)r   testgenz1mode should be 'train', 'test' or 'gen', but got z>data_file is not set and downloading automatically is disabledZwmt14r   z*dict_size should be set as positive number)lowermode	data_filer   	URL_TRAIN	MD5_TRAIN	dict_size
_load_data)selfr   r   r   download r   [/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/text/datasets/wmt14.py__init__b   s    


zWMT14.__init__c                    s  dd }g _ g _g _tjjdd}dd |D }t|dks%J |||d j_	d	d |D }t|dks@J |||d j_
j d
j   fdd|D }|D ]s}||D ]k}| }| d}t|dkr{qg|d }| }fddtg| tg D }	|d }
|
 }fdd|D }t|	dkst|dkrqg|j
t g }j
t g| }j |	 j| j| qgq`W d    d S 1 sw   Y  d S )Nc                 S   s8   i }t | D ]\}}||k r|||  < q |S |S N)	enumeratestripdecode)fdsizeZout_dict
line_countliner   r   r   Z	__to_dict{   s   z#WMT14._load_data.<locals>.__to_dictr)r   c                 S      g | ]}|j d r|j qS )zsrc.dictnameendswith.0Z	each_itemr   r   r   
<listcomp>       
z$WMT14._load_data.<locals>.<listcomp>   r   c                 S   r   )ztrg.dictr    r#   r   r   r   r%      r&   /c                    s   g | ]}|j  r|j qS r   r    r#   )	file_namer   r   r%      r&   	r   c                       g | ]	} j |tqS r   )src_dictgetUNK_IDXr$   wr   r   r   r%      s    c                    r+   r   )trg_dictr-   r.   r/   r1   r   r   r%      s    P   )src_idstrg_idstrg_ids_nexttarfileopenr   lenextractfiler   r,   r2   r   r   r   splitSTARTENDappend)r   Z_WMT14__to_dictfnamesr!   r   Z
line_splitZsrc_seqZ	src_wordsr4   Ztrg_seqZ	trg_wordsr5   r6   r   )r)   r   r   r   z   sV   	

"zWMT14._load_datac                 C   s.   t | j| t | j| t | j| fS r   )nparrayr4   r5   r6   )r   idxr   r   r   __getitem__   s   zWMT14.__getitem__c                 C   s
   t | jS r   )r9   r4   r1   r   r   r   __len__   s   
zWMT14.__len__Fc                 C   s>   | j | j}}|rdd | D }dd | D }||fS )a  
        Get the source and target dictionary.

        Args:
            reverse (bool): wether to reverse key and value in dictionary,
                i.e. key: value to value: key.

        Returns:
            Two dictionaries, the source and target dictionary.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import WMT14
                >>> wmt14 = WMT14(mode='train', dict_size=50)
                >>> src_dict, trg_dict = wmt14.get_dict()

        c                 S      i | ]\}}||qS r   r   r$   kvr   r   r   
<dictcomp>       z"WMT14.get_dict.<locals>.<dictcomp>c                 S   rF   r   r   rG   r   r   r   rJ      rK   )r,   r2   items)r   reverser,   r2   r   r   r   get_dict   s
   zWMT14.get_dict)Nr   r   T)F)	__name__
__module____qualname____doc__r   r   rD   rE   rN   r   r   r   r   r   (   s    :
>r   )r7   numpyrA   Zpaddle.dataset.commonr   Z	paddle.ior   __all__ZURL_DEV_TESTZMD5_DEV_TESTr   r   r<   r=   ZUNKr.   r   r   r   r   r   <module>   s   