o
    #j                     @   sP   d dl Z d dlZd dlZd dlmZ d dlmZ g ZdZ	dZ
G dd deZdS )    N)_check_exists_and_download)Datasetz<https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgzZ 30177ea32e27c525793142b6bf2c8e2dc                   @   sP   e Zd ZdZ						ddd	Zdd
dZdd Zdd Zdd Zdd Z	dS )Imikolova  
    Implementation of imikolov dataset.

    Args:
        data_file(str): path to data tar file, can be set None if
            :attr:`download` is True. Default None
        data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'.
        window_size(int): sliding window size for 'NGRAM' data. Default -1.
        mode(str): 'train' 'test' mode. Default 'train'.
        min_word_freq(int): minimal word frequence for building word dictionary. Default 50.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True

    Returns:
        Dataset: instance of imikolov dataset

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.text.datasets import Imikolov

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, src, trg):
            ...         return paddle.sum(src), paddle.sum(trg)


            >>> imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2)

            >>> for i in range(10):
            ...     src, trg = imikolov[i]
            ...     src = paddle.to_tensor(src)
            ...     trg = paddle.to_tensor(trg)
            ...
            ...     model = SimpleNet()
            ...     src, trg = model(src, trg)
            ...     print(src.item(), trg.item())
            2076 2075
            2076 2075
            675 674
            4 3
            464 463
            2076 2075
            865 864
            2076 2075
            2076 2075
            1793 1792

    NNGRAMtrain2   Tc                 C   s   |  dv sJ d| |  | _| dv sJ d| | | _|| _|| _|| _| jd u rA|s8J dt|tt	d|| _| 
|| _|   d S )N)r   SEQz,data type should be 'NGRAM', 'SEQ', but got )r   testz(mode should be 'train', 'test', but got z;data_file is not set and downloading automatically disabledZimikolov)upper	data_typelowermodewindow_sizemin_word_freq	data_filer   URLMD5_build_work_dictword_idx
_load_anno)selfr   r   r   r   r   download r   ^/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/text/datasets/imikolov.py__init__T   s*   	



zImikolov.__init__c                 C   sb   |d u r	t t}|D ]#}|  D ]
}||  d7  < q|d  d7  < |d  d7  < q|S )N   <s><e>)collectionsdefaultdictintstripsplit)r   f	word_freqlwr   r   r   
word_count{   s   
zImikolov.word_countc                    s   d}d}t  jQ}||}||} | |}d|v r%|d=  fdd| D }t|dd d}tt| \}	}
t	tt|	t
t|	}t|	|d< W d    |S 1 s^w   Y  |S )	Nz$./simple-examples/data/ptb.train.txtz$./simple-examples/data/ptb.valid.txt<unk>c                    s   g | ]}|d   j kr|qS )r   )r   ).0xr   r   r   
<listcomp>   s    z-Imikolov._build_work_dict.<locals>.<listcomp>c                 S   s   | d  | d fS )Nr   r   r   )r+   r   r   r   <lambda>   s    z+Imikolov._build_work_dict.<locals>.<lambda>)key)tarfileopenr   extractfiler(   itemssortedlistzipdictrangelen)r   cutoffZtrain_filenameZtest_filenametfZtrainfZtestfr%   Zword_freq_sortedwords_r   r   r,   r   r      s&   



zImikolov._build_work_dictc              	      sp  g _ tj}dj d}||}jd  |D ]}jdkrgjdks-J ddg|	 
  dg }t|jkrf fd	d
|D }tjt|d D ]}j t||j |  qTqjdkr|	 
 } fdd
|D }jd g| }|jd g }jdkrt|jkrqj ||f qtdW d    d S 1 sw   Y  d S )Nz./simple-examples/data/ptb.z.txtr)   r   r   zInvalid gram lengthr   r   c                       g | ]	}j | qS r   r   getr*   r'   ZUNKr   r   r   r-          z'Imikolov._load_anno.<locals>.<listcomp>r   r	   c                    r>   r   r?   rA   rB   r   r   r-      rC   r   zUnknow data type)datar0   r1   r   r   r2   r   r   r   r"   r#   r9   r8   appendtupleAssertionError)r   r;   filenamer$   r&   iZsrc_seqZtrg_seqr   rB   r   r      s2   


 
"zImikolov._load_annoc                 C   s   t dd | j| D S )Nc                 S   s   g | ]}t |qS r   )nparray)r*   dr   r   r   r-      s    z(Imikolov.__getitem__.<locals>.<listcomp>)rF   rD   )r   idxr   r   r   __getitem__   s   zImikolov.__getitem__c                 C   s
   t | jS N)r9   rD   r,   r   r   r   __len__   s   
zImikolov.__len__)Nr   r   r   r   TrO   )
__name__
__module____qualname____doc__r   r(   r   r   rN   rP   r   r   r   r   r      s    8

'r   )r   r0   numpyrJ   Zpaddle.dataset.commonr   Z	paddle.ior   __all__r   r   r   r   r   r   r   <module>   s   