o
    #j                     @   s`   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 g Z
dZdZG dd de	ZdS )    N)_check_exists_and_download)Datasetz6https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gzZ 7c2ac02c03563afcf9b574c7e56c153ac                   @   sB   e Zd ZdZdddZdd	 Zd
d Zdd Zdd Zdd Z	dS )Imdba  
    Implementation of `IMDB <https://www.imdb.com/interfaces/>`_ dataset.

    Args:
        data_file(str): path to data tar file, can be set None if
            :attr:`download` is True. Default None
        mode(str): 'train' 'test' mode. Default 'train'.
        cutoff(int): cutoff number for building word dictionary. Default 150.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True

    Returns:
        Dataset: instance of IMDB dataset

    Examples:

        .. code-block:: python

            >>> # doctest: +TIMEOUT(75)
            >>> import paddle
            >>> from paddle.text.datasets import Imdb

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, doc, label):
            ...         return paddle.sum(doc), label


            >>> imdb = Imdb(mode='train')

            >>> for i in range(10):
            ...     doc, label = imdb[i]
            ...     doc = paddle.to_tensor(doc)
            ...     label = paddle.to_tensor(label)
            ...
            ...     model = SimpleNet()
            ...     image, label = model(doc, label)
            ...     print(doc.shape, label.shape)
            [121] [1]
            [115] [1]
            [386] [1]
            [471] [1]
            [585] [1]
            [206] [1]
            [221] [1]
            [324] [1]
            [166] [1]
            [598] [1]

    Ntrain   Tc                 C   sj   |  dv sJ d| |  | _|| _| jd u r)|s J dt|ttd|| _| || _|   d S )N)r   testz(mode should be 'train', 'test', but got z>data_file is not set and downloading automatically is disabledZimdb)	lowermode	data_filer   URLMD5_build_work_dictword_idx
_load_anno)selfr
   r	   cutoffdownload r   Z/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/text/datasets/imdb.py__init__U   s   


zImdb.__init__c           
         s   t t}td}| |D ]}|D ]
}||  d7  < qq fdd| D }t|dd d}tt	| \}}t
tt	|tt|}	t||	d< |	S )	Nz/aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$   c                    s   g | ]
}|d   kr|qS )r   r   ).0xr   r   r   
<listcomp>s   s    z)Imdb._build_work_dict.<locals>.<listcomp>c                 S   s   | d  | d fS )Nr   r   r   )r   r   r   r   <lambda>u   s    z'Imdb._build_work_dict.<locals>.<lambda>)key<unk>)collectionsdefaultdictintrecompile	_tokenizeitemssortedlistzipdictrangelen)
r   r   Z	word_freqpatterndocword
dictionarywords_r   r   r   r   r   k   s   

zImdb._build_work_dictc              	   C   s   g }t | jB}| }|d urBt||jr2|||	 
dd tjd   | }|d usW d    |S W d    |S 1 sMw   Y  |S )Ns   
zlatin-1)tarfileopenr
   nextboolmatchnameappendextractfilereadrstrip	translatestringpunctuationencoder   split)r   r+   dataZtarftfr   r   r   r#   {   s,   


zImdb._tokenizec                    s   t dj d}t dj d}jd  g _g _|D ]}j fdd|D  jd q$|D ]}j fdd|D  jd	 q@d S )
NzaclImdb/z/pos/.*\.txt$z/neg/.*\.txt$r   c                       g | ]	}j | qS r   r   getr   wZUNKr   r   r   r          z#Imdb._load_anno.<locals>.<listcomp>r   c                    rB   r   rC   rE   rG   r   r   r      rH   r   )r!   r"   r	   r   docslabelsr#   r7   )r   Zpos_patternZneg_patternr,   r   rG   r   r      s   
zImdb._load_annoc                 C   s"   t | j| t | j| gfS N)nparrayrI   rJ   )r   idxr   r   r   __getitem__   s   "zImdb.__getitem__c                 C   s
   t | jS rK   )r*   rI   )r   r   r   r   __len__   s   
zImdb.__len__)Nr   r   T)
__name__
__module____qualname____doc__r   r   r#   r   rO   rP   r   r   r   r   r      s    
5r   )r   r!   r<   r1   numpyrL   Zpaddle.dataset.commonr   Z	paddle.ior   __all__r   r   r   r   r   r   r   <module>   s   