o
    "j                     @   s   d Z ddlZddlZddlZddlmZ g ZdZdZ	G dd dZ
ddd	ZdddZdd Zeddddde
jfddZeddddde
jfddZeddddddd ZdS )z
imikolov's simple dataset.

This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators.
    N)
deprecatedz<https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgzZ 30177ea32e27c525793142b6bf2c8e2dc                   @   s   e Zd ZdZdZdS )DataType      N)__name__
__module____qualname__NGRAMSEQ r   r   X/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/dataset/imikolov.pyr   #   s    r   c                 C   sb   |d u r	t t}| D ]#}|  D ]
}||  d7  < q|d  d7  < |d  d7  < q|S )Nr   <s><e>)collectionsdefaultdictintstripsplit)f	word_freqlwr   r   r   
word_count(   s   
r   2   c                    s   d}d}t tjjtjjjdtjjjO}|	|}|	|}t
|t
|}d|v r/|d=  fdd| D }t|dd d	}tt| \}}	ttt|tt|}
t||
d< W d
   |
S 1 shw   Y  |
S )z
    Build a word dictionary from the corpus,  Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    $./simple-examples/data/ptb.train.txt$./simple-examples/data/ptb.valid.txtimikolov<unk>c                    s   g | ]
}|d   kr|qS )r   r   ).0xmin_word_freqr   r   
<listcomp>H   s    zbuild_dict.<locals>.<listcomp>c                 S   s   | d  | d fS )Nr   r   r   )r   r   r   r   <lambda>J   s    zbuild_dict.<locals>.<lambda>)keyN)tarfileopenpaddledatasetcommondownloadr   URLMD5extractfiler   itemssortedlistzipdictrangelen)r!   Ztrain_filenameZtest_filenametfZtrainfZtestfr   Zword_freq_sortedwords_word_idxr   r    r   
build_dict5   s,   


r9   c                    s    fdd}|S )Nc                  3   sX   t tjjtjjjdtjjj} | 	}d  |D ]z}t
jkrbdks.J ddg|   dg }t|kra fdd|D }tt|d	 D ]}t|| | V  qSqt
jkr|  } fd
d|D }d g| }|d g }dkrt|krq||fV  qtdW d    d S 1 sw   Y  d S )Nr   r   zInvalid gram lengthr   r   c                       g | ]} | qS r   getr   r   UNKr8   r   r   r"   c       z2reader_creator.<locals>.reader.<locals>.<listcomp>r   c                    r;   r   r<   r>   r?   r   r   r"   h   rA   r   zUnknown data type)r%   r&   r'   r(   r)   r*   r   r+   r,   r-   r   r	   r   r   r4   r3   tupler
   AssertionError)r5   r   r   iZsrc_seqZtrg_seq	data_typefilenamenr8   )r@   r   readerS   s>   


"zreader_creator.<locals>.readerr   )rG   r8   rH   rF   rI   r   rE   r   reader_creatorR   s   rJ   z2.0.0zpaddle.text.datasets.Imikolovr   z>Please use new dataset API which supports paddle.io.DataLoader)ZsinceZ	update_tolevelreasonc                 C      t d| ||S )a  
    imikolov training set creator.

    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
    :type word_idx: dict
    :param n: sliding window size if type is ngram, otherwise max length of sequence
    :type n: int
    :param data_type: data type (ngram or sequence)
    :type data_type: member variable of DataType (NGRAM or SEQ)
    :return: Training reader creator
    :rtype: callable
    r   rJ   r8   rH   rF   r   r   r   traint      rP   c                 C   rM   )a  
    imikolov test set creator.

    It returns a reader creator, each sample in the reader is a word ID
    tuple.

    :param word_idx: word dictionary
    :type word_idx: dict
    :param n: sliding window size if type is ngram, otherwise max length of sequence
    :type n: int
    :param data_type: data type (ngram or sequence)
    :type data_type: member variable of DataType (NGRAM or SEQ)
    :return: Test reader creator
    :rtype: callable
    r   rN   rO   r   r   r   test   rQ   rR   c                   C   s   t jjtdt d S )Nr   )r'   r(   r)   r*   r+   r,   r   r   r   r   fetch   s   rS   )N)r   )__doc__r   r%   Zpaddle.dataset.commonr'   Zpaddle.utilsr   __all__r+   r,   r   r   r9   rJ   r	   rP   rR   rS   r   r   r   r   <module>   sB   

"