o
    #j                     @   st   d dl Z d dlZd dlZd dlmZ d dlmZ g Zg dZ	dZ
dZG dd dZG d	d
 d
ZG dd deZdS )    N)_check_exists_and_download)Dataset)         #   -   2   8   z3https://dataset.bj.bcebos.com/movielens%2Fml-1m.zipZ c4d9eecfca2ab87c1945afe126590906c                   @   0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )	MovieInfozM
    Movie id, title and categories information are stored in MovieInfo.
    c                 C   s   t || _|| _|| _d S N)intindex
categoriestitle)selfr   r   r    r   _/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/text/datasets/movielens.py__init__$   s   

zMovieInfo.__init__c                    s2   | j g fdd| jD fdd| j D gS )z/
        Get information from a movie.
        c                    s   g | ]} | qS r   r   ).0c)categories_dictr   r   
<listcomp>/   s    z#MovieInfo.value.<locals>.<listcomp>c                    s   g | ]} |   qS r   )lower)r   w)movie_title_dictr   r   r   0   s    )r   r   r   split)r   r   r   r   )r   r   r   value)   s   zMovieInfo.valuec                 C   s   d| j | j| jf S )Nz-<MovieInfo id(%d), title(%s), categories(%s)>)r   r   r   r   r   r   r   __str__3   s
   zMovieInfo.__str__c                 C   s   |   S r   )r    r   r   r   r   __repr__:      zMovieInfo.__repr__N__name__
__module____qualname____doc__r   r   r    r!   r   r   r   r   r      s    
r   c                   @   r   )UserInfozK
    User id, gender, age, and job information are stored in UserInfo.
    c                 C   s2   t || _|dk| _tt || _t || _d S )NM)r   r   is_male	age_tableagejob_id)r   r   genderr,   r-   r   r   r   r   C   s   

zUserInfo.__init__c                 C   s$   | j g| jrdndg| jg| jggS )z.
        Get information from a user.
        r   r   )r   r*   r,   r-   r   r   r   r   r   I   s
   zUserInfo.valuec                 C   s$   d| j | jrdndt| j | jf S )Nz/<UserInfo id(%d), gender(%s), age(%d), job(%d)>r)   F)r   r*   r+   r,   r-   r   r   r   r   r    T   s   zUserInfo.__str__c                 C   s   t | S r   )strr   r   r   r   r!   \   r"   zUserInfo.__repr__Nr#   r   r   r   r   r(   >   s    r(   c                   @   sD   e Zd ZdZ					dddZd	d
 Zdd Zdd Zdd ZdS )	Movielensa  
    Implementation of `Movielens 1-M <https://grouplens.org/datasets/movielens/1m/>`_ dataset.

    Args:
        data_file(str): path to data tar file, can be set None if
            :attr:`download` is True. Default None
        mode(str): 'train' or 'test' mode. Default 'train'.
        test_ratio(float): split ratio for test sample. Default 0.1.
        rand_seed(int): random seed. Default 0.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True

    Returns:
        Dataset: instance of Movielens 1-M dataset

    Examples:

        .. code-block:: python

            >>> # doctest: +TIMEOUT(75)
            >>> import paddle
            >>> from paddle.text.datasets import Movielens

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, category, title, rating):
            ...         return paddle.sum(category), paddle.sum(title), paddle.sum(rating)


            >>> movielens = Movielens(mode='train')

            >>> for i in range(10):
            ...     category, title, rating = movielens[i][-3:]
            ...     category = paddle.to_tensor(category)
            ...     title = paddle.to_tensor(title)
            ...     rating = paddle.to_tensor(rating)
            ...
            ...     model = SimpleNet()
            ...     category, title, rating = model(category, title, rating)
            ...     print(category.shape, title.shape, rating.shape)
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []
            [] [] []

    Ntrain皙?r   Tc                 C   s~   |  dv sJ d| |  | _|| _| jd u r)|s J dt|ttd|| _|| _|| _tj	
| |   |   d S )N)r2   testz(mode should be 'train', 'test', but got z>data_file is not set and downloading automatically is disabledZ	sentiment)r   mode	data_filer   URLMD5
test_ratio	rand_seednprandomseed_load_meta_info
_load_data)r   r6   r5   r9   r:   downloadr   r   r   r      s$   


zMovielens.__init__c              
   C   s  t d}i | _i | _i | _i | _t| j}|	 D ]}t
|tjs&J t }t }|dR}t|D ]E\}}|jdd}| d\}	}
}|d}|D ]}|| qQ||
d}
t|	||
d| jt|	< |
 D ]	}||  qqq6W d    n1 sw   Y  t|D ]	\}}|| j|< qt|D ]	\}}|| j|< q|d	,}|D ]!}|jdd}| d\}}}}}t||||d
| jt|< qW d    n1 sw   Y  qW d    d S 1 sw   Y  d S )Nz^(.*)\((\d+)\)$zml-1m/movies.datlatinencoding::|r   )r   r   r   zml-1m/users.dat)r   r.   r,   r-   )recompile
movie_infor   r   	user_infozipfileZipFiler6   infolist
isinstanceZipInfosetopen	enumeratedecodestripr   addmatchgroupr   r   r   r(   )r   patternpackageinfoZtitle_word_setZcategories_setZ
movie_fileilineZmovie_idr   r   r   r   Z	user_fileuidr.   r,   Zjob_r   r   r   r>      sT   

"zMovielens._load_meta_infoc           
   
   C   s  g | _ | jdk}t| jt}|dV}|D ]K}|jdd}tj | j	k |krb|
 d\}}}}t|}t|}t|d d }| j| }| j| }	| j |	 || j| j |gg  qW d    n1 smw   Y  W d    d S W d    d S 1 sw   Y  d S )Nr4   zml-1m/ratings.datrA   rB   rD      g      @)datar5   rJ   rK   r6   rP   rR   r;   r<   r9   rS   r   r   floatrH   rI   appendr   r   r   )
r   Zis_testrX   Zratingr[   r\   Zmov_idr]   Zmovusrr   r   r   r?      s8   


"zMovielens._load_datac                 C   s   | j | }tdd |D S )Nc                 S   s   g | ]}t |qS r   )r;   array)r   dr   r   r   r      s    z)Movielens.__getitem__.<locals>.<listcomp>)r_   tuple)r   idxr_   r   r   r   __getitem__   s   
zMovielens.__getitem__c                 C   s
   t | jS r   )lenr_   r   r   r   r   __len__   s   
zMovielens.__len__)Nr2   r3   r   T)	r$   r%   r&   r'   r   r>   r?   rg   ri   r   r   r   r   r1   `   s    9
'r1   )rF   rJ   numpyr;   Zpaddle.dataset.commonr   Z	paddle.ior   __all__r+   r7   r8   r   r(   r1   r   r   r   r   <module>   s   "