o
    #j2                     @   st   d dl Z d dlZd dlZd dlmZ d dlmZ g ZdZ	dZ
dZdZdZd	Zd
ZdZdZdZd ZG dd deZdS )    N)_check_exists_and_download)DatasetzBhttp://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gzZ 387719152ae52d60422c016e92a742fcz:http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txtZ ea7fb7d4c75cc6254716f0177a506baaz:http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txtZ 0d2977293bbb6cbefab5b0f97db1e77cz<http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txtZ d8c7f03ceb5fc2e5a0fa7503a4353751z1http://paddlemodels.bj.bcebos.com/conll05st%2FembZ bf436eb0faa1f6f9103017f8be57cdb7c                   @   s^   e Zd ZdZ						dddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd ZdS )	Conll05sta{	  
    Implementation of `Conll05st <https://www.cs.upc.edu/~srlconll/soft.html>`_
    test dataset.

    Note: only support download test dataset automatically for that
          only test dataset of Conll05st is public.

    Args:
        data_file(str): path to data tar file, can be set None if
            :attr:`download` is True. Default None
        word_dict_file(str): path to word dictionary file, can be set None if
            :attr:`download` is True. Default None
        verb_dict_file(str): path to verb dictionary file, can be set None if
            :attr:`download` is True. Default None
        target_dict_file(str): path to target dictionary file, can be set None if
            :attr:`download` is True. Default None
        emb_file(str): path to embedding dictionary file, only used for
            :code:`get_embedding` can be set None if :attr:`download` is
            True. Default None
        download(bool): whether to download dataset automatically if
            :attr:`data_file` :attr:`word_dict_file` :attr:`verb_dict_file`
            :attr:`target_dict_file` is not set. Default True

    Returns:
        Dataset: instance of conll05st dataset

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.text.datasets import Conll05st

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, pred_idx, mark, label):
            ...         return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label)


            >>> conll05st = Conll05st()

            >>> for i in range(10):
            ...     pred_idx, mark, label= conll05st[i][-3:]
            ...     pred_idx = paddle.to_tensor(pred_idx)
            ...     mark = paddle.to_tensor(mark)
            ...     label = paddle.to_tensor(label)
            ...
            ...     model = SimpleNet()
            ...     pred_idx, mark, label= model(pred_idx, mark, label)
            ...     print(pred_idx.item(), mark.item(), label.item())
            >>> # doctest: +SKIP('label will change')
            65840 5 1991
            92560 5 3686
            99120 5 457
            121960 5 3945
            4774 5 2378
            14973 5 1938
            36921 5 1090
            26908 5 2329
            62965 5 2968
            97755 5 2674

    NTc                 C   s  || _ | j d u r|sJ dt|ttd|| _ || _| jd u r.|s%J dt|ttd|| _|| _| jd u rE|s<J dt|tt	d|| _|| _
| j
d u r\|sSJ dt|ttd|| _
|| _| jd u rs|sjJ dt|ttd|| _| | j| _| | j| _| | j
| _|   d S )Nz>data_file is not set and downloading automatically is disabledZ	conll05stzCword_dict_file is not set and downloading automatically is disabledzCverb_dict_file is not set and downloading automatically is disabledzEtarget_dict_file is not set and downloading automatically is disabledz=emb_file is not set and downloading automatically is disabled)	data_filer   DATA_URLDATA_MD5word_dict_fileWORDDICT_URLWORDDICT_MD5verb_dict_fileVERBDICT_URLVERBDICT_MD5target_dict_fileTRGDICT_URLTRGDICT_MD5emb_fileEMB_URLEMB_MD5
_load_dict	word_dictpredicate_dict_load_label_dict
label_dict
_load_anno)selfr   r   r   r   r   download r   ]/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/text/datasets/conll05.py__init__j   sz   	






zConll05st.__init__c           	      C   s   i }t  }t|dR}t|D ]%\}}| }|dr&||dd   q|dr4||dd   qd}|D ]}||d| < |d7 }||d| < |d7 }q9||d< W d    |S 1 s_w   Y  |S )NrB-   I-r      O)setopen	enumeratestrip
startswithadd)	r   filenamedZtag_dictfilineindextagr   r   r   r      s*   




zConll05st._load_label_dictc                 C   sR   i }t |d}t|D ]
\}}||| < qW d    |S 1 s"w   Y  |S )Nr   )r&   r'   r(   )r   r+   r,   r-   r.   r/   r   r   r   r      s   
zConll05st._load_dictc              
      s  t | j}|d}|d}g | _g | _g | _tj|d'}tj|d}g }g }g }t	||D ]\}	}
|	
  }	|

   }
t|
dkr%tt|d D ]  fdd|D }|| qUt|dkrg }|d D ]}|dkr~|| qst|dd  D ]\ }d	}d
}g }d}|D ]s}|dkr|s|d	 q|dkr|r|d|  q|dkr|d|  d
}q|ddkr|ddkr|d|d }|d|  d
}q|ddkr|ddkr|d|d }|d|  d}qtd| | j| | j|   | j| qg }g }g }q4||	 ||
 q4W d    n	1 s;w   Y  W d    n	1 sKw   Y  |  |  |  d S )Nz2conll05st-release/test.wsj/words/test.wsj.words.gzz2conll05st-release/test.wsj/props/test.wsj.props.gz)fileobjr   c                    s   g | ]}|  qS r   r   ).0xr.   r   r   
<listcomp>   s    z(Conll05st._load_anno.<locals>.<listcomp>r#   -r$   F *r"   z*)()r    TzUnexpected label: %s)tarfiler&   r   extractfile	sentences
predicateslabelsgzipGzipFilezipr(   decodesplitlenrangeappendr'   findRuntimeErrorclose)r   tfZwfpfZ
words_fileZ
props_filer?   rA   Zone_segwordlabelZa_kind_lableZ	verb_listr4   ZlblZcur_tagZis_in_bracketZlbl_seqZ	verb_wordlr   r5   r   r      s   
 
 :zConll05st._load_annoc                    s   j | } j| } j| }t|}|d}dgt| }|dkr0d||d < ||d  }nd}|dkrCd||d < ||d  }	nd}	d||< || }
|t|d k rbd||d < ||d  }nd}|t|d k ryd||d < ||d  }nd} fdd|D } j|	tg| } j|tg| } j|
tg| } j|tg| } j|tg| } j|g| } fd	d|D }t	
|t	
|t	
|t	
|t	
|t	
|t	
|t	
|t	
|f	S )
NzB-Vr   r#   Zbosr!   Zeosc                    s   g | ]	} j |tqS r   )r   getUNK_IDXr3   wr   r   r   r6   :  s    z)Conll05st.__getitem__.<locals>.<listcomp>c                    s   g | ]} j |qS r   )r   rR   rT   rV   r   r   r6   C  s    )r?   r@   rA   rG   r0   r   rR   rS   r   nparray)r   idxZsentence	predicaterA   Zsen_lenZ
verb_indexmarkZctx_n1Zctx_n2Zctx_0Zctx_p1Zctx_p2Zword_idxZ
ctx_n2_idxZ
ctx_n1_idxZ	ctx_0_idxZ
ctx_p1_idxZ
ctx_p2_idxZpred_idxZ	label_idxr   rV   r   __getitem__  sT   



zConll05st.__getitem__c                 C   s
   t | jS )N)rG   r?   rV   r   r   r   __len__Q  s   
zConll05st.__len__c                 C   s   | j | j| jfS )aD  
        Get the word, verb and label dictionary of Wikipedia corpus.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import Conll05st

                >>> conll05st = Conll05st()
                >>> word_dict, predicate_dict, label_dict = conll05st.get_dict()

        )r   r   r   rV   r   r   r   get_dictT  s   zConll05st.get_dictc                 C   s   | j S )a  
        Get the embedding dictionary file.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import Conll05st

                >>> conll05st = Conll05st()
                >>> emb_file = conll05st.get_embedding()

        )r   rV   r   r   r   get_embeddingd  s   zConll05st.get_embedding)NNNNNT)__name__
__module____qualname____doc__r   r   r   r   r\   r]   r^   r_   r   r   r   r   r   '   s     D
II;r   )rB   r=   numpyrW   Zpaddle.dataset.commonr   Z	paddle.ior   __all__r   r   r	   r
   r   r   r   r   r   r   rS   r   r   r   r   r   <module>   s$   