o
    *jn                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ dddZdd	d
ZdddZG dd dZG dd dZG dd dZG dd dejjjZG dd dejjjZG dd dejjjZdS )    N)punctuation_standardization)InputExample)tqdm)print_rank_0Fc              
   C   sR   ddddddddd	d
	}|  dd} |  dd} | D ]
\}}|  ||} q| S )Nz-lrb-z-rrb-z-lsb-z-rsb-z-lcb-z-rcb-z&amp;z&lt;z&gt;)	()[]{}&<>ZUNK[UNK]z<unk>replaceitemsstring	is_targetZ	_tok_dictkeyvalue r   q/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/mglm/tasks/seq2seq/dataset.pygigaword_detokenize   s   r   c                 C   s   ddddddd}|s|  dd	} n|  dd
} | D ]
\}}|  ||} q|  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} | S )Nz-LRB-z-RRB-z-LSB-z-RSB-z-LCB-z-RCB-)r   r   r   r	   r
   r   z<S_SEP> z[SEP]z''"z```'z n'tzn'tz 'sz'sz 'dz'dz 'llz'llr   r   r   r   r   cnndm_detokenize*   s(   r   c                 C   s   |  dd} |  dd} | S )NZ_UNKr   z<blank>[MASK])r   )r   r   r   r   r   blanklm_detokenizeC   s   r!   c                   @      e Zd Zdd Zdd ZdS )SummmaryProcessorc                 C   s   || _ || _|| _d S N)taskdata_dir	tokenizer)selfr%   r&   r'   r   r   r   __init__K   s   
zSummmaryProcessor.__init__c                 C   s0  |dkrd}n|dkrd}n|dkrd}nt |td| j d| d| j  | jdkr0t}n
| jd	kr8t}nd }g g }}ttj	| j| d
dd"}|D ]}|
 }t|}|ra||n|}|| qQW d    n1 ssw   Y  ttj	| j| ddd$}|D ]}|
 }t|}|r||ddn|}|| qW d    n1 sw   Y  t|t|ksJ g }tt||D ]O\}	\}
}|	d d dkrtd|	d  d d||	f }d| j| j|ji}t||
||d}|	dk rt|
d|d|d df || q|S )Ntraindevvaltest	Creating - dataset from gigawordcnn_dm.sourceutf-8encoding.targetT)r       N  r   	Complete 	 examples%s-%srefguidtext_atext_bmeta
   )NotImplementedErrorr   r%   r&   r   r   openospathjoinstripr   appendlen	enumeratezipr'   	DecodeIdsEncodeAsIdstokenizationr   encode)r(   splitfilenamedetokenizersource_textstarget_textsfilelineexample_listidxsource_texttarget_textr?   rB   exampler   r   r   create_examplesP   s   


	
z!SummmaryProcessor.create_examplesN__name__
__module____qualname__r)   r^   r   r   r   r   r#   I   s    r#   c                   @   r"   )SQuADProcessorc                 C      || _ || _d S r$   r&   r'   r(   r&   r'   r   r   r   r)         
zSQuADProcessor.__init__c                 C   s  |dkrd}n|dkrd}n|dkrd}nt |td| d| j  g }d	}ttj| j|d
d}}t|}|D ]m}|d D ]f}|d }	|d D ][}
|
d }dd |
d D }dd |
d D }|D ]@}d||f }|| ||| j	
| j	|jd}t||	|d}|dk rt|	d
|d
|d d
f || |d7 }qfqLqBq<W d    n1 sw   Y  tdt| d|  |S )Nr*   z
train.jsonr+   zdev.jsonr-   z	test.jsonzCreating SQuAD-r0   r   r4   r5   
paragraphscontextZqasquestionc                 S   s   h | ]}|d  qS )textr   .0answerr   r   r   	<setcomp>       z1SQuADProcessor.create_examples.<locals>.<setcomp>answersc                 S   s   i | ]	}|d  |d qS )rk   answer_startr   rl   r   r   r   
<dictcomp>   s    z2SQuADProcessor.create_examples.<locals>.<dictcomp>r<   )rr   rn   rj   r=   )r?   r@   rB   rC   r=   r8   r.   z examples for )rD   r   r&   rE   rF   rG   rH   jsonloadr'   rN   rO   rP   r   rQ   rJ   rK   )r(   rR   rS   rY   rZ   rW   Zdatasetrh   Z	paragraphri   Zqarj   rq   Zanswer_startsrn   r?   rB   r]   r   r   r   r^      sl   



$zSQuADProcessor.create_examplesNr_   r   r   r   r   rc          rc   c                   @   r"   )XSumProcessorc                 C   rd   r$   re   rf   r   r   r   r)      rg   zXSumProcessor.__init__c              	   C   s  |dkrd}n|dkrd}n|dkrd}nt |td| d| j  ttj| jddd	}t|}W d    n1 s@w   Y  || }g g }}t	|D ]\}}ttj| j| d
dd	y}d g }}	d\}
}|D ]2}|
 }|dr|d ur|dkrd|	}
n	|dkrd|	}|dd }g }	qq|r|	| qq|d ur|dkrd|	}
n	|dkrd|	}||
 || |d d dkrtd|d  d W d    n1 sw   Y  qRt|t|ksJ g }t	t||D ]P\}\}
}|d d dkrtd|d  d d||f }d| j| j|ji}t||
||d}|dk rEt|
d|d|d df || q|S )Nr*   r+   Z
validationr-   zCreating XSUM-r0   z(XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.jsonr4   r5   z.summary)NNz[SN]ZRESTBODY zFIRST-SENTENCE   r8   i  r   r:   r;   r9   r<   r=   r>   rC   )rD   r   r&   rE   rF   rG   rH   rt   ru   rL   rI   
startswithrJ   rK   rM   r'   rN   rO   rP   r   rQ   )r(   rR   r   rW   Zid_listrU   rV   irZ   Z	sentencesr[   r\   rX   rY   r?   rB   r]   r   r   r   r^      s   









zXSumProcessor.create_examplesNr_   r   r   r   r   rw      rv   rw   c                   @   $   e Zd Zdd Zdd Zdd ZdS )Seq2SeqDatasetc                 C   s   || _ |j |j| _| _|j|j| _| _|| _|| _	|| _
| jdv r/t| j| j|| _n| jdv r<t| j|| _n| jdv rIt| j|| _nt| j|}|| _dd |D | _tdt| j d| d d S )	N)r1   r2   cnn_dm_original)xsum)squad_generationc                 S   s   i | ]}|j |qS r   )r?   )rm   r]   r   r   r   rs     rp   z+Seq2SeqDataset.__init__.<locals>.<dictcomp>Return rx   r;   )argsr%   lowerr&   src_seq_lengthtgt_seq_lengthmax_src_lengthmax_tgt_lengthrR   r'   dataset_namer#   	processorrw   rc   rD   r^   rY   examplesr   rK   )r(   r   rR   r'   rY   r   r   r   r)     s&   



 zSeq2SeqDataset.__init__c                 C   
   t | jS r$   rK   rY   r(   r   r   r   __len__#     
zSeq2SeqDataset.__len__c                 C   s  | j | }| jdj}| jjrdnd}| j|j}| jdj}| jdj}| jdj}| jdv rk|j|j}	}
| j	d|	 j
}||g| j	d	j
 }t|| jt| krf|d | jt|  }|| }n| jd
kr|j}	|jd |jd }
}| j	|	 d j
}| j	d| j
}t|| jt| d kr| jt| d }| j	d| j
}dd }|||}t|dkrtd| d |d | }nt|d |d  d}||||  }|g| |g | }ntt|| jk r||g| jt|   }t|}ttt|}dgt| }||}| jdkr| j	d|
 j
}||g }t|| jkr>|d | j }dgt| }t|| jk re|dg| jt|  7 }||g| jt|  7 }||g |d d  }dgt| | }dgt| | }||gt| 7 }| jjr|dgt| 7 }n|ttdt|d 7 }||g}tj|tjdtj|tjdtj|tjdtj|tjdtj|tjd|jd}|S ||g }||g }|dg }||g}tj|tjdtj|tjdtj|tjd|jd}|S )NZENCZsMASKMASKpadsopeop)r1   r2   r   r   rx   z	 Content:r   rj   rn   z
 Question:z	 Answer:    c                 S   sL   g }t t| D ]}| | |d kr#| ||t|  |kr#|| q|S )Nr   )rangerK   rJ   )Zmylistpatternmatchesr|   r   r   r   
sub_finderF  s   
z.Seq2SeqDataset.__getitem__.<locals>.sub_finderr   zAnswer z not exists in the source textr*   r8   Zdtyperk   targetattention_mask	loss_maskposition_iduidrk   r   r   r   )rY   r'   get_commandIdr   	task_maskr%   r@   rA   rO   rP   rK   r   rB   rstripprintmaxrD   listr   indexrR   r   Zno_block_positionnparrayint64r?   )r(   rZ   r]   Zcls_id
mask_tokenmask_idpad_idsop_ideop_idr[   r\   source_tokenspromptrn   Zanswer_tokensr   Zanswer_patternr   Zanswer_indicesstart_indexsepposition_idsblock_position_idsmask_posZtarget_tokensr   tokens
target_idssampler   r   r   __getitem__&  s   











zSeq2SeqDataset.__getitem__Nr`   ra   rb   r)   r   r   r   r   r   r   r~     s    r~   c                   @   r}   )ExtractionDatasetc                 C   s  || _ |j |j}}|j|j| _| _|| _|| _	|dkr"d}n|dkr)d}n|dkr0d}nt
|td| d| d|  || _g g }}ttj|| dd	d
}	|	D ]}
|
 }
||
 qZW d    n1 spw   Y  ttj|| dd	d
}	|	D ]}
|
 }
||
 qW d    n1 sw   Y  i g | _| _tt||D ]5\}\}}|d d dkrtd|d  d d||f }d|i}t||||d}|| j|< | j| qtdt| j d| d d S )Nr*   r+   validr-   r.   r/   r0   r3   r4   r5   r7   r8   r9   r   r:   r;   r<   r=   r>   r   rx   )r   r%   r   r&   r   r   r   r   rR   r'   rD   r   r   rE   rF   rG   rH   rI   rJ   r   rY   rL   rM   r   rK   )r(   r   rR   r'   r%   r&   rS   rU   rV   rW   rX   rZ   r[   r\   r?   rB   r]   r   r   r   r)     sb   

 zExtractionDataset.__init__c                 C   r   r$   r   r   r   r   r   r     r   zExtractionDataset.__len__c                    s  | j | }|j|j}}d}| j|j | jdj}| jdj}| jdj}dd }	| j|j}
|d}|	|
| j	|}
t
|
}ttt
|
}dgt
|
 }| jd	kr( fd
dt|
D }t
|t
|ksoJ |
}dgt
|
 }dgt
|
 }t|D ]D\}}|| }| jd| j}||g| 7 }|||g 7 }|dgt
|d  7 }||gt
|d  7 }|dd tt
|d D 7 }q|	|| j	| j |}|	|| j	| j |}|	|| j	| j d}|	|| j	| j d}|	|| j	| j d}||g}tj|tjdtj|tjdtj|tjdtj|tjdtj|tjd|jd}|S |
|g }|
 }||g }|dg }||g}tj|tjdtj|tjdtj|tjd|jd}|S )Nr   r   r   r   c                 S   6   t | |kr| d | } | S | |g|t |    } | S r$   rK   rk   max_lenr   r   r   r   pad_to  
   z-ExtractionDataset.__getitem__.<locals>.pad_to|r   r*   c                       g | ]
\}}| kr|qS r   r   rm   r|   xr   r   r   
<listcomp>      z1ExtractionDataset.__getitem__.<locals>.<listcomp>rx   r8   c                 S      g | ]}|d  qS r8   r   rm   r|   r   r   r   r         r   r   r   )rY   r@   rA   r'   r   r   rO   rP   rR   r   rK   r   r   rL   r   r   r   r   r?   r   )r(   rZ   r]   r[   r\   r   r   r   r   r   r   
masked_tgtr   r   r   mask_positionsr   r   r   r|   r   tgt_text
tgt_tokensr   r   r   r   r     s   








zExtractionDataset.__getitem__Nr   r   r   r   r   r     s    *r   c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
BlankLMDatasetc                 C   s>  || _ |j |j}}|j|j| _| _|| _|j	dksJ || _
|dkr)d}n|dkr0d}n|dkr7d}nt|td| d| d|  || _t}g g }}	ttj|| d	d
d}
|
D ]}| }|ro||n|}|	| qcW d    n1 sw   Y  |dkrttj|d|jddd
d}
|
D ]}| }|r||n|}|| qW d    n1 sw   Y  n|	}i g | _| _tt||	D ]5\}\}}|d d dkrtd|d  d d||f }d|i}t||||d}|| j|< | j| qtdt| j d| d t|j| _d S )NZBertWordPieceTokenizerr*   r+   r   r-   r.   r/   r0   z.txtr4   r5   zblank/test.maskratioz.1fz.blankr8   r9   r   r:   r;   r<   r=   r>   r   rx   )r   r%   r   r&   r   r   r   r   rR   Ztokenizer_typer'   rD   r   r   r!   rE   rF   rG   rH   rI   rJ   blank_maskratior   rY   rL   rM   r   rK   randomRandomseed)r(   r   rR   r'   r%   r&   rS   rT   rU   rV   rW   rX   rZ   r[   r\   r?   rB   r]   r   r   r   r)   
  sx   

zBlankLMDataset.__init__c                 C   r   r$   r   r   r   r   r   r   A  r   zBlankLMDataset.__len__c                    s  | j | }|j|j}}| jjrdnd}| j|j | jdj}| jdj}| jdj}| jdv r>| 	|\}	}
|	}dd }| j
d	| j}||| j|}t|}ttt|}d
gt| }| jdv r1 fddt|D }t|t|
ksJ |}d
gt| }d
gt| }t|D ]D\}}|
| }| j
d	| j}||g| 7 }|||g 7 }|dgt|d  7 }||gt|d  7 }|dd tt|d D 7 }q| jt| j| jj  }||||}||||}|||d
}|||d
}|||d
}||g}tj|tjdtj|tjdtj|tjdtj|tjdtj|tjd|jd}|S ||g }| }||g }|dg }||g}tj|tjdtj|tjdtj|tjd|jd}|S )NZgMASKr   r   r   r   )r*   r+   c                 S   r   r$   r   r   r   r   r   r   P  r   z*BlankLMDataset.__getitem__.<locals>.pad_torx   r   c                    r   r   r   r   r   r   r   r   ^  r   z.BlankLMDataset.__getitem__.<locals>.<listcomp>r8   c                 S   r   r   r   r   r   r   r   r   m  r   r   r   r   )rY   r@   rA   r   r   r'   r   r   rR   	mask_textrO   rP   r   rK   r   r   rL   intr   r   r   r   r?   r   )r(   rZ   r]   r[   r\   r   r   r   r   
masked_srcr   r   r   r   r   r   r   r   r   r   r|   r   r   r   
max_lengthr   r   r   r   r   D  s   







zBlankLMDataset.__getitem__c                 C   s   |  }| jj}t|}t| jt|t|| }dg }}t	|D ]'\}}	|dks6|	||d  d kr;|
d |d  d||	  7  < d||	< q$t	|D ]\}}
|dkre|
dkre||d  dkreqP|d|
 7 }qP||fS )Nr   r   r8   r   rx   r    )rR   r   r   rK   sortedr   r   r   r   rL   rJ   )r(   rk   r   Z
mask_rationindicesr   r   r|   rZ   tokenr   r   r   r     s   


 zBlankLMDataset.mask_textN)r`   ra   rb   r)   r   r   r   r   r   r   r   r     s
    7Jr   )F)rF   r   rt   numpyr   ZtorchZtorch.utils.dataZdata_utils.corporar   Ztasks.data_utilsr   r   utilsr   r   r   r!   r#   rc   rw   dataZDatasetr~   r   r   r   r   r   r   <module>   s(   


@:H |