o
    *Îj›4  ã                   @   sø   d Z ddlZddlZddlZddlmZmZmZ ddlZddl	Z
ddlZddlZddlmZ ddlmZ dd„ ZG dd	„ d	eƒZ	
ddd„Z		
			
	ddd„Zdd„ Z										ddd„Zdd„ Zdd„ ZG dd„ dƒZ	
	ddd„ZdS ) z Tasks data utility.é    N)ÚDictÚListÚOptional)Úmpu)Údefault_collatec                 C   s8   |   dd¡} t dd| ¡} tdƒD ]}|   dd¡} q| S )zDRemove new lines and multiple spaces and adjust end of sentence dot.Ú
ú z\s+é   z . z. )ÚreplaceÚreÚsubÚrange)ÚtextÚ_© r   úl/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/mglm/tasks/data_utils.pyÚ
clean_text   s
   r   c                   @   s€   e Zd ZdZ						ddee fdd„Zdd	„ Zd
d„ Zdd„ Z	e
deded  fdd„ƒZe
ded  deddfdd„ƒZdS )ÚInputExamplezIA raw input example consisting of one or two segments of text and a labelNéÿÿÿÿé   Úmetac	           	      C   sB   || _ || _|| _|| _|| _|| _|| _|r|| _dS i | _dS )a˜  Create a new InputExample.

        Args:
            guid: a unique textual identifier
            text_a: the sequence of text
            text_b: an optional, second sequence of text
            label: an optional label
            logits: an optional list of per-class logits
            meta: an optional dictionary to store arbitrary meta information
            idx: an optional numeric index
        N)ÚguidÚtext_aÚtext_bÚlabelÚlogitsÚidxÚnum_choicesr   )	Úselfr   r   r   r   r   r   r   r   r   r   r   Ú__init__*   s   zInputExample.__init__c                 C   s   t |  ¡ ƒS ©N)ÚstrÚto_json_string©r   r   r   r   Ú__repr__G   s   zInputExample.__repr__c                 C   s   t  | j¡}|S )z/Serialize this instance to a Python dictionary.)ÚcopyÚdeepcopyÚ__dict__)r   Úoutputr   r   r   Úto_dictJ   s   zInputExample.to_dictc                 C   s   t j|  ¡ dddd S )z)Serialize this instance to a JSON string.é   T)ÚindentÚ	sort_keysr   )ÚjsonÚdumpsr)   r#   r   r   r   r"   O   s   zInputExample.to_json_stringÚpathÚreturnc                 C   s8   t | dƒ}t |¡W  d  ƒ S 1 sw   Y  dS )z(Load a set of input examples from a fileÚrbN)ÚopenÚpickleÚload)r/   Úfhr   r   r   Úload_examplesS   s   $ÿzInputExample.load_examplesÚexamplesc                 C   s<   t |dƒ}t | |¡ W d  ƒ dS 1 sw   Y  dS )z&Save a set of input examples to a fileÚwbN)r2   r3   Údump)r7   r/   r5   r   r   r   Úsave_examplesY   s   "ÿzInputExample.save_examples)NNNNr   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r$   r)   r"   Ústaticmethodr!   r   r6   r:   r   r   r   r   r   '   s$    ø
ú r   Tc                 C   s@   d}|r|d7 }|r|r|d7 }|r|d7 }|s|r|d7 }|S ©Nr   r   r   )Ú
text_a_idsÚ
text_b_idsÚ
answer_idsÚadd_clsÚadd_sepÚ	add_pieceÚadd_eosZ
num_tokensr   r   r   Únum_special_tokens_to_add`   s   rH   Fc                 C   sˆ  |
d u r
|  d¡j}
|  d¡j}|  d¡j}|  d¡j}g }g }g }|r3| |¡ | d¡ | d¡ t| ƒ}| | ¡ | dg| ¡ | dg| ¡ |d urz|ra| |¡ | d¡ | d¡ t|ƒ}| |¡ | dg| ¡ | dg| ¡ |	r~dnd}t|ƒ|| krž|d }|d|… }|d|… }|d|… }|d u r¤dnd}|	r·| |¡ | |¡ | d¡ t|ƒ}dgt|ƒ }dgt|ƒ }ttt|ƒƒƒ}dgt|ƒ }|sß|d urp|  d¡j}|jsí| |
¡n|j	}| |¡ | |¡ | d¡ | |¡ | d¡ |d urft|ƒ}| |d d… ¡ | |g|d  ¡ | dg|d  ¡ | |g|d  ¡ |j
sJ| td	t|ƒd ƒ¡ n| dgt|ƒd  ¡ | |¡ | dgt|ƒ ¡ n
| d¡ | d¡ |t|ƒ }|dkr³| |g| ¡ | |g| ¡ | dg| ¡ | dg| ¡ | dg| ¡ | dg| ¡ | dg| ¡ |js»||g}|||||||fS )
NÚMASKÚeosZENCÚsepr   r   Úsopr   r*   )Úget_commandÚIdÚappendÚlenÚextendÚlistr   Zsentinel_tokenÚindexZmax_position_embeddingsZno_block_positionZ	masked_lm)rA   rB   rC   Úmax_seq_lengthÚ	tokenizerÚargsrD   rE   rF   rG   Úmask_idÚeos_idZcls_idZsep_idÚidsÚtypesÚpaddingsZ
len_text_aZ
len_text_bZ
eos_lengthZmax_seq_length_m1Zend_typerK   Ú
target_idsÚ
loss_masksÚposition_idsÚblock_position_idsÚsop_idÚmask_positionÚ
len_answerÚpadding_lengthr   r   r   Úbuild_input_from_idss   sž   










þÿþ









rd   c                 C   s0  |  d¡j}|  d¡j}|  d¡j}t| ƒ}g }	|  |¡}
t|ƒ}|g|d d…  }dg| }dg| }|
g| }ttd|d ƒƒ}|}dg| }|t|ƒ }|dkr‹| |g| ¡ | dg| ¡ | dg| ¡ | dg| ¡ | dg| ¡ | dg| ¡ | dg| ¡ ||g}|||||	||fS )NrI   rJ   rL   r   r   r   )rM   rN   rP   rS   rR   r   rQ   )Zenc_idsrC   rT   Zmax_dec_seq_lengthrU   rW   rX   r`   Zenc_lenÚmasksra   rb   rY   rZ   r[   r^   r_   r\   r]   rc   r   r   r   Úbuild_decoder_inputÕ   s2   




rf   c                 C   s2  t j| t jd}|t|ƒdœ}|durt j|t jd}||d< |dur/t j|t jd}||d< |dur?t j|t jd}||d< |durOt j|t jd}||d< |dur_t j|t jd}||d< |durot j|t jd}||d	< |	durt j|	t jd}	|	|d
< |
durt j|
t jd}
|
|d< |dur—||d< |S )zDConvert to numpy and return a sample consumed by the batch producer.©Zdtype)r   r   NrZ   Zpadding_maskÚpositionÚmaskÚtargetÚ
logit_maskZ
segment_idZ
prompt_posÚuid)ÚnpÚarrayÚint64Úint)rY   rZ   r[   Z	positionsre   r   Ú	unique_idrj   rk   Zsegment_idsZ
prompt_idsZids_npÚsampleZtypes_npZpaddings_npZpositions_npZmasks_npZ	target_npZlogit_mask_npr   r   r   Úbuild_sample÷   s:   rs   c                 C   sJ   t  |¡| d< t  |¡| d< t  |¡| d< t  |¡| d< t  |¡| d< | S )NÚdec_textÚdec_positionZdec_maskÚ
dec_targetÚdec_logit_mask)rm   rn   )rr   Zdec_idsru   Z	dec_masksrv   rw   r   r   r   Úbuild_decoder_sample#  s   rx   c                 C   s’  dd„ | D ƒ}dd„ | D ƒ}dd„ }t |d jƒdkr`ttt |ƒƒ}t|ƒ}t|ƒD ]5\}}| ¡ D ]\}}	|dkrB||	|ƒ||< q2|	||< q2tjd	g||  dg|||    tj	d
|d< q*d|d v r²dd„ |D ƒ}| 
|d ¡t |ƒkr²t|ƒ}t|ƒD ]1\}}| ¡ D ]\}}	| d¡r˜||	|ƒ||< qˆtjd	g||  dg|||    tj	d
|d< q€t|ƒ}d| d v rÇdd„ | D ƒ}
|
|d< |S )Nc                 S   s   g | ]}d d„ |  ¡ D ƒ‘qS )c                 S   s   i | ]\}}|d kr||“qS ©rl   r   )Ú.0ÚkeyÚvaluer   r   r   Ú
<dictcomp>.  s    ÿz)my_collate.<locals>.<listcomp>.<dictcomp>)Úitems©rz   rr   r   r   r   Ú
<listcomp>.  s
    þ
ÿzmy_collate.<locals>.<listcomp>c                 S   ó   g | ]}|d  ‘qS )r   r   r   r   r   r   r€   1  ó    c                 S   s6   t | ƒ|k rt | g| dd… g|t | ƒ   ¡} | S r@   )rP   rm   Zconcatenate)ÚdataZ
choice_numr   r   r   Úpad_choice_dim3  s
   ÿz"my_collate.<locals>.pad_choice_dimr   r*   r   r   rg   Z	loss_maskrt   c                 S   s   g | ]}t |d  ƒ‘qS )rt   )rP   r   r   r   r   r€   G  s    Zdec_rl   c                 S   r   ry   r   r   r   r   r   r€   U  r‚   )rP   ÚshaperR   ÚmapÚmaxÚ	enumerater~   rm   rn   ro   ÚcountÚ
startswithr   )ÚbatchZ	new_batchZ	text_listr„   Zchoice_numsZmax_choice_numÚirr   r{   r|   Zuid_listr   r   r   Ú
my_collate-  sJ   þ
þ
€
ÿýr   c                   @   s   e Zd Zdd„ Zdd„ ZdS )ÚFakeDataloaderc                 C   s
   || _ d S r    )Ú	num_iters)r   r   r   r   r   r   \  s   
zFakeDataloader.__init__c                 c   s0    | j d urt| j ƒD ]}d V  qd S 	 d V  qr    )r   r   )r   r   r   r   r   Ú__iter___  s   €
ÿÿzFakeDataloader.__iter__N)r;   r<   r=   r   r   r   r   r   r   rŽ   Z  s    rŽ   c           
   
   C   sX   |rd\}}nt  ¡ }t  ¡ }tjjjj| |||d}tjjj| ||d||dt	d}	|	S )zDData loader. Note that batch-size is the local (per GPU) batch-size.)r   r   )Znum_replicasÚrankÚshuffleFT)Ú
batch_sizeÚsamplerr’   Únum_workersÚ	drop_lastZ
pin_memoryZ
collate_fn)
r   Zget_data_parallel_world_sizeZget_data_parallel_rankÚtorchÚutilsrƒ   ÚdistributedZDistributedSamplerZ
DataLoaderr   )
Zdatasetr“   r•   r–   r’   Z
only_rank0r‘   Z
world_sizer”   Zdata_loaderr   r   r   Úbuild_data_loaderh  s$   	

ÿø
rš   )T)NTFFTN)
NNNNNNNNNN)TF)r>   r%   r3   r   Útypingr   r   r   r-   Únumpyrm   r—   Ztorch.utils.dataZmegatron_utilr   Ztorch.utils.data.dataloaderr   r   Úobjectr   rH   rd   rf   rs   rx   r   rŽ   rš   r   r   r   r   Ú<module>   sP   ?
ú
öb#
ö,
-û