o
    *jKF                     @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddlZ	ddl
Z
ddlZ
ddlmZmZ ddlmZ ddlmZ dd	lmZ G d
d de
jjjZG dd dZdd Z		dddZdd Zdd Zdd Zdd Zdd Z dS )z&parses arguments and preps data loader    N)bisect_right)
accumulate)mpuprint_rank_0   )
data_utils)ConstructBlockStrategy)make_tokenizerc                       s@   e Zd Z			d fdd	Zdd Zedd	 Zd
d Z  ZS )MultiTaskDatasetT皙?@ c                    s   t t|   || _|| _|| _| _dd |D | _t	 fdd| jD | _
t| j| _tt| j| _| jrIttt| j| j| j
 nttt| j| j |  j
| j
   _
d S )Nc                 S      g | ]}t |qS  )len).0datasetr   r   j/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/mglm/configure_data.py
<listcomp>-       z-MultiTaskDataset.__init__.<locals>.<listcomp>c                    s   g | ]	}t |  qS r   )min)r   length	max_limittemperaturer   r   r   /   s    )superr
   __init__tasksdatasetsreweightr   Zlensnparrayweightssum	total_lenlistr   cumulative_lensr   zip)selfr   r   r   r   r   	__class__r   r   r   "   s   zMultiTaskDataset.__init__c                 C   s
   | j d S )Ni  )r#   )r'   r   r   r   __len__8   s   
zMultiTaskDataset.__len__c                 C   s   | d }| d }| d }| d }| d }| d }t |jdkr4|| }|| }|| }|| }|| }n|| }|jsB|t |}|||||dS )	NtextZ
logit_masktargetmaskpositionlabel   )r+   r,   	loss_maskposition_idattention_mask)r   shaperepeat)datar+   r1   r,   r3   r2   r/   r   r   r   pet_wrapper;   s*   
zMultiTaskDataset.pet_wrapperc                    s   | j r=t| tjj fddtdD d  jtt| j	| j
d}| j	| } tt|}| j	| | }nt| j|}|dkrJ|}n	|| j|d   }| j	| | }| |}|S )Nc                    s   g | ]}  d dqS )r   l    )randint)r   _rngr   r   r   Y   s    z0MultiTaskDataset.__getitem__.<locals>.<listcomp>   )seed)pr   r   )r   randomRandomr   ZRandomStaterangechoiceZaranger   r   r!   r   r%   r7   )r'   idxZdataset_idxr   Z
sample_idxitemr   r:   r   __getitem__U   s$   


zMultiTaskDataset.__getitem__)Tr   r   )	__name__
__module____qualname__r   r*   staticmethodr7   rE   __classcell__r   r   r(   r   r
       s    
r
   c                       s6   e Zd Zd
 fdd	Zdd Zdd Zdd	 Z  ZS )
DataConfigNc                    s$   t t|   |d u ri }|| _d S N)r   rK   r   defaults)r'   rM   r(   r   r   r   l   s   
zDataConfig.__init__c                 C   s*   t j dkrtd | | t||S )Nr   zconfiguring data)torchdistributedget_rankprintapply_defaultsmake_loaders)r'   args	tokenizerr   r   r   applyr   s   

zDataConfig.applyc                 K   s    |  D ]	\}}|| j|< qd S rL   )itemsrM   )r'   kwargskvr   r   r   set_defaultsx   s   zDataConfig.set_defaultsc                 C   s:   | j  D ]\}}|dd}t||st||| qd S )N-r9   )rM   rW   replacehasattrsetattr)r'   rT   rY   rZ   r   r   r   rR   |   s   
zDataConfig.apply_defaultsrL   )rF   rG   rH   r   rV   r[   rR   rJ   r   r   r(   r   rK   j   s
    rK   c           	      C   s4  d}| j r| j}t| jd | j| j| j| j| j|| j	| j
dkp"| jdkd
}t dkrp|j}|dj}||djks>J |}|}| j}|| dkrU|d7 }|| dksKtd||| | td| tj||g}ntjddg}tjj|t t d	 |d  }|d  }||| _| _|S )
Nr           )Zadd_block_symbols	cache_diradd_sentinel_tokenZadd_task_maskZadd_decoder_maskZeospadr   z=> padded vocab (size: {}) with {} dummy tokens (new size: {})z!> found end-of-document token: {}group)sentinel_tokenZmax_position_embeddingsr	   tokenizer_typetokenizer_path
vocab_sizetokenizer_model_typeblock_lmra   	task_maskblock_mask_probcontext_mask_ratior   get_model_parallel_rank
num_tokensZget_commandZIdZmake_vocab_size_divisible_byr   formatrN   cudaZ
LongTensorrO   	broadcastZget_model_parallel_src_rankZget_model_parallel_grouprD   	eod_token)	rT   rb   rU   rp   rt   beforeafterZmultipleZtoken_countsr   r   r   prepare_tokenizer   sV   

rw   Fc                 C   s  t jjt d}t jjt d}|jd ur&||j }||j }||j }|dk}	|jr:tj	
t| ||||}
n4|rLtj	j| d||j |j d}nt jj| }|	}|	retj	j||||||jd}
n	t jj|||}
d }|rt|||jfi d|jd|jd|jd	|jd
|jd|jd|jd|jd|jd|jd|j  d|j! d|j"d|j#d|j$d|j%d|j&j'}t jjj(| |
|j)d|d}|S )Nrd   r   T)replacementZnum_samples)gradient_accumulation_steps	bert_probgap_sentence_probgap_sentence_ratiogpt_infill_probZaverage_block_lengthgpt_min_ratiorm   rn   short_seq_probsingle_span_probZshuffle_blocksZblock_position_encodingrf   encoder_decoderrl   random_position	masked_lm)batch_samplernum_workersZ
pin_memory
collate_fn)*rN   rO   get_world_sizer   get_data_parallel_grouprP   loader_scatterZtransformer_xlr   ZsamplersZDistributedSequentialSamplerr   ZRandomSamplertrain_itersry   utilsr6   ZSequentialSamplerZDistributedBatchSamplerZBatchSamplerr   
seq_lengthrz   r{   r|   r}   Zavg_block_lengthr~   rm   rn   r   r   Zno_shuffle_blockZno_block_positionrf   r   rl   r   r   Zconstruct_blocksZ
DataLoaderr   )r   rU   
batch_sizeZ	num_itersrT   shuffleblock_collate
world_sizerankrO   r   ZsamplerZ	drop_lastr   Zdata_loaderr   r   r   make_data_loader   s   



	
r   c                 C   s   ddl }| j| j| jdt| jd| j| j d | jdkd}|jj	| j
fi |}d|d< | jdur7| j|d< | jdurA| j|d	< d}| jdurS|jj	| jfi |}d}| jdure|jj	| jfi |}|j| j|| j| j| j| jd
}|||f|fS )z3Load train/val/test dataset from shuffled TFRecordsr   NTr   )r   max_seq_lenmax_preds_per_seqtrainr   r=   Zthreaded_dlFr   r   r   )ra   )Zdata_utils.tf_dlr   r   r   maxr   r=   r   Ztf_dlZTFRecordDataLoader
train_dataeval_seq_lengtheval_max_preds_per_seq
valid_data	test_datar	   rg   rh   ri   rj   ra   )rT   r   data_set_argsr   validtestrU   r   r   r   make_tfrecord_loaders   sH   
	





r   c              	   C   s  | j rt| S tjjt d}| jdur|| j dksJ | j| }|}| j	dur/| j	| }| j
}|dk r:|| }| j}|durI|dk rI|| }t| }i d| jd|d| jd| jd| jd	d
d| jd|d| jd| jd| jd| jd| jd| j d|d| jd| j| j| j| jt | j| jd}t|}	dg|	d< |r||	d< | j r| j |	d< | j!dur| j!|	d< d\}
}}| jdurt"j#di |}
t"$|r|
\}
}}||	d< |du r| j%dur| j%|	d< t"j#di |	}||	d< |du r| j&dur| j&|	d< t"j#di |	}| j'p| j(}|
dur7| jdkr7t)|
||| j*| | j+|d}
d| _,nd| _,|dkrA|n|}|durYt)|||| j*| | j+|d}d| _-nd| _-|durwt)|||t.|| d | | j+|d}d| _/nd| _/|
||fS )zmakes training/val/testrd   Nr   pathr   
mem_lengthdelimtext_keyZ	label_keyr/   Zds_typesplitlooser   presplit_sentencessample_one_documentfilter_englishZpre_tokenizerU   save_splitsload_splits)save_test_datano_lazy_loaderr   Zdata_parallel_ranknon_sentence_starthalf_lazy_loader      ?)NNN)r   r   TFr   r   )0Zuse_tfrecordsr   rN   rO   r   r   r   r   r   eval_batch_sizer   r   	get_splitr   r   r   r   data_set_typeZ
loose_jsonr   r   r   r   Zno_pre_tokenizer   r   r   r   Zget_data_parallel_rankr   r   copyr   Zeval_text_keyr   Zmake_datasetZshould_splitr   r   rk   r   r   r   r   Zdo_trainZdo_validr   Zdo_test)rT   rU   r   r   r   r   r   r   r   Zeval_set_argsr   r   r   Z	use_blockr   r   r   rS     s   



	














rS   c                 C   s.  ddddddddd	d
ddd}d\}}t  dkr| j}| jd ur$| j}g g }}| jD ],}| }tj| j	|| }	|
t| ||	|d|dd |
t| ||	|d|dd q,t| j|}t| j|}tjjt  d}
| j|
 }| jd ur}| j|
 }t|||| j| dd}t|||| j| dd}||fS )NZMNLIZCoLAZMRPCZQNLIZQQPzSST-2ZAgnewsZyelp_review_polarity_csvZyelp_review_full_csvZYahooZSQuADZRACE)ZmnliZcolaZmrpcZqnliZqqpZsst2Zagnewszyelp-polarityz	yelp-fullZyahooZsquadZrace)NNr   r   T)Zpattern_ensembledevrd   )r   )r   ro   r   multi_seq_lengthZmulti_task_datalowerosr   joindata_dirappendZSuperGlueDatasetr
   rN   rO   r   r   r   multi_batch_sizer   r   )rT   rU   Z	task_dirsr   r   r   Ztrain_datasetsZvalid_datasetstaskr   r   r   r   r   r   build_multi_task_dataset  s   


		


r   c                    s   g }| j ddkrdd | j  dD }n| j ddkr*dd | j  dD }nt| j g}t|}|dk r?|d|  t|d	k rP|d
 t|d	k sE|dd	 }| jdur_d
|d< | jdurhd
|d< t|  fdd|D S )z=
    Get dataset splits from comma separated string list
    ,c                 S   r   r   floatr   sr   r   r   r     r   zget_split.<locals>.<listcomp>/c                 S   r   r   r   r   r   r   r   r     r   r   r      r`   Nr0   c                    s   g | ]}|  qS r   r   r   Z	final_sumr   r   r     r   )r   findr   r"   r   r   r   r   )rT   ZsplitsZsplit_totalr   r   r   r     s&   


r   c               
   C   s"   dddddddddd	} t | d	S )
z*add cmdline flags for configuring datasetsr   r   r   FZ
supervised   d   )	r   r   Zpersist_stateZlazyZ	transposer   r   r   Zsamples_per_shardrM   )rK   r   r   r   r   configure_data  s   
r   )FF)!__doc__r   r   r?   bisectr   	itertoolsr   numpyr   rN   Ztorch.utils.dataZmegatron_utilr   r    r   Zblocklm_utilsr   Zdata_utils.tokenizationr	   r   r6   ZDatasetr
   rK   rw   r   r   rS   r   r   r   r   r   r   r   <module>   s0   J0
I'wB