o
    *jE                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ ddlmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ d ddZ				d!ddZdd Zdd Zdd Z 				d!ddZ!dd Z"d"ddZ#		d#ddZ$dS )$    N)	FusedAdam)mpu)DynamicLossScalerFP16_ModuleFP16_Optimizer)distributed   )DistributedDataParallel)GLMForMultiTokenClozeGLMForMultiTokenClozeFastGLMForSequenceClassificationGLMForSingleTokenClozeGLMModel)PyTorchDistributedDataParallel),glm_get_params_for_weight_decay_optimization)get_checkpoint_iterationget_checkpoint_nameprint_rank_0c                    s  t |\}}}}t|||}t dkrtdtj | tj	|dd}	 j
r,| j} t| tr4| j} t| tr<| j} t| drD| j}  fdd}
 jrd|	d	 v r{|	d	 d
 } jd |jd kr{|
||  d
 j|	d	 d
< td jd   d|	d	 v r|	d	 d } jd |jd kr|
||  d j|	d	 d< td jd   t|   D ]}t| |  ||  |dddd< q| j|	d	 dd\}}|s|rtd| d|   jr jr| j| j j!j| d S d S d S )Nr   z-global rank {} is loading pretrained model {}cpu)Zmap_locationmodelc                    s4   | j d }| jd ksJ | }| |d |< |S )Nr   r   )shapemax_position_embeddingsclone)Zstate_weightsZmodel_weightsZoriginal_lengthZnew_weightsargs g/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/mglm/train_utils.pyextend_embedding_weights%   s
   
z1load_pretrained.<locals>.extend_embedding_weightsz,transformer.block_position_embeddings.weightmodulez&transformer.position_embeddings.weightr   zExtend position embedding to z#Extend block position embedding to z@mixins.block_position_embedding.block_position_embeddings.weightz"transformer.word_embeddings.weightzword_embeddings.weightF)strictzMissing keys z, unexpected keys )"r   r   r   get_data_parallel_rankprintformattorchr   get_rankload	deepspeedr   
isinstanceTorchDDPr   hasattrr   block_lmr   r   Z
state_dictdatar   listkeyspopreplaceZload_state_dictZcontinuous_promptZprompt_initZprompt_spellZinit_embeddingZword_embeddingsweight)r   Zcheckpoint_pathr   Ztask_tokensZload_dirtagreleasesuccessZcheckpoint_namesdr   Zposition_weightsZblock_position_weightskeyZmissing_keysZunexpected_keysr   r   r   load_pretrained   s   





r6   Tc           	      C   s  t d | jr2|dkrtj| j| j| j| j| jd}n|dkr0t	j| j| j| j| j| j|d}nt
d\}}|dks>|dkrC| jsCd}|durId}|durTt d	|  td+i d
| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd|d| jd| jo| j d|d|d| jd| j}| jr|j| jd |dur|dkr| jr|r| jrt|| j d}n7t!|| j d}n/t"|| j#d}n't$|| j| j%| j&|d}n|dkrt$|| j| j%| j&|d}n	|d krnt
|t'( d!krt)d"*t'+ t,d#d$ |- D d%d& | j.r|/  |0t1j02  | j.r/t3|}| j4sc| j5s;| j6rc| j7d'krSt1j02 }t8||g|t'9 d(}|S | j7d)kr_t:|}|S t d* |S ),zBuild the model.zbuilding GPT2 model ...Zmultiple_choice)	cache_dirfp32_layernormfp32_embeddinglayernorm_epsilonZclassification)r7   r8   r9   r:   
num_labels)TTFNzContinuous spell length 
num_layers
vocab_sizehidden_sizenum_attention_headsZembedding_dropout_probZattention_dropout_probZoutput_dropout_probZmax_sequence_lengthZmax_memory_lengthcheckpoint_activationscheckpoint_num_layersZparallel_outputZrelative_encodingZblock_position_encodingoutput_predictspell_lengthZ
spell_funcattention_scale)tune_prefix_layers)length_penalty)Ztake_softmax)Z	num_classZ
generationr   z5 > number of parameters on model parallel rank {}: {}c                 S   s   g | ]}|  qS r   )Znelement).0pr   r   r   
<listcomp>   s    zget_model.<locals>.<listcomp>T)flushr#   )Z
device_idsZoutput_deviceZprocess_grouplocalzSkip DDP modelr   );r   Zpretrained_bertZBertForMultipleChoiceZfrom_pretrainedZtokenizer_model_typer7   r8   r9   r:   ZBertForSequenceClassificationNotImplementedErrorZ
cloze_evalr   r<   r=   r>   r?   Zhidden_dropoutZattention_dropoutr   Z
mem_lengthr@   rA   Ztransformer_xlr*   Z	masked_lmZprompt_funcrD   Zfreeze_transformerrE   Zfast_decoder   rF   r
   r   Zadapetr   Zoutput_dropoutZ
pool_tokenr   r    r!   r"   Zget_model_parallel_ranksum
parametersfp16Zhalfcudar#   Zcurrent_devicer   r&   train_itersepochsDDP_implr(   get_data_parallel_groupLocalDDP)	r   
model_typemulti_tokenr;   rC   r   rB   Zparalle_outputir   r   r   	get_modelX   s  	

	rY   c                 C   sZ   t | tttfr| j} t | tttfst| }|D ]}|d D ]
}t|ds)d|_qq|S )Nparamsmodel_parallelF)r'   rU   r(   r   r   r   r)   r[   )r   param_groupsZparam_groupparamr   r   r   get_optimizer_param_groups   s   
r^   c                 C   s   |j r|jrtjj}nddlm} |}|| |j|jd}n-|j	dkr3t
| |j|j|j|jf|jd}n|j	dkrHddlm} || |jddd	}nttd
|jj  t|dr]|jr]t|jrqt||j|j|j|j|jdd}|S )zSet up the optimizer.r   )DeepSpeedCPUAdam)lrweight_decayZadam)r`   ra   ZbetasepsZ	adafactor)	AdafactorF)r`   Zrelative_stepZwarmup_initzOptimizer = r&   )Zscale_window	min_scaleZdelayed_shift)Zstatic_loss_scaledynamic_loss_scaleZdynamic_loss_args)Zcpu_optimizerZcpu_torch_adamr#   ZoptimZAdamWZdeepspeed.ops.adamr_   r`   ra   	optimizerAdamZ
adam_beta1Z
adam_beta2Zadam_epsZtransformersrc   rL   r!   	__class____name__r)   r&   rO   r   Z
loss_scalere   Zloss_scale_windowrd   Z
hysteresis)r\   r   Zcpu_adam_optimizerr_   rf   rc   r   r   r   get_optimizer   sN   





rj   c              	   C   sd   |j dur	|j }n|j}|jr||j }td|}d}|j| }t| |j||| |j||j	d}|S )z"Build the learning rate scheduler.Nr   )Zstart_lrwarmup_iter	num_itersZdecay_styleZ	last_iterZdecay_ratio)
Zlr_decay_itersrQ   Zfinetunegradient_accumulation_stepsmaxZwarmupZAnnealingLRr`   Zlr_decay_styleZlr_decay_ratio)rf   r   rm   Z	init_steprl   lr_schedulerr   r   r   get_learning_rate_scheduler  s$   



	rq   c           
      C   s   t | ||||d}t|}| jdus!| jdurB| jdks!| jdkrB| jr7td tj||| t	dd\}}}}nt
|| }t|| }	nd\}}	|||	fS )zSetup model and optimizer.)rV   rW   r;   rC   Nr   zDeepSpeed is enabled.F)r   Zmodel_parametersr   r   Zdist_init_required)NN)rY   r^   Z
train_datadata_dirrR   rQ   r&   r   Z
initializer   rj   rq   )
r   rV   rW   r;   rC   r   r\   rf   _rp   r   r   r   setup_model_and_optimizer.  s.   

rt   c                 C   s   |}|j r|| n|jr| j|dd n|  |j s"|jdkr)|d  n|d  |jd|jd |d  |j s`|jrG| 	  |j
dkr`|jsZt| |j
 |S | |j
 |S )zBackward step.F)update_master_gradsr#   Z	allreduce)Zreduce_afterfp32_allreducer   )r&   backwardrO   rS   resetstartZallreduce_paramsrv   stopru   Z	clip_gradr   Zclip_grad_normrN   Zclip_master_grads)rf   r   lm_lossr   timersZlossr   r   r   backward_stepQ  s,   
r}   Fc                 C   s   |sd S t   t  dkrDt|  tdtj d d tdtj d d tdtj d d tdtj	 d d td d S d S )	Nr   zMemory Allocated i   @Z	GigaByteszMax Memory Allocated zCache Allocated zMax cache Allocated  )
distZbarrierr$   r!   r#   rP   Zmemory_allocatedZmax_memory_allocatedZmemory_cachedZmax_memory_cached)messageforcer   r   r   see_memory_usagez  s,   r   c	                 C   s  d\}	}
|du r
g n|}|j s|  	 d\}}|d  || ||||\}}}|d  |j s7||j }|  d}tj	j
|jt d |j|j|j  |_t|s|	|7 }	|
d7 }
|d  t||||| |d  |d	  |j r| r|  d}|jr|js|  n d}n|  n|
|jkr|  d}|jr|js|  nd}|d	  |rnntd
 ~~g }|rnq|j r|	|
 }	|	||fS )zSingle training step.)g        r   NT)r   Fforwardr   )grouprw   rf   zFound NaN loss, skip backward)r&   Z	zero_gradry   rz   rn   detachr   viewr#   r   Z
all_reducer+   r   rT   Z
world_sizeZmodel_parallel_sizer   Z_has_inf_or_nanr}   Z!is_gradient_accumulation_boundarysteprO   overflowr   )Zdata_iteratorr   rf   rp   r   r|   Zforward_step_funcZmemsZsingle_stepZlm_loss_totalcountZskipped_iterZcompleter{   rs   Zreduced_lossr   r   r   
train_step  sl   









9
r   )N)NTNN)F)NF)%r&   r#   Zapex.optimizersr   rg   Zmegatron_utilr   Zmegatron_util.fp16r   r   r   r   r   r   r	   rU   r
   r   r   r   r   r   r(   r   utilsr   r   r   r6   rY   r^   rj   rq   rt   r}   r   r   r   r   r   r   <module>   s<   
F
z3
#
)