o
    *jv                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ G dd deZ e
j!ej"dG dd deZ"dS )    N)mpu)Hooks)EpochBasedTrainer)HOOKS)BestCkptSaverHookCheckpointHookCheckpointProcessor)LoadCheckpointHook)Hook)load_checkpointsave_checkpoint)DistributedParallelType)create_device)
get_logger)is_megatron_initialized)get_local_rankc                   @   sN   e Zd ZdZdd Zdd Zdd Zdd	 Z	
	dddZdd Z	dd Z
d
S )MpuProcessormodelc              	   C   sD   zt  }|dkrW dS t  }d|W S  ttfy!   Y dS w )N    z_mp_rank_{:02d})r   Z$get_tensor_model_parallel_world_sizeget_tensor_model_parallel_rankformatImportErrorAssertionError)selfZtp_world_sizemp_rank r   t/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/trainers/hooks/distributed/megatron_hook.py	rank_name   s   zMpuProcessor.rank_namec                 C   s   t  }d|}d| dS )Nz{:02d}Zmp_rank_z_model_states.pt)r   r   r   )r   r   rankr   r   r   get_bin_filename&   s   
zMpuProcessor.get_bin_filenamec                 C   s   t j  pt dkS Nr   )torchdistributedZis_initializedr   Zget_data_parallel_rankr   trainerr   r   r   should_save_on_rank+   s   
z MpuProcessor.should_save_on_rankc                 C   s6   |j }t|||| j tjtj|| jdd d S )NT)exist_ok)cfgr   Zcopy_files_and_dump_config_BIN_FILE_DIRosmakedirspathjoin)r   r%   
output_dirconfigr   r   r   prepare_output0   s   
zMpuProcessor.prepare_outputNTc                 C   s  | |j}||   tj }t|||r|jnd |r|jnd |dd tj	
|}tj	|}	|  }
tj	||	d |
 }t||dd |}tj	|| j|
}tj	|r[t| z	t|| W d S  ty } zt d| d| d| d t|| W Y d }~d S d }~ww )	NF)metaZ
with_model_)Z	with_metazLink z to z error: z@, changing to copy the bin file, this may case more space usage.)unwrap_moduler   r   r   TRAINER_STATE_SUFFIXr   Z	optimizerZlr_schedulerr*   r,   dirnamebasenamer    r-   r)   isfileunlinklinkOSErrorr   errorshutilcopyfile)r   r%   checkpoint_path_prefixr.   r1   Zsave_optimizersr   _train_state_filesave_dirprefixbin_fileZprefix_bin_fileZsrc_file	dest_fileer   r   r   save_checkpoints8   s<   

zMpuProcessor.save_checkpointsc                 C   s|   ||    tj }tj|rt| tj|}tj|}| 	 }tj
||d | }tj|r<t| d S d S Nr2   )r   r   r4   r*   r,   r7   remover5   r6   r    r-   )r   r%   r>   r?   r@   rA   rB   Zabsolute_filer   r   r   remove_checkpoints^   s   

zMpuProcessor.remove_checkpointsc                 C   s   | |j}tj|r"|}|  }tj||}t||d d  d S ||   t	j
 }	t||	|}
tj|}tj|}|  }tj||d | }t||d d  |
S rF   )r3   r   r*   r,   isdirr    r-   r   r   r   r4   r	   Zload_trainer_stater5   r6   )r   r>   r%   Zload_all_statestrictr   r@   rB   Z
model_filer?   r1   rA   r   r   r   load_checkpointsk   s$   
zMpuProcessor.load_checkpointsNT)__name__
__module____qualname__r)   r   r    r&   r0   rE   rH   rK   r   r   r   r   r      s    
&r   )module_namec                   @   sF   e Zd ZdZdd ZdefddZdd Zd	d
 Zdd Z	dd Z
dS )MegatronHookr   c                 C   s
   d| _ d S )NF)wrapped)r   r   r   r   __init__   s   
zMegatronHook.__init__r%   c                 C   s   t  }|t}t|dkrt|d jt s|d | |t}t|dkr7t|d jt s7|d | |t}t|dkrSt|d jt sU|d | d S d S d S r!   )	r   Zget_hookr   len
isinstance	processorZset_processorr   r	   )r   r%   rV   Z	ckpt_hookZbest_ckpt_hookZload_ckpt_hookr   r   r   register_processor   s"   




zMegatronHook.register_processorc                 C   sb   t  sJ t }td| |_|j|j t |jt	j
< t |jt	j< t |jt	j< d S )Nzcuda:)r   r   r   Zdevicer   tor   Zget_data_parallel_groupZparallel_groupsr   ZDPZget_tensor_model_parallel_groupZTPZ!get_pipeline_model_parallel_groupZPP)r   r%   Z
local_rankr   r   r   
after_init   s    
zMegatronHook.after_initc                 C      |  | d S Nwrap_moduler$   r   r   r   
before_run      zMegatronHook.before_runc                 C   rZ   r[   r\   r$   r   r   r   
before_val   r_   zMegatronHook.before_valc                 C   s,   |j r| js||j|_d| _d S d S d S rL   )_distrR   Zto_parallelr   r$   r   r   r   r]      s   
zMegatronHook.wrap_moduleN)rM   rN   rO   r)   rS   r   rW   rY   r^   r`   r]   r   r   r   r   rQ      s    rQ   )#r*   r<   r"   Zmegatron_utilr   Zmodelscope.metainfor   Zmodelscope.trainersr   Z!modelscope.trainers.hooks.builderr   Z4modelscope.trainers.hooks.checkpoint.checkpoint_hookr   r   r   Z9modelscope.trainers.hooks.checkpoint.load_checkpoint_hookr	   Zmodelscope.trainers.hooks.hookr
   Zmodelscope.utils.checkpointr   r   Zmodelscope.utils.constantr   Zmodelscope.utils.devicer   Zmodelscope.utils.loggerr   Zmodelscope.utils.megatron_utilsr   Zmodelscope.utils.torch_utilsr   r   Zregister_modulerQ   r   r   r   r   <module>   s&    k