o
    "jzR                     @   s"  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	 d dl
mZmZ ddlmZmZmZ dadadae	 ZdZdZd	Zd
Zdai Zd(ddZdd ZG dd dZG dd deZG dd deZ dd Z!dd Z"dd Z#dd Z$dd Z%d d! Z&d)d"d#Z'd$d% Z(d&d' Z)dS )*    N)current_thread)compilerunique_name)Programin_dygraph_mode   )CheckpointSaverPaddleModelSerializableBase
checkpointZmemory_initZdacpZacpauto_checkpointc                 C   sP   t d urt S t|a t |  dt _t }td}|| t | t S )NFz>%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s)	loggerlogging	getLoggersetLevel	propagateStreamHandler	FormattersetFormatter
addHandler)Z	log_levelnameZlog_handlerZ
log_format r   p/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/base/incubate/checkpoint/auto_checkpoint.py_get_logger-   s   



r   c                   C   s   t  jdks
J dd S )N
MainThreadz*auto checkpoint must run under main thread)r   r   r   r   r   r   _thread_checker@   s   r   c                   @   s   e Zd Zdd Zdd Zdd Zdd Zed	d
 Zdd Z	dd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd  Zed!d" Zd#S )$AutoCheckpointCheckerc              
   C   s  d | _ d | _d | _d | _d | _d | _d | _d | _d | _t	
d| _ | j dkr(d S zt	jd | _t	jd | _t	jd | _t	jd | _t	jd | _t	jd | _tt	jd	 | _tt	
d
d| _t	
dd| _tt	
dd| _| jst| jdkrt| jdkrt| jdkrt| jdksJ dW d S t| jdkrt| jdksJ dW d S  ty } ztd|  td W Y d }~d S d }~ww )NZPADDLE_RUNNING_ENVZPADDLE_EDL_AUTO_CHECKPOINTZPADDLE_RUNNING_PLATFORMZPADDLE_JOB_IDZPADDLE_EDL_HDFS_HOMEZPADDLE_EDL_HDFS_NAMEZPADDLE_EDL_HDFS_UGIZPADDLE_EDL_HDFS_CHECKPOINT_PATHZPADDLE_TRAINER_IDZPADDLE_EDL_ONLY_FOR_CE_TEST0ZPADDLE_EDL_FS_CACHEz.cacheZ PADDLE_EDL_SAVE_CHECKPOINT_INTERZ900      r   zhdfs environ must setz
exception:r   )_run_env	_platform_job_id
_hdfs_home
_hdfs_name	_hdfs_ugi_hdfs_checkpoint_path_trainer_id_ce_testosgetenvenvironint	_fs_cache_save_checkpoint_interlen	Exceptionr   fatalsysexit)selfer   r   r   __init__G   sV   



zAutoCheckpointChecker.__init__c                 C      | j  d| j d| S )N/z/range/hdfs_checkpoint_pathjob_idr4   r   r   r   r   get_range_checkpoint_pathy      z/AutoCheckpointChecker.get_range_checkpoint_pathc                 C   r7   )Nr8   z/exe/r9   r<   r   r   r   get_exe_checkpoint_path|   r>   z-AutoCheckpointChecker.get_exe_checkpoint_pathc                 C   s   | j  d| j S )Nr8   r9   r4   r   r   r   get_job_path   s   z"AutoCheckpointChecker.get_job_pathc                 C      | j S N)r.   r@   r   r   r   save_checkpoint_inter      z+AutoCheckpointChecker.save_checkpoint_interc                 C   sZ   t  rdS | jd uo,| jd uo,| jd uo,| jd uo,| jd uo,| jd uo,| jd uo,| jd uS )NF)	r   r    r!   r"   r#   r$   r%   r&   r'   r@   r   r   r   valid   s"   
zAutoCheckpointChecker.validc              
   C   s(   d | j| j| j| j| j| j| j| jS )Nzrun_env:{} platform:{} job_id:{}             hdfs_home:{} hdfs_name:{} hdfs_ugi:{}             hdfs_checkpoint_path:{} trainer_id:{} ce_test)	formatr    r!   r#   r$   r%   r&   r'   r(   r@   r   r   r   __str__   s   zAutoCheckpointChecker.__str__c                 C   rB   rC   )r'   r@   r   r   r   
trainer_id   rE   z AutoCheckpointChecker.trainer_idc                 C   rB   rC   )r    r@   r   r   r   run_env   rE   zAutoCheckpointChecker.run_envc                 C   rB   rC   )r!   r@   r   r   r   platform   rE   zAutoCheckpointChecker.platformc                 C   rB   rC   )r"   r@   r   r   r   r;      rE   zAutoCheckpointChecker.job_idc                 C   rB   rC   )r#   r@   r   r   r   	hdfs_home   rE   zAutoCheckpointChecker.hdfs_homec                 C   rB   rC   )r$   r@   r   r   r   	hdfs_name   rE   zAutoCheckpointChecker.hdfs_namec                 C   rB   rC   )r(   r@   r   r   r   ce_test   rE   zAutoCheckpointChecker.ce_testc                 C   rB   rC   )r%   r@   r   r   r   hdfs_ugi   rE   zAutoCheckpointChecker.hdfs_ugic                 C   rB   rC   )r&   r@   r   r   r   r:      rE   z*AutoCheckpointChecker.hdfs_checkpoint_pathc                   C   s   t dS )NZ_range_)	generatorr   r   r   r   generate_range_name   s   z)AutoCheckpointChecker.generate_range_nameN)__name__
__module____qualname__r6   r=   r?   rA   propertyrD   rF   rH   rI   rJ   rK   r;   rL   rM   rN   rO   r:   staticmethodrQ   r   r   r   r   r   F   s:    2









r   c                   @   sZ   e Zd Zdd Zdd Zdd Zdd Zd	gfd
dZdd Zdd Z	dd Z
dd ZdS )ExeTrainStatusc                 C   sF   d| _ d | _d | _d | _d | _d | _d | _d | _d | _d | _	d| _
d S )NZexe_train_status)	_epoch_no	_hash_key_key_checkpoint_path_checkpoint_no_restored_from_exe_program	_exe_name_program_name
_file_namer@   r   r   r   r6      s   
zExeTrainStatus.__init__c                 C   sT   | j |j ko)| j|jko)| j|jko)| j|jko)| j|jko)| j|jko)| j|jkS rC   )rY   rZ   r[   r\   r]   ra   rb   r4   tr   r   r   __eq__   s   





zExeTrainStatus.__eq__c                 C   s
   | |k S rC   r   rd   r   r   r   __ne__      
zExeTrainStatus.__ne__c                 C   R   | d| j  }t|d}|  }|| W d    d S 1 s"w   Y  d S Nr8   wrc   open
_serializewriter4   path	file_namefsr   r   r   	serialize   
   "zExeTrainStatus.serializerestored_fromc                 C   s(   |   }|D ]}||d  qt|S rC   )_to_dictpopjsondumps)r4   pop_keysdkr   r   r   rn      s   
zExeTrainStatus._serializec                 C   sV   d }| d| j  }t|d}| }| | W d    d S 1 s$w   Y  d S )Nr8   r)rc   rm   read_deserialize)r4   rq   r}   rr   rs   rt   r   r   r   deserialize   s   "zExeTrainStatus.deserializec                 C   sT   t |}|d | _|d | _|d | _|d | _|d | _|d | _|d | _d S )Nepoch_nokeyhash_keycheckpoint_pathcheckpoint_noexe_nameprogram_name)	rz   loadsrY   r[   rZ   r\   r]   ra   rb   )r4   rt   r}   r   r   r   r      s   






zExeTrainStatus._deserializec              	   C   s&   | j | j| j| j| j| j| j| jdS )N)r   r   r   r   rw   r   r   r   )rY   r[   rZ   r\   r^   ra   rb   r]   r@   r   r   r   rx     s   zExeTrainStatus._to_dictc                 C   
   |  g S rC   rn   r@   r   r   r   rH     rh   zExeTrainStatus.__str__N)rR   rS   rT   r6   rf   rg   ru   rn   r   r   rx   rH   r   r   r   r   rW      s    
rW   c                   @   s   e Zd Z	d#ddZdd Zdd Zd	d
 Zdd Zedd Z	dd Z
ddgfddZedd Zdd Zdd Zdd Zdd Zdd  Zd!d" ZdS )$TrainEpochRangeNTc                 C   s   || _ d| _|| _d | _i | _d| _t| _|d ur|| _n| jj	| _| jdks0J d| j dt

 | _d | _d | _| j sBd S d| _|sId S | j|| _| jj| jjd}| jjr_d }ddlm} || jj|| _t| j| _t  |   d S )	NrX   Fr   zcheckpointer:z	 must >=0Zrange_train_status)zfs.default.namezhadoop.job.ugi)
HDFSClient)_max_epoch_numrY   _namer^   _exe_status_flag_generated	g_checker_checkerr.   rD   time_last_checkpoint_time_load_cp_nos_checkpoint_epoch_norF   rc   r=   r\   rM   rO   rN   Z!paddle.distributed.fleet.utils.fsr   rL   _hdfsr   _cperr   _get_last_valid_checkpoint)r4   max_epoch_numr   checkpoint_interrestoredconfigr   r   r   r   r6     s@   


zTrainEpochRange.__init__c                 C   s   g }d}|d d d D ]C}t | j| jdd}| jj| j|g| jj|| jjd |	| t
d| d|   |dk rA|j}q||j dkrN||f  S qd	S )
NrX   F)r   r   local_cache_pathzlook for valid:z t:r   r   )NN)r   r   r   r   load_checkpointr\   r   rI   r-   appendr   debugrn   rY   )r4   Zcp_nosZcpsr   ire   r   r   r   _look_for_validJ  s&   
zTrainEpochRange._look_for_validc                 C   s  | j | j| _td| j  t| jdk rt| _d S t	t
krD| j j| j| g| jj| jjd t| _| j| _td|    d S t	tkr}| | j\}}|d u rYt| _d S | j j| j| g| jj|| jjd t| _| j| _td|    d S tdt	 )Nzfind checkpoint nos:r   r   z!load tain_epoch_range checkpoint:r   znot supported acp_type:)r   Zget_checkpoint_nor\   r   r   infor/   CONST_MEMORYINITr^   
g_acp_typeCONST_ACP_TYPEr   r   rI   r-   CONST_CHECKPOINTrY   r   rn   CONST_DACP_TYPEr   AssertionError)r4   re   r   r   r   r   r   _  s>   z*TrainEpochRange._get_last_valid_checkpointc                 C   s"   | j | j| j| j| j| jd}|S )N)r   r   r   r   rw   checkpoint_epoch_no)r   rY   r   r\   r^   r   )r4   r}   r   r   r   rx     s   zTrainEpochRange._to_dictc                 C   r   rC   r   r@   r   r   r   rH     rh   zTrainEpochRange.__str__c                 C   rB   rC   )r   r@   r   r   r   r     rE   zTrainEpochRange.namec                 C   ri   rj   rl   rp   r   r   r   ru     rv   zTrainEpochRange.serializerw   r   c                 C   sZ   |   }|D ]}||d  qi |d< |d }| j D ]\}}| ||j< qt|S )N
exe_status)rx   ry   r   itemsrn   r[   rz   r{   )r4   r|   r}   r~   r5   re   r   r   r   rn     s   
zTrainEpochRange._serializec                 C   rB   rC   )r^   r@   r   r   r   rw     rE   zTrainEpochRange.restored_fromc           	      C   s   d }| d| j  }t|d}t|}W d    n1 sw   Y  |d | _|d | _|d | _|d | _|d }| D ]\}}t	 }|
| || j|< q@d S )Nr8   r   r   r   r   r   r   )rc   rm   rz   loadr   rY   r   r\   r   rW   r   r   )	r4   rq   r}   rr   rs   r5   r~   vre   r   r   r   r     s   




zTrainEpochRange.deserializec                 c   s    t   | jdk rtj| _| jdksJ d| j dt | _| jd }td| d| j  t	|| jD ]}|| _|V  | 
  q7d S )Nr   rX   zself._epoch_no:z
 must >=-1r   zstarted epoch_no:z max_epoch_num:)r   r   r2   maxintrY   r   r   r   r   rangesave_checkpoint)r4   startr   r   r   r   next  s"   



zTrainEpochRange.nextc                 C   rB   rC   )rY   r@   r   r   r   get  s   zTrainEpochRange.getc                 C   sx   | j jdkr:t | j | jkr3ttkr&| jdkr%| j| jd kr%| 	  ntt
kr/| 	  ntdt | _d S d S )Nr   r   z#not supported acp_type:{g_acp_type})r   rI   r   r   r.   r   r   r   rY   _save_checkpointr   r   r@   r   r   r   r     s   

zTrainEpochRange.save_checkpointc                 C   s   | j  sdS | j}| j D ]=\}}t|j|j}| j |j}| 	 |_
| jj||g| j j| j jd\}}||_||_|||j< td|   qt| jdkrq| jj| j| g| j jd td|    |   dS dS )zc
        status => /jobid/xxx_range_xx/range/
        model =>                       /exe/
        Nr   zsave executor checkpoint:r   z"save train_epoch_range checkpoint:)r   rF   r   r   r	   r_   r`   r?   rZ   r   rY   r   r   rI   r-   r\   r]   r[   r   r   rn   r/   r   _generate_flag)r4   r5   r~   re   mprq   r   r   r   r   r     s8   



z TrainEpochRange._save_checkpointc                 C   sV   | j rd S d}| j d | }td | j| j  | jj|dd d| _ d S )Nzcan_be_auto_checkpoint.flagr8   zthis job can_be_auto_checkpointT)exist_ok)r   r   rA   r   r   r   Zmkdirstouch)r4   r   rq   r   r   r   r     s   

zTrainEpochRange._generate_flag)NT)rR   rS   rT   r6   r   r   rx   rH   rU   r   ru   rn   rw   r   r   r   r   r   r   r   r   r   r   r     s&    
3)

'r   c                   C   s   t S rC   )g_train_epoch_ranger   r   r   r   _get_train_epoch_range%  s   r   c                 C   sN   |   }d}d}t|jD ]\}}| rd}| rd}|r$|r$ dS qdS )NFT)global_block	enumerateopsZ_is_backward_opZ_is_optimize_op)programr   Zhas_backwardZhas_optidxopr   r   r   _check_program_oprole)  s   r   c                 C   s   t | tjst | tsdS t | tjr| jd u s| jjrdS n| jr$dS t| }|jtv r5t|j s4dS n%d}t |tjrCt	|j}nt	|}|t|j< |sZt
d|j d dS t oatd uS )NFzprogram z need't to auto checkpoint)
isinstancer   CompiledProgramr   r`   Z_is_distributed_get_valid_program_auto_checkpoint_nameg_program_attrr   r   r   r   rF   r   )progr   retr   r   r   _can_auto_checkpoint:  s4   


r   c                 C   s   |  d| S )N_r   )r   r   r   r   r   _get_running_key]  s   r   c                   C   s   t d td u rt atS )N   )r   r   r   r   r   r   r   _get_checkera  s   r   c                 c   s$    | dk rt j} td| E d H  d S )Nr   )r2   r   r   )r   r   r   r   _normal_yieldj  s   r   c                 c   s    t   std t| D ]}|V  qd S ttkr't| D ]}|V  qd S tatdt  zt	| t
 |dat D ]}|V  q?W d ad S d aw )Nz<auto checkpoint will take effect  automaticly on PaddleCloudz	acp_type:)r   )r   rF   r   warningr   r   r   r   r   r   r   rQ   r   r   )r   rD   r   r   r   r   train_epoch_rangep  s0   
r   c                 C   s   t | tjr	| jS | S rC   )r   r   r   r`   )r   r   r   r   r     s   r   c                 C   sL  t   | jd us
J t|sd S t|}|jd usJ tj}t| j|j}tjtkr8||v s8J d| dt d }||v ry|| }|j	d u rmt
tj}t| |}|jt||gtj|jtjd t|_	td|  | |_||_t |_n(t }t |_||_||_t|_	| |_||_| j|_|j|_|||< td t  d S )Nzwhen restored key:z must be in train_epoch_range:)rI   r   r   zload executor checkpoint z+not found checkpoint, so train from epoch 0) r   r   r   r   r   r   r   rw   r   r^   r   r   r	   r   r   r?   rI   r]   r-   r   r   r_   r`   r   rY   rW   rZ   r[   r   ra   rb   r   )exer   r   r   r   re   ar   r   r   r   _auto_checkpoint  sX   







r   )r   rC   )*rz   r   r)   r2   r   	threadingr   Zpaddle.baser   r   Zpaddle.base.frameworkr   r   Zcheckpoint_saverr   r	   r
   r   r   r   ZUniqueNameGeneratorrP   r   r   r   r   r   r   r   r   r   rW   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sH   
 J  #	
"