o
    "jH                     @   s   d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ dd	 Zd
d ZG dd deZdS )    N)coreunique_name)global_scope)Variable
name_scope)LayerHelper)ClipGradByGlobalNorm)	Optimizerc           
   
      s~  t jd  dd  dD    } fdd|D }|}td}| j|dtjj	j
d}t rG| jd	i d
|i||||dd n0t r\| jdi d
|i||||dd ntj jtj v rw| jdi d
|i||||dd | jdd|ii t|||d dd | jtdd}	| jdd
|	iddid | jdd|	id
|	i|ddd | jdd|	id
|	id |S )NZPADDLE_TRAINER_ENDPOINTSc                 S   s   g | ]
}|  r|  qS  )strip).0epr
   r
   q/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/incubate/optimizer/distributed_fused_lamb.py
<listcomp>       z%init_communicator.<locals>.<listcomp>,c                    s   g | ]
}|kr | qS r
   r
   )r   repsrankr
   r   r      r   Zcomm_idT)namepersistabletypeZc_gen_nccl_idOut)r   ZendpointZother_endpointsring_idr   inputsoutputsattrsZc_gen_bkcl_idZc_gen_xccl_idZc_comm_initX)nranksr   r   Z	endpointstmp)r   Zfill_constantvalue   )r   r   r   Zc_allreduce_sumr   Zuse_calc_streamZc_sync_calc_stream)r   r   r   )osenvironsplitindexr   generate
create_varr   VarDescVarTypeZRAWZis_compiled_with_cuda	append_opZis_compiled_with_xpupaddledistributedZParallelEnvZdevice_typeZdeviceZget_all_custom_device_typelenjoin)
blockr   Zranksr   Zcur_epZ	other_eps
local_rankZcomm_var_nameZcomm_id_varZtmp_varr
   r   r   init_communicator   s   



r4   c                 C   s.   |D ]}| j dd|id|i|ddd qd S )NZc_broadcastr   r   Tr$   r   )r-   )r2   
parametersr   pr
   r
   r   broadcast_parametersi   s   r7   c                       s   e Zd Z																	
	d% fdd	Zdd Zdd Zdd Zdd Zdd Zdd Z	ddgdfddZ
d&ddZdd  Zd!d" Zd#d$ Z  ZS )'DistributedFusedLambMbP?{Gz??+?ư>NT   r#   Fc                    sF  t  rJ dt j|d |d || _|| _|| _|d ur |nd| _|d ur4t|t	s0J d|j
}nd}|| _|d ur?|nd| _|	| _|
| _|| _d | _|| _|| _|| _|| _|| _| jdksdJ td| _d	| _| jj }|jtd
dgtjj j!d| _"d | _#| jdkr|jtddgtjj j!d| _$nd | _$i | _%d S )Nz2DistributedFusedLamb does not support dygraph mode)learning_rate	grad_clipr           z>Only ClipGradByGlobalNorm is supported in DistributedFusedLambg      r#   distributed_fused_lambTZ	found_inf)r   shapedtypeZstop_update)&r.   Zin_dynamic_modesuper__init___beta1_beta2_epsilon_weight_decay
isinstancer   Z	clip_norm_max_global_grad_norm
_alignment_clip_after_allreduce_is_grad_scaled_by_nranks_exclude_from_weight_decay_fn_scale_use_master_param_norm_gradient_accumulation_steps_use_master_acc_grad_nproc_per_node_use_hierarchical_allreducer   helperZ_supports_check_nan_infmain_programglobal_blockr*   r   r)   r   r+   r,   ZBOOL
_found_inf_step_stop_update_param_to_master_param)selfr?   Zlamb_weight_decaybeta1beta2epsilonr5   r@   Zexclude_from_weight_decay_fnclip_after_allreduceis_grad_scaled_by_nranks	alignmentuse_master_param_normZgradient_accumulation_stepsuse_master_acc_gradnproc_per_nodeuse_hierarchical_allreducer   max_global_grad_norm
main_block	__class__r
   r   rG   t   s^   



zDistributedFusedLamb.__init__c                 C   s   | j d ur| j S dS )NF)r]   r_   r
   r
   r   _get_stop_update_var   s   z)DistributedFusedLamb._get_stop_update_varc                 C   s
   || _ d S N)r\   )r_   stepr
   r
   r   	_set_step   s   
zDistributedFusedLamb._set_stepc                 C   s    | j d u r| jddd| _ | j S )Nrq   int64rE   )r\   _create_persistable_varrn   r
   r
   r   _get_or_create_step   s   
z(DistributedFusedLamb._get_or_create_stepc                 C   s*   |d usJ t |ts| |}|| _d S rp   )rL   r   _create_scale_from_constantrR   )r_   scaler
   r
   r   
_set_scale   s   


zDistributedFusedLamb._set_scalec                 C   s&   t d}tjj|dgdt|ddS )NZglobal_scaler#   float32T)r   rD   rE   r"   r   )r   r)   r.   ZstaticZcreate_global_varfloat)r_   r"   r   r
   r
   r   rw      s   
z0DistributedFusedLamb._create_scale_from_constantc                 C   s   | j d u r| d| _ | j S )Ng      ?)rR   rw   rn   r
   r
   r   _get_or_create_scale   s   
z)DistributedFusedLamb._get_or_create_scalerB   rz   c                 C   s\   | j j }|d urt|}|j|||ddd}| j j }|j|j|j|j	ddd}|S )NT)r   rD   rE   r   Zstop_gradient)
rX   startup_programrZ   r   r)   r*   rY   r   rD   rE   )r_   r   rD   rE   startup_blockZstartup_varrk   Zmain_varr
   r
   r   ru      s&   
z,DistributedFusedLamb._create_persistable_varc                 C   s   |d u rt  }| j|}|d usJ || }| tjjj	ks%J || }| tjjj	krC|
 |
 ks?J |d fS | tjjjksNJ | | ksXJ ||fS rp   )r   r^   getZfind_varZ
get_tensorZ_dtyper   r+   r,   ZFP32Z_ptrZFP16rD   )r_   r   scopeZmaster_paramZmaster_param_tZparam_tr
   r
   r   _get_parameter   s   z#DistributedFusedLamb._get_parameterc                 C   s   |  | d S rp   )apply_gradients)r_   params_gradsr
   r
   r   apply_optimize  s   z#DistributedFusedLamb.apply_optimizec              	   C   s   g }|D ]\}}| ||g q|d jj|* td | | W d    n1 s.w   Y  W d    d S W d    d S 1 sFw   Y  d S )Nr   Z	optimizer)extendr2   programZ_optimized_guardr   _apply_gradients_impl)r_   r   Z	flattenedr6   gr
   r
   r   r     s   Pz$DistributedFusedLamb.apply_gradientsc           .      C   s  |D ]\}}|j tjjjksJ dd|_q| d}| d}| jddd}| jddd}g }|D ]\}}| d	}	|	j| j|j< |	|	 q2| d
}
d|
_
| d}d|_
| d}| d}| jddd}d|_
| jddd}| jddd}d|_
| jddd}d|_
| jddd}d|_
| jdkr| dg}| jdddg}| jdddg}ng }g }g }|  }tj }tj }| jd u r|}n| j}|| dksJ d||k}|| }t|| }t|| }g }| jj }|dkrt||tt|d} |	|  d}!|dkrVt|dkrV|rVtt|| |d | }"t|||"d} |	|  | jrV||krVd}!tt|| ||}#t|||#|d d } |	|  |  }$dd |D }%dd |D }&dgt|% }'| jd urt|%D ]\}(}| |rd|'|(< qy|&D ]}|j|j|j |j|j|j d  q|dkrt!||%|d  |j"d!|%|&d"i d#|gd$|gd%|gd&|gd'|
gd(|gd)|gd*|gd+|$gd,|gd-|%d.|d/|&d0|gd1|gd2|gd3|gd4|gi| j#|r |n||r|n||'d5d5| j$| j%d6d7 | jj& })| '  d }*|D ]!}+|*d u r/| (|+}*q!| (|+},t)|*t)|,ksAJ d8q!|*d usJJ |)j"d9i d#|gd$|gd%|gd&|gd:|*gd'|
gd(|gd)|gd*|gd+|$gd,|gd;|%d<|&d2|gd0|gd1|gd3|g|g|g|
g|g|g|g|%|&| j*g|||| j+d ur| j+ng |gd=| j,| j$| j%| j-| j.| j/|||| j0| j1| j| j2|!d>d7}-|-gS )?NzOnly support dense gradientTfp32_fused_paramfp32_fused_gradfp16_fused_paramZfloat16rt   fp16_fused_gradZmaster_weightmoment1moment2beta1powbeta2pow
param_infoZint32fused_offsetsfp32_partial_fused_offsetsfp16_partial_fused_offsetsparam_orderr#   fp32_acc_fused_gradfp16_acc_fused_gradacc_steprs   r   z2nranks should be exactly divided by nproc_per_nodeFrB   c                 S   s   g | ]\}}|qS r
   r
   )r   r6   _r
   r
   r   r         z>DistributedFusedLamb._apply_gradients_impl.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r
   r
   )r   r   r   r
   r
   r   r     r   )r   r   rE   r   rD   Zdistributed_fused_lamb_init)ParamGradZFP32FusedParamZFP32FusedGradZFP16FusedParamZFP16FusedGradZMoment1ZMoment2ZBeta1PowZBeta2PowZGlobalScaleZ	ParamInfoParamOutZMasterParamOutGradOutZFP32ShardFusedParamOffsetsZFP16ShardFusedParamOffsetsZFusedParamOffsetsZ
ParamOrderSteprA   )re   r   r    apply_weight_decayr   r   r`   ra   r   z7The learning rate for each parameter should be the samerC   ZLearningRater   r   )ZFP32FusedParamOutZFP16FusedParamOutZ
Moment1OutZ
Moment2OutZBeta1PowOutZBeta2PowOutr   r   ZFoundInfZFP32AccFusedGradZFP16AccFusedGradZAccStepZ
StopUpdater   )Zweight_decayr`   ra   rb   rj   rc   r   r    ring_idsrf   rd   Z	acc_stepsrg   ri   )3r   r   r+   r,   Z
LOD_TENSORr   ru   r   r^   appendZis_distributedrT   rv   r.   r/   Zget_rankZget_world_sizerV   intrX   r}   rZ   r4   listranger0   rW   r|   rQ   	enumerater*   rE   rD   r7   r-   rN   rH   rI   rY   Z_create_global_learning_rateZ_create_param_lridr[   r]   rK   rJ   rM   rO   rS   rP   rU   ).r_   r   r6   r   r   r   r   r   Zmaster_paramsZmaster_pr   r   r   r   r   r   r   r   r   r   r   r   rq   r   r    rh   Zshard_inside_noder3   Znode_idZnode_numr   r~   r   ri   Zlocal_group_ranksZouter_group_ranksrx   paramsZgradsr   irk   lrZp_gZnew_lrZlamb_opr
   r
   r   r     s  
















	
&


	
8z*DistributedFusedLamb._apply_gradients_impl)r9   r:   r;   r<   r=   NNNTTr>   Tr#   TNFNrp   )__name__
__module____qualname__rG   ro   rr   rv   ry   rw   r|   ru   r   r   r   r   __classcell__r
   r
   rl   r   r8   s   s<    I

	r8   )r%   r.   Zpaddle.baser   r   Zpaddle.base.executorr   Zpaddle.base.frameworkr   r   Zpaddle.base.layer_helperr   Z	paddle.nnr   Zpaddle.optimizerr	   r4   r7   r8   r
   r
   r
   r   <module>   s   O
