o
    "j<                     @   s2  d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZmZ d dlmZ d dlmZmZ d d	lmZmZmZmZmZmZmZmZmZ d d
lmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z) e Z*g dZ+dd Z,dd Z-G dd dZ.G dd dZ/e)dG dd de(Z0dS )    N)
check_typecheck_variable_and_dtype)OperatorDistAttr)get_world_process_group)6naive_set_dist_op_attr_for_program_by_mesh_and_mappingset_var_dist_attr)OP_ROLE_KEYOpRole)core)AutoMixedPrecisionListsBF16_is_in_fp32_varnames)	AutoMixedPrecisionLists_is_in_black_varnames_keep_fp32_input_keep_fp32_output_rename_arg_valid_typesfind_op_indexfind_true_post_opfind_true_prev_op)unique_name   )ProcessMesh)is_backward_opis_forward_opis_loss_grad_op
is_loss_opis_optimize_op   )PassBaseregister_pass)Zcreate_py_readerZcreate_double_buffer_readercastwhilec                 C   s(   | t jjjkr	dS | t jjjkrdS dS )NZfp16Zbf16Zfp32)r
   VarDescVarTypeFP16BF16)dtype r(   l/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/passes/auto_parallel_amp.py_dtype_to_strB   s
   r*   c                 C   s.   | dkr	t jjjS | dkrt jjjS t jjjS Nfloat16bfloat16)r
   r#   r$   r%   r&   FP32)Zdstrr(   r(   r)   _str_to_dtypeK   s
   


r/   c                   @   s   e Zd Z				dddZedd Zedd Zed	d
 Zedd Zedd Z	edd Z
edd Zdd Zdd Zdd ZdS )AMPListsNr,   c                 C   sn   d | _ |dkrtt|t|t|| _ n|dkr&tt|t|t|| _ | j d us-J || _|dk| _d S r+   )	_amp_listr   setr   _dtype_is_float16)self
white_list
black_listblack_varnamesr'   r(   r(   r)   __init__U   s   zAMPLists.__init__c                 C      | j r| jjS | jjS N)r4   r1   r6   Z	bf16_listr5   r(   r(   r)   r6   j      zAMPLists.white_listc                 C   r:   r;   )r4   r1   r7   Z	fp32_listr<   r(   r(   r)   r7   q   r=   zAMPLists.black_listc                 C   s   | j jS r;   )r1   	gray_listr<   r(   r(   r)   r>   x   s   zAMPLists.gray_listc                 C   r:   r;   )r4   r1   r8   Zfp32_varnamesr<   r(   r(   r)   r8   |   r=   zAMPLists.black_varnamesc                 C      | j S r;   )r4   r<   r(   r(   r)   is_fp16      zAMPLists.is_fp16c                 C   r?   r;   )r3   r<   r(   r(   r)   r'      rA   zAMPLists.dtypec                 C   r?   r;   )r1   r<   r(   r(   r)   amp_list   rA   zAMPLists.amp_listc                 C   s   | j r	t|| jS t|| jS r;   )r4   r   r1   r   )r5   opr(   r(   r)   _is_in_black_fp32_varnames   s   z#AMPLists._is_in_black_fp32_varnamesc                 C   s8   | j rt||S |jdv r|dkS |jdkr|dvS dS )N)
batch_norm
layer_normXfused_bn_add_activation>   rG   ZF)r4   r   type)r5   rC   in_namer(   r(   r)   _op_keep_fp32_input   s   


zAMPLists._op_keep_fp32_inputc                 C   s&   | j rt||S |jdv r|dkS dS )N)rE   rH   rF   YF)r4   r   rJ   )r5   rC   out_namer(   r(   r)   _op_keep_fp32_output   s
   

zAMPLists._op_keep_fp32_output)NNNr,   )__name__
__module____qualname__r9   propertyr6   r7   r>   r8   r@   r'   rB   rD   rL   rO   r(   r(   r(   r)   r0   T   s.    








r0   c                   @   sD   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dS )AMPStatec                 C   s8   || _ || _|| _|| _|jj| _i | _i | _i | _	d S r;   )
programdist_context	amp_lists	amp_dtypedist_op_contextZgrad_op_id_to_op_idgrad_op_to_op_map_op_fp16_dict_var_name_dictout_var_op_deps)r5   rU   rW   rX   rV   r(   r(   r)   r9      s   
zAMPState.__init__c                 C   s   | j |d S r;   )r[   get)r5   Zop_idr(   r(   r)   _is_fp16_op   s   zAMPState._is_fp16_opc                 C   s  d}| j jD ]s}|jD ]m}|jD ]}|| jvr!|j g| j|< q| j| |j g qt|r4d}|j	t
v r:qt|rG| ||j| qt|rr|j | jv rq| j|j  }|| jv sfJ t|| || j|j < qt|rx nqq| j jD ]}| | q~|S )NFT)rU   blocksopsoutput_arg_namesr]   descoriginal_idextendr   rJ   __amp_skip_ops__r   _mark_black_white_opsr   rZ   r[   strr_   r   _cast_block)r5   is_trainblockrC   name	fwd_op_idr(   r(   r)   build_state   sB   





zAMPState.build_statec                 C   s
  |j dkrd|jd v rd| j|j < d S |j dkrG|jd }t| j| dkrG| | j| d s=d| j|j < d S d| j|j < d S | j	j
d ur]| j	|r]d| j|j < d S |j | j	jv rnd| j|j < d S |j | j	jv rd| j|j < d S |j | j	jv rd}d}|jD ]R}|r||D ]H}||}	|	jd u rq|	j|u rt|||}
|
d u rqn|	j}
| |
j du s|
j | j	jv rd}q| |
j du s|
j | j	jv rd}qq|rd| j|j < d S |rd| j|j < d S 	 d S d| j|j < d S )NZassignZarray_r   Fr   T)rJ   input_arg_namesr[   rc   rd   rb   lenr]   r_   rW   r8   rD   r7   r6   r>   input_namesinput_var_recursiverC   r   )r5   rC   ra   rk   rN   Zis_black_opZis_white_oprK   in_var_namein_varZprev_opr(   r(   r)   rg      sd   






zAMPState._mark_black_white_opsc              	   C   s,  d}d}|t |jk r|j| }d}|jtv r|d7 }qt|r| |j du r>| |||t	| j
tjjj| j}nE| |j du r| j
dkrw|dr_|dd |dd n|drw|dtjjjkrw|dtjjj | |||tjjjt	| j
| j}nt|r| j|}t|rt|j|d  st|j|d  r|js|d7 }|j | jv r'| |j du r| |||t	| j
tjjj| j|}n| |j du r&| j
dkr|dr|dd |dd n|dr|dtjjjkr|dtjjj | |||tjjjt	| j
| j|}n\|jd	kro|j d }|j d }||}	||}
|jD ]}|
j||jkseJ |
 d
|| d
t| qH|	j |
j nt!|ddkrzn	t"d|j d||d 7 }|t |jk s|#  d S )Nr   r   FTr-   Z
use_mkldnnZmkldnn_data_typer'   sumz, op_rolei  'z/' op is not supported in the complete amp pass.)$rp   ra   rJ   rf   r   r_   rc   rd   _insert_cast_op_forwardr/   rX   r
   r#   r$   r.   rV   has_attr	_set_attrattrr&   r   get_op_dist_attr_for_programr   Zis_recomputerZ   _insert_cast_op_backwardrb   ro   var_find_var_recursiver'   rh   	set_dtypeint
ValueError_sync_with_cpp)r5   rk   idxappended_grad_timesrC   num_cast_opsZop_dist_attrout_var_namert   out_varru   r(   r(   r)   ri   )  s   






	


dzAMPState._cast_blockc              
   C   sl  d}i }|j D ]}	|tjjjkr| j||	rq||	D ]}
||
}|j	t
vs.|j|kr/q|j|kr|jd t| }|j|}|||j< ||}|dusSJ |du s\|j|kr||j}|dushJ |j}|j}||| |j||d|jd}t|||| d}|dr|d}|j|dd	|id
|i|j|jdd}|d| t|||| |d7 }n||j}||| t||j| q|dr|d| qq|| j|j  < |tjjjkr4|t!| j"kr4|j#D ]A}| j$||rq|%|D ]1}|&|}|j	t
vrq|jtjjjkr1|j't!| j" |dr1|dt!| j" qq|S )zO
        only for forward cast
        modified from paddle.static.amp
        r   z.cast_NF)rl   r'   persistablestop_gradient/op_namescoper!   rG   Out)in_dtype	out_dtyperJ   inputsoutputsattrsr   r   r   )(rq   r
   r#   r$   r.   rW   rL   rr   r   rJ   r   r'   rl   r*   varsr^   r}   get_input_dist_attrprocess_meshdims_mappingset_input_dist_attr
create_varr   r   rz   r|   Z_insert_op_without_syncr{   r   r   r\   rc   rd   r/   rX   output_namesrO   outputrs   r   )r5   rk   rC   r   	src_dtype	dst_dtyperV   r   Zvar_name_dictrK   rt   ru   	cast_namecast_varconsume_op_attrin_var_dist_attrref_meshref_mappingr   cast_oprN   r   r   r(   r(   r)   ry     s   








D



z AMPState._insert_cast_op_forwardc                  C   s,  dd }dd }	d}
|j  }|j}| j| }|jD ]r}|tjjjkr@|||r@|	|D ]}|
|}|jtjjjks>J q-q|	|D ]F}|
|}|j|kr||}|| j| v rw| j| | }|j || ||}||| qE|j|ksJ d|j|||jt|qEq|jD ]}|tjjjkr|	||r||D ]}|
|}|jtjjjksJ qq||D ]}|
|}|d|d }|
|}|j|jkr|j |j |j|kr|| j| v r||}| j| | }d	}d
|v r||d
d }|d | }|j|}|du s|j|kr|j || ||}|j}|j}||| |dus>J |j||j |d|j!d}t"|||| ||j#| |< |j$|d dd|id|i|j|jt%j&dd}|'d |'d |'d t(|||| |
d7 }
q|j|ksJ qq|
S )zonly for backward castc                 S   s   | j }|dv r|dvS dS )NZlayer_norm_grad>   rG   Y@GRADFrJ   )rC   rK   op_typer(   r(   r)   r        z;AMPState._insert_cast_op_backward.<locals>._keep_fp32_inputc                 S   s   | j }|dv r|dkS dS )Nr   X@GRADFr   )rC   rN   r   r(   r(   r)   r     r   z<AMPState._insert_cast_op_backward.<locals>._keep_fp32_outputr   z;op [{}] expect input [{}] to be dtype [{}] BUT got [{}]. {}N@ z@RENAME@GRADF)rl   shaper'   r   r   r   r!   rG   r   r   r   rw   r   op_role_varr   Zwith_quant_attr))rc   rd   rY   rZ   rq   r
   r#   r$   r.   rr   rs   r'   r}   r\   Z_rename_inputr   r   formatrJ   rh   r   r   findr   r   r^   _rename_outputZget_output_dist_attrr   r   Zset_output_dist_attrr   r   r   r   Zgrad_var_to_var
_insert_opr	   Backward_remove_attrr   ) r5   rk   rC   r   r   r   rV   r   r   r   r   rd   rY   rm   rK   rt   ru   r   r   r   rN   r   r   Zout_var_name_prefixZfwd_varZfwd_cast_namesuffixr   Zout_var_dist_attrr   r   r   r(   r(   r)   r~     s   












Jz!AMPState._insert_cast_op_backwardN)
rP   rQ   rR   r9   r_   rn   rg   ri   ry   r~   r(   r(   r(   r)   rT      s    'BihrT   Zauto_parallel_ampc                       sl   e Zd Z fddZdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Z  ZS )AMPPassc                    s   t    | dd | dd  | dd  | dd  | dd  | dd  | dd	 | d
d | dd | dd | dd | dd | dg  | dg  | dd d | _d | _d | _d | _d S )Nr'   r   lossrV   custom_white_listcustom_black_listcustom_black_varnamesinit_loss_scalingg      @incr_every_n_stepsi  decr_every_n_nan_or_infr   
incr_ratiog       @
decr_ratiog?use_dynamic_loss_scalingFZ
input_dataparams_grads)superr9   set_attr_loss_loss_scaling_num_good_steps_num_bad_stepsr<   	__class__r(   r)   r9     s(   

zAMPPass.__init__c                 C   s   |  ddvr	dS |  ddk rdS |  ddk rdS |  ddk r$dS |  ddk r-dS |  d	dk r6dS |  d
d u r?dS dS )Nr'   )r,   r-   Fr   r   r   r   r   r   rV   T)get_attrr<   r(   r(   r)   _check_self  s   zAMPPass._check_selfc                 C   s   dS )NTr(   )r5   Z
other_passr(   r(   r)   _check_conflict  s   zAMPPass._check_conflictc           	      C   s@  |  d| _|  d| _|  d| _tt|  dt|  dt|  d| j}tj||f t	||| j| j}|
 }|rJ|   | | j |r~| jdkr|   |   |  dse|  d	d
krk|  \}}|  dr| || W d    d S W d    d S W d    d S W d    d S 1 sw   Y  d S )NrV   r   r'   r   r   r   r,   r   r         ?)r   rV   r   rX   r0   r2   paddlestaticZprogram_guardrT   rn   _update_backward_cast_ops
_cast_loss_init_amp_var_scale_loss_check_and_update_gradient_update_loss_scaling)	r5   Zmain_programZstartup_programcontextrW   Z	amp_staterj   grads	found_infr(   r(   r)   _apply_single_impl  sB   


"zAMPPass._apply_single_implc              	   C   s  t j  }|  | jD ]\}}|j}|jtj	j
jkr|jdkrt|dttjkr8|dr8|d t|j||j}|rNtd| d|d  ||jd krVq|j }||j t jj||ddddd	}|j| | j|}| j||jd }	|dusJ |	dusJ t ||j!|j"| j |j!|	_!|j"|	_"t#|j|j}
|
dkrtd
| d|j$|
dd q|  dS )zo
        move param grad cast to the end of backward segment
        in order to enabel fp16 allreduce
        r!   rw   r   zThe cast op zH's output should not beused by a non-optimize op, however, itis used by r   N)rk   rc   rJ   r   r   r   zThe op z is not in programF)sync)%r   r   default_main_programglobal_blockr   r   rC   r'   r
   r#   r$   r.   rJ   r   r|   r	   r   rz   r   r   ra   rl   r   rc   	append_opZ	copy_fromOperatorappendrV    get_tensor_dist_attr_for_programr   rb   r   r   r   r   Z
_remove_op)r5   
main_blockpgrC   Zpost_opsZnew_op_descnew_opZparam_dist_attrZoutput_dist_attrZop_idxr(   r(   r)   r     sj   


z!AMPPass._update_backward_cast_opsc                 C   sP  t j  }|  dd | jD }t|dttfd |D ]}t	|dg dd q|j
tdddgd	gd
tjjjddd}t| j|dgtj || jd}||d}dtji}|jd|||d}t|j}	ttj|	_d|	_ttjd	kr|d|	_ |D ]}
| j!|
}|d usJ |	"|
j#|j$ |	%|
j#|j$ q~| j&||	 ||fS )Nc                 S   s   g | ]\}}|qS r(   r(   ).0_r   r(   r(   r)   
<listcomp>-  s    z6AMPPass._check_and_update_gradient.<locals>.<listcomp>xZcheck_finite_and_unscaler,   float32float64.Zfind_infinite_scaletmpr   boolF)rl   r   r'   rJ   r   r   r   )rG   ZScale)r   FoundInfiniterw   r   r   )'r   r   r   r   r   r   r   tuplelistr   r   r   Zgenerate_with_ignorable_keyjoinr
   r#   r$   Z
LOD_TENSORr   rV   world_process_groupranksr   r	   Optimizer   r   rc   r   r   impl_idxrp   	impl_typer   set_input_dims_mappingrl   r   set_output_dims_mappingset_op_dist_attr_for_program)r5   r   r   er   r   r   r   r   new_op_dist_attrr   g_dist_attrr(   r(   r)   r   )  sb   



	z"AMPPass._check_and_update_gradientc                 C   s   t jjtddg| dddd| _t| j| jdgt	j
 | dr[t jjtd	dgd
ddd| _t| j| jdgt	j
 t jjtddgd
ddd| _t| j| jdgt	j
 d S d S )NZloss_scalingr   r   r   T)rl   r   valuer'   r   r   r   Znum_good_stepsr   Zint32Znum_bad_steps)r   r   Zcreate_global_varr   generater   r   r   rV   r   r   r   r   r<   r(   r(   r)   r   b  sR   
zAMPPass._init_amp_varc              	   C   sZ  t j  }|  | d}|d usJ |j}| j|}|j	t
jjjkr$t|jd }|j|t
jjjd}| j|}|j}	| j|| t|j|j}
|j|
d dd|gid|gi|j	t
jjj| t dd	}|ttj t||	d
d |jD | j d }d}t|j |
d  D ]\}}|j!dkrt"|r|}|d } nt#|r nq|d usJ d|jt|d |jt
jjj|j$d}t%| j|dd |jD |	 |j&d }|'||j t||	dd |jD | j |j|
| dd|gid|git
jjjt(|tj)dd	}t||	dd |jD | j |}|}| *d| || _+|  d S )Nr   z
.cast_fp32)rl   r'   r   r!   rG   r   r   r   c                 S      g | ]}d qS r   r(   r   ir(   r(   r)   r         z&AMPPass._cast_loss.<locals>.<listcomp>   fill_constantThere is not loss_grad op.r   rl   r   r'   r   c                 S   r  r  r(   r  r(   r(   r)   r     r  r   c                 S   r  r  r(   r  r(   r(   r)   r     r  c                 S   r  r  r(   r  r(   r(   r)   r     r  ),r   r   r   r   r   r   rC   rV   r}   r'   r
   r#   r$   r.   r   r  rl   r   r   r   Z set_tensor_dist_attr_for_programr   rc   r   	all_attrsr   r{   r	   Forwardr   r   	enumeratera   rJ   r   r   r   r   rb   r   r/   r   r   r   )r5   Ztarget_dtyper   r   loss_oploss_op_dist_attrtmp_nameZ	cast_lossZloss_dist_attrr   loss_op_idxr   first_backward_opZinsert_op_offsetr   rC   Zcast_loss_gradpre_grad_nameZcast_grad_opr(   r(   r)   r     s   



zAMPPass._cast_lossc                 C   s  t j  }| d}|d usJ |j}| j|}| ds(| ddkrXt|j	|j	}|j
}|jtd|j|j|jd}t| j|dd |jD | |j|d	 d
|g| jgdd|gid| t id}|ttj t||dd |jD | j d }	|j|d  D ]}
|
jdkrt|
r|
}	 nt|
r nq|	d usJ d|jtdd |j|j|jd}t| j|dd |jD | |	jd }|	||j  t|	|dd |jD | j |	|_|!  |j	|d }|"d |#d|j g |#d|j g |#d| jj g |$d|g |$dg  |ttj% |dd t j&||}|j'|d | |!  |j|d  }|jdksIJ t||dd |jD | j n|}|| _(|!  d S ) Nr   r   r   r   scaled_lossr  c                 S   r  r  r(   r  r(   r(   r)   r     r  z'AMPPass._scale_loss.<locals>.<listcomp>r   Zelementwise_mul)rG   rM   r   rw   r   c                 S   r  r  r(   r  r(   r(   r)   r     r  r
  r  r   c                 S   r  r  r(   r  r(   r(   r)   r   5  r  r   c                 S   r  r  r(   r  r(   r(   r)   r   ?  r  r	  Zelementwise_mul_gradzOut@GRADrG   rM   r   r   Zaxisr   c                 S   r  r  r(   r  r(   r(   r)   r   ^  r  ))r   r   r   r   r   rC   rV   r}   r   rc   r   r   r   r  r   r'   r   r   r   r   r  r   r{   r	   r  r   ra   rJ   r   r   rb   r   rl   r   set_typeZ	set_inputZ
set_outputr   r   insertr   )r5   r   r   r  r  r  r   r  Zelementwise_mul_opr  rC   Zscaled_loss_gradr  Zelementwise_mul_grad_op_descZelementwise_mul_grad_opr(   r(   r)   r     s   
	


zAMPPass._scale_lossc                 C   s  t j  }|  t| jdddgd t|dtt	fd |D ],}t|dg dd |j
tjjjkrA| jj
tjjjks@J dq | jj
|j
ksLJ dq ||| j| j| jd	}|| j| j| jd
}| d| d| d| d| dtjd}|jd|||d}t|j}	ttj|	_d|	_ttjdkrd|	_|D ]}
| j |
}|d usJ |	!|
j"|j# |	$|
j"|j# q| j%||	 |  d S )NZprev_loss_scalingr   r   Zupdate_loss_scalingr   r   zPThe dtype of prev_loss_scaling should be float32 when the dtype of x is float16.zAThe dtype of prev_loss_scaling should be equal to the dtype of x.)rG   r   ZPrevLossScalingZInGoodStepsZ
InBadSteps)r   ZLossScalingZOutGoodStepsZOutBadStepsr   r   r   r   stop_update)r   r   r   r   r  rw   r   r   r   )&r   r   r   r   r   r   r   r   r   r   r'   r
   r#   r$   r%   r.   r   r   r   r	   r   r   r   rc   r   r   r   r   r   rp   r   rV   r   r   rl   r   r   r   )r5   r   r   r   r   r   r   r   r   r   r   r  r(   r(   r)   r   f  sx   		
	zAMPPass._update_loss_scalingc                 C   s   | j r| j S | dS )Nr   )r   r   r<   r(   r(   r)   get_loss  s   
zAMPPass.get_loss)rP   rQ   rR   r9   r   r   r   r   r   r   r   r   r   r  __classcell__r(   r(   r   r)   r     s    "F9,dtKr   )1r   Zpaddle.base.data_feederr   r   Z6paddle.distributed.auto_parallel.static.dist_attributer   Z5paddle.distributed.auto_parallel.static.process_groupr   Z-paddle.distributed.auto_parallel.static.utilsr   r   Z/paddle.distributed.fleet.meta_optimizers.commonr   r	   Zpaddle.frameworkr
   Z paddle.static.amp.bf16.amp_utilsr   r   Zpaddle.static.amp.fp16_utilsr   r   r   r   r   r   r   r   r   Zpaddle.utilsr   Zauto_parallel.process_meshr   Zauto_parallel.static.utilsr   r   r   r   r   Z	pass_baser   r    r   rf   r*   r/   r0   rT   r   r(   r(   r(   r)   <module>   s0   ,		X   i