o
    "j1                     @   sN  d dl Z d dlmZ d dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZ d dlmZmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ ddlmZ ddlmZ ddl m!Z! e
 Z"g dZ#da$da%dd Z&dd Z'dd Z(G dd dZ)dd Z*dd Z+dd Z,d d! Z-d*d#d$Z.d%d& Z/e!d'G d(d) d)eZ0dS )+    N)defaultdict)
check_typecheck_variable_and_dtype)OperatorDistAttr)get_world_process_group)is_backward_opis_forward_op6naive_set_dist_op_attr_for_program_by_mesh_and_mappingset_var_dist_attr)OP_ROLE_KEYOpRole)core)default_main_programdefault_startup_program)#_keep_layer_norm_scale_bias_to_fp32)unique_name   )ProcessMesh   )AMPPass)register_pass)Zcreate_py_readerZcreate_double_buffer_readerwhilecastc                 C   s   |  dr| dtjjjkr| dt |  dr*| dtjjjkr*| dt |  dr?| dtjjjkr?| dt ttjjjkr^|  drQ| dd |  dr`| dd d S d S d S )Nin_dtype	out_dtypedtypeZ
use_mkldnnTZmkldnn_data_typebfloat16)	has_attrattrr   VarDescVarTypeFP32	_set_attr__target_dtype__BF16)op r&   m/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/passes/auto_parallel_fp16.pyset_op_dtype_to_fp168   s"   

r(   c                 C   s   | j }|dkr|dkS |dkrt r|dkS |dkr|dvS |dkr&|dvS |dv r.|d	v S |d
v r6|dvS |dv r>|dvS dS )N
batch_normX
layer_normfused_bn_add_activation>   r*   Zresnet_unit>   r*   r-   ZFilterZZFilterXZfused_attentionZfused_feedforward>   ZLn1ScaleZLn2BiasZLnBiasZLn2ScaleZLn1BiasZLnScaleZbatch_norm_grad>   r*   zY@GRADZlayer_norm_gradFtyper   )r%   in_nameop_typer&   r&   r'   _keep_fp32_inputO   s    	r6   c                 C   sp   | j }|dv r|dkS |dkrt r|dkS |dkr|dvS |dv r&|dv S |dv r.|d	kS |d
v r6|d	kS dS )N)r)   r,   Yr+   r.   >   r7   ZConvZZConvXr/   >   ZLn2VarianceZLnMeanZ
LnVarianceZLn1MeanZLn2MeanZLn1Variancer1   zX@GRADr0   Fr2   )r%   out_namer5   r&   r&   r'   _keep_fp32_outputl   s   	r9   c                   @   sX   e Zd Z	dddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd ZdS )	FP16StateNc                 C   s\   || _ || _|| _|| _| jjj| _|r|| _ng | _i | _i | _	t
t| _d| _i | _d S )NF)programamp_listuse_fp16_guarddist_contextdist_op_contextgrad_op_id_to_op_idgrad_op_to_op_mapinput_data_var_names_op_fp16_dictforward_non_leaf_tensorsr   listforward_input_cast_opsis_trainout_var_op_deps)selfr;   r<   r>   r=   rB   r&   r&   r'   __init__   s"   
zFP16State.__init__c                 C   s   | j |d S N)rC   get)rI   op_idr&   r&   r'   _is_fp16_op      zFP16State._is_fp16_opc                 C   s   | j jD ].}|jD ](}|jD ]}|| jvr|j g| j|< q| j| |j g q| | q	q| j jD ]}| 	| q7| j jD ]}| 
| qC| jS )zy
        mark the execution mode (fp16 or fp32) for ops in all blocks
        include forward ops & backward ops
        )r;   blocksopsoutput_arg_namesrH   descoriginal_idextend_mark_opresolute_tensor_dtype
cast_blockrG   )rI   blockr%   namer&   r&   r'   _build_state   s   




zFP16State._build_statec                 C   sx  |j tv rd S t|r||j dkr!d|jd v r!d| j|j < d S |j dkrR|jd }t| j	| dkrR| j| j	| d  sHd| j|j < d S d| j|j < d S t
|| jj| jred| j|j < nd| j|j < |jD ]
}|j | j|< qpn0t|ttjkr|j | jv r| j|j  }|| jv sJ t| | j| | j|j < t|ddkrd| _d S d S )	NZassignarray_r   Fr   Top_rolei  )r3   __amp_skip_ops__r   input_arg_namesrC   rS   rT   rR   lenrH   __amp_utils__Z_need_keep_fp32r<   Zunsupported_listr=   idrD   r   intr   BackwardrA   strr   rG   )rI   r%   r8   var_nameZ	fwd_op_idr&   r&   r'   rV      sD   




zFP16State._mark_opc              
   C   s   d }z| |}W n ty  } z||}W Y d }~nd }~ww |d u s/|jtjvs/d|v r1d S |jtjj	j
krA|jt d S d S )Nr\   )var
ValueErrorZ_var_recursiver3   ra   _valid_typesr   r   r   r    r!   rS   	set_dtyper#   )rI   rf   rY   rg   er&   r&   r'   set_var_to_fp16   s   zFP16State.set_var_to_fp16c                 C   s  |j D ]}t|r| |j du s|jdkr[|jD ] }t||r#q||D ]}|| j	vr:|| j
vr:| || q(q|jD ]}t||rGq?||D ]}| || qLq?t| q| |j du r|jD ]!}|j|}|d u sz|jtjvr{qh|jtkr|jtjjj qhqt|r| |j du s|jdkr|jD ]}t||rq||D ]}| || qqt| q| |j du r|jD ]!}|j|}|d u s|jtjvrq|jtkr|jtjjj qqd S )NTr   F)rQ   r   rN   rS   rT   r3   input_namesr6   inputrD   rB   rl   output_namesr9   outputr(   rR   varsrL   ra   ri   r   r#   rj   r   r   r    r!   r   )rI   rY   r%   r4   in_var_namer8   out_var_nameout_varr&   r&   r'   rW     sb   
















zFP16State.resolute_tensor_dtypec           
      C   s  | j j}d}|t|jk r|j| }d}|jtv r|d7 }qt|rT| |j	 du r;| 
|||ttjjj| j }n| |j	 du rS| 
|||tjjjt| j }n~t|r|j	 |jv r| |j	 du ry| |||ttjjj| j }nY| |j	 du r| |||tjjjt| j }n@|jdkr|jd }|jd }||}||}	|jD ]}|	j||jksJ |	 d|| dt| q|j|	j ||d 7 }|t|jk s|  d S )Nr   r   FTsum, )r>   r?   r`   rQ   r3   r^   r   rN   rS   rT   _insert_forward_cast_opsr#   r   r   r    r!   r   r@   _insert_backward_cast_opsrR   r_   rg   _find_var_recursiver   re   rj   _sync_with_cpp)
rI   rY   r?   idxr%   num_cast_opsrs   rr   rt   in_varr&   r&   r'   rX   <  s|   







;zFP16State.cast_blockc                 C   s  d}|j D ]}|tjjjkrt||rq||}	|	d usJ ||D ]}
||
}|d u s:|j	t
jvs:|j|kr;q$|j|kr|jd t
| }|j|}| j|j   ||j|||fg7  < t|	|j}|d usrJ |d u s{|j|kr|j}|j}|j||d|jd}t|||| d}|dr|d}|j|dd|id	|id
|jd|jtt j!id}|"d| t#|||| |d7 }|$|j| |	%|| q$q|dr|ddkr|d|ksJ |S )Nr   z.cast_F)rZ   r   persistablestop_gradient/op_namescoper   r*   Outr   r   r3   inputsoutputsattrsr   )&rm   r   r   r    r!   r6   get_op_dist_attr_for_programrn   ry   r3   ra   ri   r   rZ   Z_dtype_to_strrq   rL   rF   rS   rT   copydeepcopyget_input_dist_attrprocess_meshdims_mapping
create_varr   r
   r   r   _insert_op_without_syncr   r   ZForwardr"   r	   _rename_inputset_input_dist_attr)rI   r%   r{   rY   	src_dtype	dst_dtyper>   r|   r4   Zconsume_op_attrrr   r}   	cast_nameZcast_varZin_var_dist_attrref_meshref_mappingr   cast_opr&   r&   r'   rw   |  s   







Ez"FP16State._insert_forward_cast_opsc                 C   sD  d}|j  }|j  }	|j}
|
j|	 }||}|d usJ |jD ]}||}t||j	r0q"|j
|ks@J t| d| q"| j| D ]\}}}}}||jv r|||v shJ d| d| dt| ||}|d ussJ ||| ||| |d }||jv rt||dkrqFt||dksJ d| d	t| ||d }||}||}|d usJ | |j}|j}|jtd
|dg||j|j|j|jd}| || |!||j	 |"|j	| |j#|d dd|j	gid|j	gid|d|t$t%j&id}|j '| t(|||| |d7 }qF|S )Nr   rv   zvar: z not in op's z. z@GRADr   [z], Current Op:  rZ   r   shaper3   r~   r   r   r*   r   r   r   r   ))rS   rb   rT   r?   r@   r   rR   rg   r9   rZ   r   re   rF   rm   rn   r   r   r   ro   r`   rp   Zget_output_dist_attrr   r   r   r   generate_with_ignorable_keyjoinr   r3   r~   r   Z set_tensor_dist_attr_for_programZ_rename_outputZset_output_dist_attrr   r   r   rd   rj   r	   )rI   r%   r{   rY   r   r   r>   r|   rM   rT   r?   Zforward_op_idZgrad_op_attrrs   rt   r   src_nameZ	slot_nameZsrc_var_dist_attrZgrad_slot_nameZ	grad_nameZgradZgrad_dist_attrr   r   Z	cast_gradr   r&   r&   r'   rx     s   





"






z#FP16State._insert_backward_cast_opsrK   )__name__
__module____qualname__rJ   rN   r[   rV   rl   rW   rX   rw   rx   r&   r&   r&   r'   r:      s    
*9@Wr:   c                 C   s8  t j  }|  t| dttfd | D ]}t|dg dd q|j	t
dd|gdgdtjjjddd	}t||d
gtj | |d}| |d}dtji}	|jd|||	d}
t|
j}ttj|_d|_ttjdkrrd|_| D ]}||}|d usJ ||j |j! |"|j |j! qt|#|
| | |fS )NxZcheck_finite_and_unscale)float16Zfloat32Zfloat64.find_infinite_scaler   boolF)rZ   r   r   r3   r~   r   r   )r*   ZScale)r   ZFoundInfiniter]   r   r   )$paddlestaticr   global_blockrz   r   tuplerE   r   r   r   r   r   r   r   r    
LOD_TENSORr
   world_process_groupranksr   ZOptimize	append_opr   rS   r   r   impl_idxr`   Z	impl_type get_tensor_dist_attr_for_programset_input_dims_mappingrZ   r   set_output_dims_mappingset_op_dist_attr_for_program)gradsZloss_scalingrZ   r>   Z
main_blockrk   	found_infr   r   r   new_opnew_op_dist_attrgZg_dist_attrr&   r&   r'   _check_and_update_gradient/  s\   




	
r   c                 C   sT   dd | D }dd |D }dd |D }t |t | t |ks%J d|||fS )Nc                 S   s   g | ]\}}|qS r&   r&   ).0_r   r&   r&   r'   
<listcomp>g  s    z _split_grads.<locals>.<listcomp>c                 S   s    g | ]}|j tjjjkr|qS r&   )r   r   r   r    r!   r   r   r&   r&   r'   r   h  s     c                 S   s   g | ]	}|j tkr|qS r&   )r   r#   r   r&   r&   r'   r   i  s    z4Data types of all grads must be either fp16 or fp32.)r`   )params_gradsr   
fp32_grads
fp16_gradsr&   r&   r'   _split_gradsf  s   
r   c                 C   s   t  }t||_d|_| jD ]}||}||}|d us J |||j q| j	D ]}||}||}|d us=J |
||j q+|| | d S )Nr   )r   r   r   r   r_   rg   r   r   r   rR   r   r   )r   r   rY   r>   r   rf   rg   Zvar_dist_attrr&   r&   r'   _set_op_dist_attr_with_ranksp  s$   






r   c                 C   sB   t | jD ]\}}|jdkr|jd |jkr|d   S qtd)N
reduce_anyr   r   z=not found the correct location for memcopy for found_inf_var.)	enumeraterQ   r3   rR   rZ   RuntimeError)rY   Zfound_inf_varr{   r%   r&   r&   r'   _get_memcopy_idx  s   
r   D2Hc           
      C   s   |j }| jt|dg|j|jtjj	j
d|jd}t||dd |jD tj |dkr0d}ntd| d	d
|i}| j|dd|gid|gi|d}	t|	tj| | |   |S )NZmemcopy_Fr   c                 S      g | ]}d qS r   r&   r   ir&   r&   r'   r         z#_insert_memcopy.<locals>.<listcomp>r   r   zdirection [z] is not supported yet.dst_place_typeZ
memcpy_d2hr*   r   )indexr3   r   r   r   )rZ   r   r   r   r   r   r   r   r   r    r   r   r
   r   r   NotImplementedErrorr   r   rz   )
rY   r{   Zsrc_varr>   	directionr   Z
output_varr   r   r   r&   r&   r'   _insert_memcopy  sF   


r   c            	      C   s   t  } t }i }| jD ]}| D ]}|j||j< qqdd }| jD ]G}||rk|jd }|	|d t
krk|dsFJ dt| d| |}|jtjjjkr[|jt
 |dtjjjkrk|dt
 q$d S )Nc                 S   s<   d}| j }||rdS t| jdkrt| jdkrdS dS )NZc_Fr   r   T)r3   
startswithr`   rR   r_   )r%   Zcomm_op_prefixr5   r&   r&   r'   is_initialization_op  s   
z2cast_startup_program.<locals>.is_initialization_opr   r   z>initialization op is supported to has dtype attribute but got r   )r   r   rP   Zall_parametersr   rZ   r   rQ   rR   rL   r#   r   re   rg   r   r   r    r!   rS   rj   r   r"   )	main_programstartup_programZparam_to_dtyperY   pr   r%   Zoutput_namert   r&   r&   r'   cast_startup_program  s0   

r   Zauto_parallel_fp16c                       s$   e Zd Z fddZdd Z  ZS )FP16Passc                    s   t    d S rK   )superrJ   )rI   	__class__r&   r'   rJ     rO   zFP16Pass.__init__c                 C   s  |  d| _|  d| _|  d}|  dd | _| jd u r&|  dd dk| _| jdkr@dd lm  m  m} |j}t	j
jj}n| jd	krTdd
lm} |j}t	j
jj}n	td| j d|a|a|t|  dt|  dd }dd |  dD }	tj||# t||| j|  d|	}
|
 }t  |r| | j W d    n1 sw   Y  |rg| jdkrtj||D |   |   t|\}}}|  ds|  ddkrg }|r|g  t|| j d| j\}}W d    n1 sw   Y  |!| |r.|g  t|| j d| j\}}W d    n	1 s$w   Y  |!| |g  |" }|j#tj$j%&d'ddg|d j(d |d j)|d j*ddd}|j+dd|id|giddid }t,| j|d!gt-j. t/|t-j.|| j |j#tj$j%&d'd"dg|j(d |j)|j*ddd}|j+d#d|id|idgdd$d%d }t,| j|d&d |j0D t-j. t/|t-j.|| j W d    n	1 sw   Y  |  dr|g  |r| 1|| |r| 1|| W d    n	1 sw   Y  W d    n	1 sw   Y  |  d'}d$|_2| jrd|_2| jdkrit3|tj4j5tj4j6frX|g  t7||}t8|||| j}W d    n	1 sJw   Y  |9d(|j: d S t;|d)rk|9d(|j: d S d S d S d S )*Nr>   r   r   use_optimizer_fp16levelZo3r   r   r   )	amp_utilsztarget dtype [z"] is for amp o2 not supported yet.Zcustom_white_listZcustom_black_listc                 S   s   g | ]}|j qS r&   )rZ   )r   rg   r&   r&   r'   r     s    z/FP16Pass._apply_single_impl.<locals>.<listcomp>Z
input_datar=   Zuse_dynamic_loss_scalingZinit_loss_scalingg      ?z@fp32z@fp16r   concattmpF)rZ   r   r   	lod_levelr3   r~   r   r*   r   Zaxisr   r   r   r   T)dimZkeep_dimZ
reduce_allc                 S   r   r   r&   r   r&   r&   r'   r   {  r   base_optr   _set_auxiliary_var)<Zget_attrr>   Ztarget_dtyper   paddle.static.amp.fp16_utilsr   ampZ
fp16_utilsZAutoMixedPrecisionListsr   r   r    ZFP16Zpaddle.static.amp.bf16r   ZAutoMixedPrecisionListsBF16r$   r   r#   ra   setr   Zprogram_guardr:   r[   r   Z
_cast_lossZ_init_amp_varZ_scale_lossr   Z_optimized_guardr   Z_loss_scalingappendr   r   utilsr   r   r   r   r   r3   r   r
   r   r   r   r   Z_update_loss_scalingZ_multi_precision
isinstanceZ	optimizerZAdamZAdamWr   r   r   rZ   hasattr)rI   r   r   contextr   r   ZAMPListZ_FP16Pass__target_dtyper<   rB   Z
fp16_staterG   r   r   r   Z
found_infsr   Zfound_inf_fp32Zfound_inf_fp16rY   Zall_infsZ	concat_opr   Zreduce_any_opr   Z
insert_idxr&   r&   r'   _apply_single_impl  sJ  



	




E
k

rzFP16Pass._apply_single_impl)r   r   r   rJ   r   __classcell__r&   r&   r   r'   r     s    r   )r   )1r   collectionsr   r   Zpaddle.common_ops_importr   r   Z6paddle.distributed.auto_parallel.static.dist_attributer   Z5paddle.distributed.auto_parallel.static.process_groupr   Z-paddle.distributed.auto_parallel.static.utilsr   r   r	   r
   Z/paddle.distributed.fleet.meta_optimizers.commonr   r   Zpaddle.frameworkr   Zpaddle.staticr   r   r   r   Zpaddle.utilsr   Zauto_parallel.process_meshr   Zauto_parallel_ampr   Z	pass_baser   r   r^   r#   ra   r(   r6   r9   r:   r   r   r   r   r   r   r   r&   r&   r&   r'   <module>   sD      -7

+"