o
    "jsQ                     @   s   d dl mZ d dlZd dlZd dlmZmZ ddlm	Z	 ddl
mZmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZmZ dd Zdd Z dd Z!dd Z"G dd dZ#edG dd deZ$dS )    )reduceN)OP_ROLE_KEYOpRole   )ProcessMesh)OperatorDistAttrTensorDistAttr)SyncModeis_data_parallel_reduce_op)get_all_process_groupsget_world_process_group)	Resharder)_get_comm_groupinsert_dependencies_for_varsis_gradient_clip_opis_optimize_op   )ShardingPass)PassBaseregister_passc                 C   sx   g }t | jD ]2}t|s |S d|jv r9d|jv r9|dd }|dd }| |}| |}|||f q|S )NParamZGradr   )reversedopsr   Zinput_namesinputvarappend)blockparams_gradsop
param_nameZ	grad_nameparamZgrad r!   r/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/passes/auto_parallel_grad_clip.py_get_params_grads.   s   

r#   c                 C   s   d}| d |j  |j g}|d dkrd}|dd }tdd |d}tdd | d}||kr4|}||fS || dks<J || }||g }||fS )a  
    Get dpmp topology from origin_topology

    Example:
        the parallel strategy: dp4-mp2-sharding2
        the complete process_mesh:
            topology: [4, 2]
            processes: [0, 1, 2, 3, 4, 5, 6, 7]
        the dpmp topology: [2, 2]
        the sharding axis: 1
    r   r   Nc                 S      | | S Nr!   xyr!   r!   r"   <lambda>Q       z$_get_dpmp_topology.<locals>.<lambda>c                 S   r$   r%   r!   r&   r!   r!   r"   r)   R   r*   )nranksr   )Zorigin_topologysharding_groupsharding_axisZdp_sharding_topologyZproduct_dp_shardingZproduct_topologydpmp_topologyZ	mp_degreer!   r!   r"   _get_dpmp_topology<   s    
r/   c                 C   s   |du r||fS t ||\}}g }|D ]}t||||}||vr%|| qt|}d}	t|jd D ]}
| |dd|
f v rH|dd|
f }	q4|	dusOJ |t|	fS )a  
    Get dpmp process_mesh from the complete process_mesh which apply sharding.

    Example:
        the parallel strategy: dp4-mp2-sharding2
        the complete process_mesh:
            topology: [4, 2]
            processes: [0, 1, 2, 3, 4, 5, 6, 7]
        the dpmp process_mesh is:
            1) topology: [2, 2], processes: [0, 1, 4, 5]
            2) topology: [2, 2], processes: [2, 3, 6, 7]
    N)r/   r   r   nparrayrangeshapelist)rank_idtopology	processesr,   r.   r-   Zsharding_groupsrankgroupZdpmp_processes_in_shardingir!   r!   r"   _get_dpmp_process_mesh^   s"   

r<   c                 C   sh   t | |||\}}t|||}g }	g }
|D ]}t|||||}||	vr/|	| |
| q| |
v S r%   )r<   r   Zcompute_complete_shapeZcompute_partition_indexr   )r6   Ztensor_shaper7   r8   dims_mappingr,   r.   Zdpmp_processesZcomplete_shapeZcomplete_partitionsZcomplete_param_ranksprocessZpartition_indexr!   r!   r"   _is_about_global_norm   s"   


r?   c                   @   sL   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dS )
ClipHelperc                 C   s   t | \}}t|| _dd | jD | _|| _|| _|| _|| _d | _t	 j
| _t|dr1|j| _t| j| _|  | _| || _d S )Nc                 S      g | ]}|j qS r!   name.0pr!   r!   r"   
<listcomp>       z'ClipHelper.__init__.<locals>.<listcomp>_sharding_group)zipr5   paramsparams_namer6   r   dist_contextpass_contextr,   r   Zranksworld_rankshasattrrI   lenworld_nranks_is_pure_data_parallelpure_data_parallel_partition_parametersrank_to_params)selfr   r6   r   rM   rN   rK   _r!   r!   r"   __init__   s   



zClipHelper.__init__c                 C   sp   |  |sdS | j| j| }| js/| |}|jj}|jj}|j	}t
| j|j|||| jS |j| j| j v S )zZ
        whether the param_name@GRAD paticipate in the calculation of global_norm
        F)is_local_paramrK   rL   indexrT   _get_dist_attrprocess_meshr4   process_idsr=   r?   r6   r,   rC   rV   )rW   rC   r    	dist_attrr7   r8   r=   r!   r!   r"   is_calcuate_norm   s"   

	zClipHelper.is_calcuate_normc                 C   s   || j vrdS dS )zH
        whether the param_name is updated with opt in cur_rank
        FT)rL   )rW   rC   r!   r!   r"   rZ      s   
zClipHelper.is_local_paramc                 C   s   | j j| }| j|S r%   )r   varsrM   Z get_tensor_dist_attr_for_program)rW   rC   r   r!   r!   r"   r\      s   zClipHelper._get_dist_attrc                 C   s$   |  |}|dusJ | j|jjv S )z<
        whether the var_name is belong to cur_rank
        N)r\   r6   r]   r^   )rW   rC   r_   r!   r!   r"   is_local_var_with_dist_attr   s   
z&ClipHelper.is_local_var_with_dist_attrc           	      C   s   t  }t| j|_|jD ]'}| jj| }t }t| j|_dd |jD |_	| j
|| ||| q|jD ]'}| jj| }t }t| j|_dd |jD |_	| j
|| ||| q7| j
|| d S )Nc                 S      g | ]}d qS r0   r!   rE   r;   r!   r!   r"   rG          z.ClipHelper._init_dist_attr.<locals>.<listcomp>c                 S   rc   rd   r!   re   r!   r!   r"   rG      rf   )r   r   rO   r]   input_arg_namesr   ra   r   r4   r=   rM   Z set_tensor_dist_attr_for_programZset_input_dist_attroutput_arg_namesZset_output_dist_attrZset_op_dist_attr_for_program)	rW   r   Zop_dist_attrZin_nameZin_varZin_dist_attrZout_nameZout_varZout_dist_attrr!   r!   r"   _init_dist_attr   s*   

zClipHelper._init_dist_attrc                 C   sz   | j jD ]
}t|tr dS qt }|D ]}|j| jkr dS q| jjD ]}|j	dv r2t
|s2 dS |j	dv r: dS q$dS )NF)Zc_reduce_sumc_allreduce_sum)Zsend_v2Zrecv_v2T)rN   Zpasses
isinstancer   r   r+   rR   r   r   typer
   )rW   Zapplied_passgroupsgr   r!   r!   r"   rS      s$   


z!ClipHelper._is_pure_data_parallelc                 C   s   i }| j st| jD ]}dd |D ||< q
|S t| jD ]}g ||< qdg| j }|D ]2}|t|}|| |j tdd |jd}|dksVJ d|j d| d	||  |7  < q,|S )
z
        build rank_id_to_params by the param's numel
        to guarantee params in every rank of dp_group as even as possible.
        c                 S   rA   r!   rB   rD   r!   r!   r"   rG     rH   z4ClipHelper._partition_parameters.<locals>.<listcomp>r   c                 S   r$   r%   r!   r&   r!   r!   r"   r)     r*   z2ClipHelper._partition_parameters.<locals>.<lambda>r   zparam [z#] should larger than 0, but it is [])	rT   r3   rR   r[   minr   rC   r   r4   )rW   rK   mappingZrank_sizesr    r9   Znumelr!   r!   r"   rU     s"   

z ClipHelper._partition_parametersN)__name__
__module____qualname__rY   r`   rZ   r\   rb   ri   rS   rU   r!   r!   r!   r"   r@      s    r@   Zauto_parallel_grad_clipc                       s@   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Z  Z	S )ClipGradByGloblNormPassz
    1. Remove norm-compute op and grad-scale op when the grad is not in current rank
       or is independent of the calculation of norm.
    2. Each rank computes its own norm value, then gets global_norm by allreduce_sum only once.
    c                    s2   t    | dd  | dd  | dd  d S )Nr6   rM   r   )superrY   Zset_attr)rW   	__class__r!   r"   rY   (  s   
z ClipGradByGloblNormPass.__init__c                 C   sB   |  dd u r	dS |  d}|jjd u rdS |  dd u rdS dS )NrM   Fr   T)get_attrZ_serial_optimizerZ
_grad_clip)rW   rM   r!   r!   r"   _check_self.  s   
z#ClipGradByGloblNormPass._check_selfc                 C   s   dS )NTr!   )rW   Z
other_passr!   r!   r"   _check_conflict8  s   z'ClipGradByGloblNormPass._check_conflictc                 C   sL   |  dd }|  dd }| }|  dd }t|||||| _| | d S )NrM   r6   r   )rz   Zglobal_blockr@   clip_helper_remove_no_need_ops_vars)rW   Zmain_programZstartup_programcontextrM   r6   r   Zdist_params_gradsr!   r!   r"   _apply_single_impl;  s   
z*ClipGradByGloblNormPass._apply_single_implc                 C   s  g d}t  }t  }t|jD ]\}}t|sq|jdkrK|dd }|ddkrJ|d |d }| j|}	|	sJ|	| |
t |j q|j|v r|dd }|ddkr|d |d }| j|}	| j|}
|	rw|
s|	| |
t |j q|d |v r|	| |
t |j q|jdkr|dd }|ddkr|d |d }| j|}	|	s|	| |j|d  jd	kr|	|d  |
t |j|d  j q|jd
kr6g }|jD ]}||vr| j|r|| q|s.|	| |
t |j |j|d  jd	kr-|	|d  |
t |j|d  j q|jd| q|jdkrg }|jD ]}||vrT| j|rT|| qA|s|	| |
t |j |j|d  jdkr|	|d  |
t |j|d  j |j|d  jd	kr|	|d  |
t |j|d  j q|jd| qttt|jD ]\}}t|s nt|sȐq||v r|j|dd qttt|jD ]\}}t|s nt|sq|jdkr|dd }|j| }d}tj dkrd}||v rE|| |j|di d|giddgd|jddddttjid}| dd |d7 }| j!| d}|j|| dd|gid|giddddttjid}| ddt"j#  | j!| |r|d }d }|dkr|j| j}|dv s|$d r|j| }n	|d8 }|dks{|d usJ d!|j|jd  }|d usJ d"t%||||| jj&tjdgddd#d$
 q|D ]
}|j'|dd q|(  d S )%N)Zsquared_l2_normZsquare
reduce_sumZclip_by_normXr   z@GRADr0   r   Zelementwise_mulcastsumstackr   r   F)syncsqrtZfill_constantZOutr4   dtypevalueZ	force_cpu)rl   ZinputsZoutputsattrsop_namescopez/gradient_clip_passTrj   Zring_idZuse_calc_stream/)Zupdate_loss_scalingZcheck_finite_and_unscaleZ_gradz<Unexpected: ClipByGlobalNorm could not find priory depend opz=Unexpected: ClipByGlobalNorm could not find priory depend varZgrad_clip_fill_constant_dep)r]   Zis_recomputer   r   ))set	enumerater   r   rl   r   findr}   rZ   addupdaterh   r`   rg   rb   r   descZ	set_inputr   r5   r   Z
_remove_opra   paddledistributedZget_world_sizeremoveZ
_insert_opr   r   r   ZOptimizeZ	_set_attrri   r	   ZGlobalNormSyncendswithr   rM   Z_remove_varZ_sync_with_cpp)rW   r   Zremoved_op_out_typeZremoved_op_idxZremoved_tmp_varidxr   Z
input_namer   is_localZis_calculateZreserved_varsZ	input_varZinsert_leaf_fill_constant_nodeoffsetZfill_constant_opZallreduce_opjZprior_opZop_typeZ	prior_varvarnamer!   r!   r"   r~   G  sb  



























z0ClipGradByGloblNormPass._remove_no_need_ops_vars)
rs   rt   ru   __doc__rY   r{   r|   r   r~   __classcell__r!   r!   rx   r"   rv      s    
rv   )%	functoolsr   numpyr1   r   Z/paddle.distributed.fleet.meta_optimizers.commonr   r   Zauto_parallel.process_meshr   Z#auto_parallel.static.dist_attributer   r   Z%auto_parallel.static.operators.commonr	   r
   Z"auto_parallel.static.process_groupr   r   Zauto_parallel.static.reshardr   Zauto_parallel.static.utilsr   r   r   r   Zauto_parallel_shardingr   Z	pass_baser   r   r#   r/   r<   r?   r@   rv   r!   r!   r!   r"   <module>   s(   "% 