import paddle
import paddle.distributed as dist
from paddle.autograd import no_grad
from paddle.framework import core
from paddle.nn import clip
from paddle.nn.clip import ClipGradBase, _squared_l2_norm


class ClipGradForMOEByGlobalNorm(ClipGradBase):
    r"""
    The algorithm is the same as ``paddle.nn.ClipGradByGlobalNorm``.
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .

    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.

    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.

    The list of Tensor :math:`t\_list` is not passed to this class directly; it consists of the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of a specific param is ``False`` in its ``ParamAttr``, the gradients of this param will not be clipped.

    Gradient clipping takes effect after being set in ``optimizer``; see the documentation of ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::

        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
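
    For intuition, an illustrative example (not part of the original docstring):
    if two gradients have L2 norms 3.0 and 4.0 and ``clip_norm = 1.0``, then
    :math:`global\_norm = \sqrt{3^2 + 4^2} = 5`, so every gradient element is
    multiplied by :math:`1/5 = 0.2`.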

    Note:
        ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Reference:
        https://github.com/laekov/fastmoe/blob/master/examples/megatron/clip-grad-v2.2.patch
        Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4


    Args:
        clip_norm (float): The maximum norm value.
        is_expert_param_func (function, optional): A function deciding whether a param is an expert param (and hence belongs to ``moe_params_grads``). Must be given when ``moe_group`` has more than one rank. Default is ``None``.
        moe_group (Group, optional): The communication group of the MoE experts. Default is ``None``.
        group_name (str, optional): The group name for this clip. Default value is ``default_moe_group``.

    Examples:

        .. code-block:: python

            >>> import paddle

            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> # paddle.nn does not expose ClipGradForMOEByGlobalNorm, so ClipGradByGlobalNorm is used here.
            >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            >>> sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sgd.step()
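
        For the MoE case, a minimal sketch (assuming a multi-rank run that has
        already initialized a process group for the experts; the ``'expert_'``
        name filter below is a hypothetical naming convention, not part of the
        original example):

        .. code-block:: python

            >>> # requires a multi-rank launch, e.g. via paddle.distributed.launch
            >>> import paddle.distributed as dist
            >>> dist.init_parallel_env()
            >>> moe_group = dist.new_group(ranks=list(range(dist.get_world_size())))
            >>> is_expert = lambda p: 'expert_' in p.name  # hypothetical convention
            >>> from paddle.incubate.distributed.models.moe.grad_clip import (
            ...     ClipGradForMOEByGlobalNorm,
            ... )
            >>> moe_clip = ClipGradForMOEByGlobalNorm(
            ...     clip_norm=1.0,
            ...     is_expert_param_func=is_expert,
            ...     moe_group=moe_group,
            ... )
            >>> opt = paddle.optimizer.SGD(
            ...     learning_rate=0.1, parameters=linear.parameters(), grad_clip=moe_clip
            ... )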
    Ndefault_moe_groupc                    sL   t    t|| _|| _|| _|d ur!|jdkr!|d us!J d|| _d S )N   zRWhen moe group size > 1, a function for selecting expert params must be specified.)super__init__float	clip_norm
group_name	moe_groupnranksis_expert_param_func)selfr   r   r   r   	__class__ q/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/incubate/distributed/models/moe/grad_clip.pyr   U   s   



z#ClipGradForMOEByGlobalNorm.__init__c                 C   s
   d| j  S )Nz+Gradient Clip By GlobalNorm, global_norm=%f)r   )r   r   r   r   __str__f   s   
z"ClipGradForMOEByGlobalNorm.__str__c                 C   s  g }g }g }| D ]K\}}|d u rqt |dddu rq|}|jtjjjkr.t|}t|}t	|}|j
tjjjkr@|| q|j
tjjjkrN|| q|| qt|t| t| dkrddS |dv slJ d|dkrzt|dkrxdnd	}g }	t|dkrt|}
|	|
| t|dkrt|}|d	kr|	| n|	|| t|dkrt|}|	| t|	}	|	|fS )
N	need_clipTFr   )NN)float64float32Nz*sum's type must be float64/ float32 / Noner   r   )getattrtyper   VarDescVarTypeZSELECTED_ROWSr   Zmerge_selected_rowsZget_tensor_from_selected_rowsr   dtypeFP16appendZFP32lenpaddleZadd_nastype)params_grads	sum_dtypeZsum_square_listZsum_square_list_fp16Zsum_square_list_fp32pgZ
merge_gradZ
sum_squareglobal_norm_varZglobal_norm_var_fp16Zglobal_norm_var_fp32Zglobal_norm_var_fp64r   r   r   get_l2_norm_powi   sZ   







z*ClipGradForMOEByGlobalNorm.get_l2_norm_powc                 C   s  g }g }| j d ur+| j jdkr+|D ]\}}| |r"|||f q|||f qn|}| |\}}d }t|dkrS| ||\}}	|d urStj|tjj	| j d |d u r]|d u r]|S |d u rd|}
n|d u rk|}
n|j
|j
krw||j
}|| }
g }t|
}
tjdg|
j
| jd}tj|tj|
|dd}|D ]6\}}|d u rqt|dddu r|||f q|j
tjjjkr|d	n|}tj||d}|||f q|S )
Nr	   r   )opgroup)shaper   Z
fill_value)xyr   TFZfloat16)r   r   r   r!   r*   r"   distZ
all_reduceZReduceOpZSUMr   r$   r#   sqrtfullr   dividemaximumr   r   r   r   r    multiply)r   r%   Znormal_params_gradsZmoe_params_gradsr'   r(   Zglobal_norm_var_normalr&   Zglobal_norm_var_moe_r)   Zparams_and_gradsZmax_global_normZclip_varZ
clip_inputZnew_gradr   r   r   _dygraph_clip   sp   


z(ClipGradForMOEByGlobalNorm._dygraph_clip)NNr   )N)__name__
__module____qualname____doc__r   r   staticmethodr*   r   r7   __classcell__r   r   r   r   r      s    @5r   )r#   Zpaddle.distributeddistributedr0   Zpaddle.autogradr   Zpaddle.frameworkr   Z	paddle.nnr   Zpaddle.nn.clipr   r   r   ZClipGradByGlobalNormr   r   r   r   <module>   s    S