import copy
import warnings
from sqlite3 import NotSupportedError

import paddle
import paddle.autograd as imperative_base
from paddle import _C_ops
from paddle.base import core, framework, unique_name
from paddle.base.data_feeder import check_variable_and_dtype
from paddle.base.libpaddle import DataType
from paddle.common_ops_import import (
    Variable,
    check_type,
    default_main_program,
)
from paddle.framework import (
    LayerHelper,
    in_dynamic_mode,
    in_dynamic_or_pir_mode,
    in_pir_mode,
)
from paddle.tensor.layer_function_generator import templatedoc

__all__ = []


@templatedoc()
def clip_by_norm(x, max_norm, name=None):
    """
    ${comment}

    Args:
        x(${x_type}): ${x_comment}
        max_norm(${max_norm_type}): ${max_norm_comment}
        name(str, optional): For detailed information, please refer
            to :ref:`api_guide_Name`. Usually name does not need to be set
            and is None by default.

    Returns:
        Tensor:

        out(${out_type}): ${out_comment}


    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.nn import clip

            >>> input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
            >>> reward = clip.clip_by_norm(x=input, max_norm=1.0)
            >>> print(reward)
            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[0.50000000, 0.50000000],
             [0.50000000, 0.50000000]])
    """
    if in_dynamic_or_pir_mode():
        return _C_ops.clip_by_norm(x, max_norm)

    helper = LayerHelper("clip_by_norm", **locals())
    check_variable_and_dtype(
        x, 'X', ['float16', 'float32', 'uint16'], 'clip_by_norm'
    )
    check_type(max_norm, 'max_norm', (float,), 'clip_by_norm')

    if name is None:
        name = unique_name.generate_with_ignorable_key(
            '.'.join([helper.name, 'tmp'])
        )

    out = helper.create_variable(
        type=x.type, name=name, dtype=x.dtype, persistable=False
    )
    helper.append_op(
        type="clip_by_norm",
        inputs={"X": x},
        attrs={"max_norm": max_norm},
        outputs={"Out": out},
    )
    return out


@templatedoc()
def merge_selected_rows(x, name=None):
    """
    ${comment}

    Args:
        x(${x_type}): ${x_comment}
        name(str, optional): Name of the output. Default is None.

    Returns:
        out(${out_type}): ${out_comment}

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> import paddle.base as base

            >>> b = paddle.static.default_main_program().global_block()
            >>> var = b.create_var(
            ...     name="X", dtype="float32", persistable=True,
            ...     type=base.core.VarDesc.VarType.SELECTED_ROWS)
            >>> y = paddle.nn.clip.merge_selected_rows(var)
    """
    if in_dynamic_mode():
        return _C_ops.merge_selected_rows(x)

    helper = LayerHelper("merge_selected_rows", **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(
        type="merge_selected_rows",
        inputs={"X": x},
        attrs={},
        outputs={"Out": out},
    )
    return out


def get_tensor_from_selected_rows(x, name=None):
    """
    Get tensor data from an input with SelectedRows type, and output a Tensor.

    .. code-block:: text

        input x is SelectedRows:
           x.rows = [0, 5, 5, 4, 19]
           x.height = 20
           x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]

        Output is LoDTensor:
           out.shape = [5, 2]
           out.data = [[1, 1],
                       [2, 2],
                       [2, 2],
                       [3, 3],
                       [6, 6]]

    Args:
        x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
        name(str, optional): The default value is None. Normally there is no need for the user to set this property.
            For more information, please refer to :ref:`api_guide_Name` .

    Returns:
        Variable: LoDTensor transformed from SelectedRows. The data type is the same as the input.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import paddle.base as base
            >>> from paddle.base import core
            >>> paddle.enable_static()
            >>> scope = core.Scope()
            >>> block = paddle.static.default_main_program().global_block()
            >>> x_rows = [0, 5, 5, 4, 19]
            >>> height = 20
            >>> x = scope.var('X').get_selected_rows()
            >>> x.set_rows(x_rows)
            >>> x.set_height(height)
            >>> x = block.create_var(name="X", dtype="float32", persistable=True, type=base.core.VarDesc.VarType.SELECTED_ROWS)
            >>> z = paddle.nn.clip.get_tensor_from_selected_rows(x)
    """
    if in_dynamic_mode():
        return _C_ops.get_tensor_from_selected_rows(x)

    check_type(x, 'x', Variable, 'get_tensor_from_selected_rows')
    if x.type != core.VarDesc.VarType.SELECTED_ROWS:
        raise TypeError(
            "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS."
        )
    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
    out = helper.create_variable_for_type_inference(x.dtype)
    helper.append_op(
        type='get_tensor_from_selected_rows',
        inputs={'X': x},
        outputs={'Out': out},
        attrs={},
    )
    return out


# Module-level switch: when enabled, FP16/BF16 tensors are cast to FP32
# before their squared L2 norm is computed in global-norm clipping.
_clip_by_global_norm_using_mp_type_flag = False


def _clip_by_global_norm_using_mp_type(*args):
    global _clip_by_global_norm_using_mp_type_flag
    assert len(args) <= 1
    if len(args) == 1:
        assert isinstance(args[0], bool)
        old_value = _clip_by_global_norm_using_mp_type_flag
        _clip_by_global_norm_using_mp_type_flag = args[0]
        return old_value
    else:
        return _clip_by_global_norm_using_mp_type_flag


def _cast_to_mp_type_if_enabled(x):
    if (
        x.dtype == core.VarDesc.VarType.FP16
        or x.dtype == core.VarDesc.VarType.BF16
    ) and _clip_by_global_norm_using_mp_type():
        return x.astype(core.VarDesc.VarType.FP32)
    elif (
        x.dtype == DataType.FLOAT16 or x.dtype == DataType.BFLOAT16
    ) and _clip_by_global_norm_using_mp_type():
        return x.astype(DataType.FLOAT32)
    else:
        return x


def _squared_l2_norm(x):
    r"""
    Return the squared L2 norm of a tensor.
    """
    x = _cast_to_mp_type_if_enabled(x)
    if in_dynamic_or_pir_mode():
        return _C_ops.squared_l2_norm(x)

    op_type = 'squared_l2_norm'
    check_variable_and_dtype(
        x, 'x', ['float32', 'float64', 'float16', 'uint16'], op_type
    )
    helper = LayerHelper(op_type, **locals())
    out = helper.create_variable_for_type_inference(x.dtype)
    inputs = {"X": x}
    outputs = {'Out': out}
    helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
    return out


class BaseErrorClipAttr:
    def __str__(self):
        raise NotImplementedError

    def _append_clip_op(self, block, grad_name):
        raise NotImplementedError


class ErrorClipByValue(BaseErrorClipAttr):
    """
    Clip tensor values to the range [min, max].

    Given a tensor ``t`` (see Examples below), this operation clips its value \
    to ``min`` and ``max`` in place.

    - Any values less than min are set to min.
    - Any values greater than max are set to max.
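
    For example, with ``min = -1.0`` and ``max = 1.0``, a value of 2.5 in
    ``t`` becomes 1.0 and a value of -3.0 becomes -1.0.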

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by user, \
        it will be set to ``-max`` by the framework.

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> paddle.enable_static()
            >>> BATCH_SIZE = 128
            >>> CLIP_MAX = 2e-6
            >>> CLIP_MIN = -1e-6
            >>> prog = paddle.static.Program()
            >>> with paddle.static.program_guard(main_program=prog):
            ...     image = paddle.static.data(name='x', shape=[None, 784], dtype='float32')
            ...     hidden1 = paddle.static.nn.fc(image, size=128, activation='relu')
            ...     hidden2 = paddle.static.nn.fc(hidden1, size=64, activation='relu')
            ...     predict = paddle.static.nn.fc(hidden2, size=10, activation='softmax')
            ...     label = paddle.static.data(name='y', shape=[1], dtype='int64')
            ...     cost = paddle.nn.functional.cross_entropy(input=predict, label=label)
            ...     avg_cost = paddle.mean(cost)
            >>> prog_clip = prog.clone()
            >>> prog_clip.block(0).var(hidden1.name)._set_error_clip(
            ...     paddle.nn.clip.ErrorClipByValue(
            ...         max=CLIP_MAX, min=CLIP_MIN))
    """

    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def __str__(self):
        return f"ByValue, min={self.min:f}, max={self.max:f}"

    def _append_clip_op(self, block, grad_name):
        clip_op_desc = block.desc.append_op()
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
        clip_op_desc._set_attr("min", self.min)
        clip_op_desc._set_attr("max", self.max)


def error_clip_callback(block, context):
    # The context is a map from gradient names to their forward variables;
    # clip the outputs of the gradient op that was just appended.
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
        fwd_var = block._var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if not (
            error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
        ):
            raise TypeError(
                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
            )
        if error_clip is not None:
            error_clip._append_clip_op(block, grad_n)


class ClipGradBase:
    def __init__(self):
        super().__init__()

    def __str__(self):
        raise NotImplementedError

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        raise NotImplementedError

    def _pir_clip(self, params_grads):
        raise NotImplementedError

    def _static_clip(self, params_grads):
        raise NotImplementedError

    def __call__(self, params_grads):
        if in_dynamic_mode():
            return self._dygraph_clip(params_grads)
        elif in_pir_mode():
            return self._pir_clip(params_grads)
        else:
            for p, g in params_grads:
                if getattr(p, 'gradient_clip_attr', None) is not None:
                    warnings.warn(
                        "'set_gradient_clip' will be ineffective, because you "
                        "have set 'need_clip' in 'ParamAttr'. So, "
                        "'set_gradient_clip' is redundant and you can remove it."
                    )
                    break
            return self._static_clip(params_grads)

    def _process_context(self, context, param, grad):
        raise NotImplementedError

    def _create_operators(self, param, grad):
        raise NotImplementedError


class ClipGradByValue(ClipGradBase):
    """
    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].

    - Any values less than min are set to ``min``.

    - Any values greater than max are set to ``max``.
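
    For example, with ``min = -1.0`` and ``max = 1.0``, a gradient value of
    2.5 is clipped to 1.0 and a value of -3.0 is clipped to -1.0.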

    The multi-dimensional Tensor :math:`X` is not passed in through this class; what is clipped are the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of a specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clipping takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    Note:
        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by user, it will be set to ``-max``
            automatically. In this case, ``max`` must be greater than :math:`0`.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> clip = paddle.nn.ClipGradByValue(min=-1, max=1)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    """

    def __init__(self, max, min=None):
        super().__init__()
        if min is None:
            assert max > 0.0
            min = -max
        self.max = float(max)
        self.min = float(min)

    def __str__(self):
        return f"Clip Gradient By Value, min = {self.min:f}, max={self.max:f}"

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = paddle.clip(x=g, min=self.min, max=self.max)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        param_new_grad_name_dict = {}
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue
                with p.block.program._optimized_guard([p, g]):
                    new_grad = paddle.clip(x=g, min=self.min, max=self.max)
                params_and_grads.append((p, new_grad))
                param_new_grad_name_dict[p.name] = new_grad.name
        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = paddle.clip(x=grad, min=self.min, max=self.max)
        return param, new_grad


class ClipGradByNorm(ClipGradBase):
    r"""
    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .

    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.

    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.

    The multi-dimensional Tensor :math:`X` is not passed in through this class; what is clipped are the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of a specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clipping takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::
        Out =
        \left\{
            \begin{array}{ccl}
                X & & if (norm(X) \leq clip\_norm) \\
                \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
            \end{array}
        \right.


    where :math:`norm(X)` represents the L2 norm of :math:`X`.

    .. math::
        norm(X) = \left( \sum_{i=1}^{n} |x\_i|^2 \right)^{\frac{1}{2}}
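
    For example, if :math:`X = (3, 4)`, then :math:`norm(X) = 5`; with
    ``clip_norm = 1.0``, the clipped output is :math:`X / 5`.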

    Note:
        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm(float): The maximum norm value.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    """

    def __init__(self, clip_norm):
        super().__init__()
        self.clip_norm = float(clip_norm)

    def __str__(self):
        return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        with framework.name_scope('gradient_clip'):
            param_new_grad_name_dict = {}
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue
                with p.block.program._optimized_guard([p, g]):
                    new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
                param_new_grad_name_dict[p.name] = new_grad.name
                params_and_grads.append((p, new_grad))
        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = clip_by_norm(x=grad, max_norm=self.clip_norm)
        return param, new_grad


_allow_pure_fp16_global_norm_clip_flag = False


def _allow_pure_fp16_global_norm_clip(*args):
    global _allow_pure_fp16_global_norm_clip_flag
    if len(args) == 0:
        return _allow_pure_fp16_global_norm_clip_flag
    else:
        assert len(args) == 1 and isinstance(args[0], bool)
        old_value = _allow_pure_fp16_global_norm_clip_flag
        _allow_pure_fp16_global_norm_clip_flag = args[0]
        return old_value


_allow_pure_bf16_global_norm_clip_flag = False


def _allow_pure_bf16_global_norm_clip(*args):
    global _allow_pure_bf16_global_norm_clip_flag
    if len(args) == 0:
        return _allow_pure_bf16_global_norm_clip_flag
    else:
        assert len(args) == 1 and isinstance(args[0], bool)
        old_value = _allow_pure_bf16_global_norm_clip_flag
        _allow_pure_bf16_global_norm_clip_flag = args[0]
        return old_value


class ClipGradByGlobalNorm(ClipGradBase):
    r"""
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .

    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.

    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.

    The list of Tensor :math:`t\_list` is not passed in through this class; what is clipped are the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of a specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clipping takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::

        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
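
    For example, for two gradient tensors with norms 3 and 4, the global norm
    is :math:`\sqrt{3^2 + 4^2} = 5`; with ``clip_norm = 1.0``, every tensor in
    :math:`t\_list` is multiplied by :math:`1/5`.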

    Note:
        ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm (float): The maximum norm value.
        group_name (str, optional): The group name for this clip. Default value is ``default_group``.
        auto_skip_clip (bool, optional): Whether to skip clipping when the global
            norm is not greater than ``clip_norm``. Default value is ``False``.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    """

    def __init__(
        self, clip_norm, group_name="default_group", auto_skip_clip=False
    ):
        super().__init__()
        self.clip_norm = float(clip_norm)
        self.group_name = group_name
        assert isinstance(auto_skip_clip, bool)
        self.auto_skip_clip = auto_skip_clip
        self._async_add_n = None

    def __str__(self):
        return "Gradient Clip By GlobalNorm, global_norm=%f" % self.clip_norm

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_fp32 = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if in_dynamic_mode() and g.is_selected_rows():
                merge_grad = merge_selected_rows(g)
                merge_grad = merge_grad._get_tensor_from_selected_rows()
            elif g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = merge_selected_rows(g)
                merge_grad = get_tensor_from_selected_rows(merge_grad)
            sum_square = _squared_l2_norm(merge_grad)
            if (
                sum_square.dtype == core.VarDesc.VarType.FP16
                or sum_square.dtype == core.VarDesc.VarType.BF16
            ):
                sum_square_list_fp16.append(sum_square)
            elif sum_square.dtype == core.VarDesc.VarType.FP32:
                sum_square_list_fp32.append(sum_square)
            else:
                sum_square_list.append(sum_square)

        # All gradients were skipped: nothing to clip.
        if (
            len(sum_square_list)
            + len(sum_square_list_fp16)
            + len(sum_square_list_fp32)
        ) == 0:
            return params_grads

        def async_add_n(var_list):
            return paddle.stack(var_list).sum()

        sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
        global_norm_var = []
        if len(sum_square_list_fp16) > 0:
            global_norm_var_fp16 = async_add_n(sum_square_list_fp16)
            global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
        if len(sum_square_list_fp32) > 0:
            global_norm_var_fp32 = async_add_n(sum_square_list_fp32)
            if sum_dtype == 'float32':
                global_norm_var.append(global_norm_var_fp32)
            else:
                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
        if len(sum_square_list) > 0:
            global_norm_var_fp64 = async_add_n(sum_square_list)
            global_norm_var.append(global_norm_var_fp64)

        global_norm_var = async_add_n(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)
        max_global_norm = paddle.full(
            shape=[], dtype=global_norm_var.dtype, fill_value=self.clip_norm
        )

        need_clip = False
        if not self.auto_skip_clip:  # always apply the scale factor
            need_clip = True
            clip_var = paddle.divide(
                x=max_global_norm,
                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
            )
        elif global_norm_var > max_global_norm:
            # only rescale when the global norm exceeds the threshold
            need_clip = True
            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)

        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            if need_clip:
                clip_input = (
                    clip_var.astype(g.dtype)
                    if clip_var.dtype != g.dtype
                    else clip_var
                )
                new_grad = paddle.multiply(g, clip_input)
                params_and_grads.append((p, new_grad))
            else:
                params_and_grads.append((p, g))
        return params_and_grads

    def _pir_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_fp32 = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if in_pir_mode() and g.is_selected_row_type():
                merge_grad = merge_selected_rows(g)
                merge_grad = get_tensor_from_selected_rows(merge_grad)
            sum_square = _squared_l2_norm(merge_grad)
            if (
                sum_square.dtype == DataType.FLOAT16
                or sum_square.dtype == DataType.BFLOAT16
            ):
                sum_square_list_fp16.append(sum_square)
            elif sum_square.dtype == DataType.FLOAT32:
                sum_square_list_fp32.append(sum_square)
            else:
                sum_square_list.append(sum_square)

        if (
            len(sum_square_list)
            + len(sum_square_list_fp16)
            + len(sum_square_list_fp32)
        ) == 0:
            return params_grads

        def async_add_n(var_list):
            return paddle.stack(var_list).sum()

        sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
        global_norm_var = []
        if len(sum_square_list_fp16) > 0:
            global_norm_var_fp16 = async_add_n(sum_square_list_fp16)
            global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
        if len(sum_square_list_fp32) > 0:
            global_norm_var_fp32 = async_add_n(sum_square_list_fp32)
            if sum_dtype == 'float32':
                global_norm_var.append(global_norm_var_fp32)
            else:
                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
        if len(sum_square_list) > 0:
            global_norm_var_fp64 = async_add_n(sum_square_list)
            global_norm_var.append(global_norm_var_fp64)

        global_norm_var = async_add_n(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)
        max_global_norm = paddle.full(
            shape=[], dtype=global_norm_var.dtype, fill_value=self.clip_norm
        )

        need_clip = False
        if not self.auto_skip_clip:
            need_clip = True
            clip_var = paddle.divide(
                x=max_global_norm,
                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
            )
        elif global_norm_var > max_global_norm:
            need_clip = True
            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)

        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            if need_clip:
                clip_input = (
                    clip_var.astype(g.dtype)
                    if clip_var.dtype != g.dtype
                    else clip_var
                )
                new_grad = paddle.multiply(g, clip_input)
                params_and_grads.append((p, new_grad))
            else:
                params_and_grads.append((p, g))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_bf16 = []
        sum_square_list_fp32 = []

        def _add_n(var_list):
            if self._async_add_n:
                return paddle.stack(var_list).sum()
            else:
                return paddle.add_n(var_list)

        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    continue
                merge_grad = g
                with p.block.program._optimized_guard([p, g]):
                    if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                        merge_grad = merge_selected_rows(g)
                        merge_grad = get_tensor_from_selected_rows(merge_grad)
                    sum_square = _squared_l2_norm(merge_grad)
                    if sum_square.dtype == core.VarDesc.VarType.FP16:
                        sum_square_list_fp16.append(sum_square)
                    elif sum_square.dtype == core.VarDesc.VarType.BF16:
                        sum_square_list_bf16.append(sum_square)
                    elif sum_square.dtype == core.VarDesc.VarType.FP32:
                        sum_square_list_fp32.append(sum_square)
                    else:
                        sum_square_list.append(sum_square)

            if len(sum_square_list_fp16) > 0 and len(sum_square_list_bf16) > 0:
                raise NotSupportedError(
                    'FP16 and BF16 are not supported at the same time.'
                )

            # All gradients were skipped: nothing to clip.
            if (
                len(sum_square_list)
                + len(sum_square_list_fp16)
                + len(sum_square_list_bf16)
                + len(sum_square_list_fp32)
            ) == 0:
                return params_grads

            with p.block.program._optimized_guard([p, g]):
                sum_dtype = (
                    'float64' if len(sum_square_list) > 0 else "float32"
                )
                global_norm_var = []
                if len(sum_square_list_fp16) > 0:
                    global_norm_var_fp16 = _add_n(sum_square_list_fp16)
                    if (
                        sum_square_list_fp32
                        or sum_square_list
                        or not _allow_pure_fp16_global_norm_clip()
                    ):
                        global_norm_var.append(
                            global_norm_var_fp16.astype(sum_dtype)
                        )
                    else:
                        global_norm_var.append(global_norm_var_fp16)
                if len(sum_square_list_bf16) > 0:
                    global_norm_var_bf16 = _add_n(sum_square_list_bf16)
                    if (
                        sum_square_list_fp32
                        or sum_square_list
                        or not _allow_pure_bf16_global_norm_clip()
                    ):
                        global_norm_var.append(
                            global_norm_var_bf16.astype(sum_dtype)
                        )
                    else:
                        global_norm_var.append(global_norm_var_bf16)
                if len(sum_square_list_fp32) > 0:
                    global_norm_var_fp32 = _add_n(sum_square_list_fp32)
                    if sum_dtype == 'float32':
                        global_norm_var.append(global_norm_var_fp32)
                    else:
                        global_norm_var.append(
                            global_norm_var_fp32.astype(sum_dtype)
                        )
                if len(sum_square_list) > 0:
                    global_norm_var_other_dtype = _add_n(sum_square_list)
                    global_norm_var.append(global_norm_var_other_dtype)

                global_norm_var = (
                    _add_n(global_norm_var)
                    if len(global_norm_var) > 1
                    else global_norm_var[0]
                )
                global_norm_var = paddle.sqrt(x=global_norm_var)
                max_global_norm = paddle.full(
                    shape=[1],
                    dtype=global_norm_var.dtype,
                    fill_value=self.clip_norm,
                )
                scale_var = paddle.divide(
                    x=max_global_norm,
                    y=paddle.maximum(x=max_global_norm, y=global_norm_var),
                )
            param_new_grad_name_dict = {}
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_g = _cast_to_mp_type_if_enabled(g)
                    # Cast the scale to the gradient dtype when they differ.
                    if (
                        new_g.dtype == core.VarDesc.VarType.FP16
                        and scale_var.dtype != core.VarDesc.VarType.FP16
                    ):
                        scale_input = scale_var.astype('float16')
                    elif (
                        new_g.dtype == core.VarDesc.VarType.BF16
                        and scale_var.dtype != core.VarDesc.VarType.BF16
                    ):
                        scale_input = scale_var.astype('bfloat16')
                    else:
                        scale_input = scale_var
                    block = default_main_program().current_block()
                    block.append_op(
                        type='elementwise_mul',
                        inputs={'X': new_g, 'Y': scale_input},
                        outputs={'Out': new_g},
                    )
                    if new_g is not g:
                        block.append_op(
                            type='cast',
                            inputs={'X': new_g},
                            outputs={'Out': g},
                            attrs={
                                'in_dtype': new_g.dtype,
                                'out_dtype': g.dtype,
                            },
                        )

                param_new_grad_name_dict[p.name] = g.name
                params_and_grads.append((p, g))

        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        if self.group_name not in context:
            context[self.group_name] = []
            context[self.group_name + "_clip_value"] = self.clip_norm
            context[self.group_name + "_clip"] = paddle.full(
                shape=[1], dtype=grad.dtype, fill_value=self.clip_norm
            )
        else:
            if not self.clip_norm == context[self.group_name + "_clip_value"]:
                raise ValueError(
                    "All parameters' 'clip_norm' of a same group should be the same"
                )

        merge_grad = grad
        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = merge_selected_rows(grad)
            merge_grad = get_tensor_from_selected_rows(merge_grad)

        local_norm_var = _squared_l2_norm(merge_grad)
        context[self.group_name].append(local_norm_var)

        self.context = context

    def _create_operators(self, param, grad):
        def async_add_n(var_list):
            return paddle.stack(var_list).sum()

        group_scale_name = self.group_name + "_scale"
        if group_scale_name not in self.context:
            group_norm_var = async_add_n(self.context[self.group_name])
            group_norm_var = paddle.sqrt(x=group_norm_var)
            clip_var = self.context[self.group_name + "_clip"]
            group_scale_var = paddle.divide(
                x=clip_var,
                y=paddle.maximum(x=clip_var, y=group_norm_var),
            )
            assert group_scale_var.shape == (1,)
            self.context[group_scale_name] = group_scale_var

        if in_dynamic_mode():
            new_grad = paddle.multiply(grad, self.context[group_scale_name])
            return param, new_grad

        # Static graph: rescale the gradient in place.
        param.block.append_op(
            type='elementwise_mul',
            inputs={'X': grad, 'Y': self.context[group_scale_name]},
            outputs={'Out': grad},
        )
        return param, grad


@framework.dygraph_not_support
def set_gradient_clip(clip, param_list=None, program=None):
    """
    Warning:

        This API must be used after building the network, and before ``minimize`` ,
        and it may be removed in future releases, so it is not recommended.
        It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
        which is a better way to clip gradients. There are three clipping strategies:
         :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
         :ref:`api_paddle_nn_ClipGradByValue` .

    To specify parameters that require gradient clip.

    Args:
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default value: None, and there is no
            gradient clipping.
        param_list (list(Variable), optional): Parameters that require gradient clip.
            It can be a list of parameters or a list of parameter names.
            Default None, meaning that all parameters in the program will be included.
        program (Program, optional): The program where the parameters are located.
            Default None, meaning that :ref:`api_paddle_static_default_main_program` is used.

    Returns:
        None

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> paddle.enable_static()

            >>> def network():
            ...     image = paddle.static.data(name='image', shape=[
            ...                        None, 28], dtype='float32')
            ...     param_attr1 = paddle.ParamAttr("fc1_param")
            ...     fc1 = paddle.static.nn.fc(image, size=10, weight_attr=param_attr1)
            ...     param_attr2 = paddle.ParamAttr("fc2_param")
            ...     fc2 = paddle.static.nn.fc(fc1, size=10, weight_attr=param_attr2)
            ...     loss = paddle.mean(fc2)
            ...     return loss


            >>> # network 1: clip all parameter gradient
            >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     paddle.nn.clip.set_gradient_clip(
            ...         paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0))
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
            ...     sgd.minimize(loss)

            >>> # network 2: clip parameter gradient by name
            >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     paddle.nn.clip.set_gradient_clip(
            ...         paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
            ...         param_list=["fc1_param", "fc2_param"])
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
            ...     sgd.minimize(loss)

            >>> # network 3: clip parameter gradient by value
            >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     param_var1 = paddle.static.default_main_program().global_block().var("fc1_param")
            ...     param_var2 = paddle.static.default_main_program().global_block().var("fc2_param")
            ...     paddle.nn.clip.set_gradient_clip(
            ...         paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
            ...         param_list=[param_var1, param_var2])
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
            ...     sgd.minimize(loss)

            >>> # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
            >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     clip1 = paddle.nn.ClipGradByValue(min=-1.0, max=1.0)
            ...     clip2 = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            ...     # Set the gradient clipping strategy: clip1
            ...     paddle.nn.clip.set_gradient_clip(clip1)
            ...     # Set the gradient clipping strategy: clip2
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
            ...     sgd.minimize(loss)
            ...     # 'set_gradient_clip' will not take effect when setting has a conflict,
            ...     # and the gradient clipping strategy will be 'clip2'


    """
    warnings.warn(
        "Caution! 'set_gradient_clip' is not recommended "
        "and may be deprecated in future! "
        "We recommend a new strategy: set 'grad_clip' "
        "when initializing the 'optimizer'. "
        "This method can reduce the mistakes, please "
        "refer to documentation of 'optimizer'."
    )

    if not isinstance(clip, ClipGradBase):
        raise TypeError(
            "'clip' should be an instance of ClipGradBase's derived class"
        )
    if program is None:
        program = default_main_program()

    for op in program.block(0).ops:
        if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
            "op_namescope"
        ):
            warnings.warn(
                "'minimize' has been invoked before, this will make "
                "'set_gradient_clip' be ineffective! Please invoke "
                "'set_gradient_clip' before 'minimize'."
            )
            break

    if param_list is None:
        param_list = program.block(0).all_parameters()
    if all(isinstance(elem, str) for elem in param_list):
        param_list = [program.block(0).var(elem) for elem in param_list]
    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
        raise TypeError(
            "'param_list' should be a list of Parameter or basestring(parameter's name)."
        )

    for param in param_list:
        param.gradient_clip_attr = copy.deepcopy(clip)


def append_gradient_clip_ops(param_grads):
    context = {}
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard([p, g]), framework.name_scope(
            'gradient_clip'
        ):
            clip_attr = getattr(p, 'gradient_clip_attr', None)
            if clip_attr is None:
                return param_grads
            if not isinstance(clip_attr, ClipGradBase):
                raise TypeError(
                    "clip attribute should be an instance of GradientClipBase"
                )
            clip_attr._process_context(context=context, param=p, grad=g)

    res = []
    param_new_grad_name_dict = {}
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard([p, g]), framework.name_scope(
            'gradient_clip'
        ):
            param, new_grad = clip_attr._create_operators(param=p, grad=g)
            param_new_grad_name_dict[param.name] = new_grad.name
            res.append([param, new_grad])
    _correct_clip_op_role_var(res, param_new_grad_name_dict)
    return res


def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
    block_id_list = []
    if len(param_new_grad_name_dict) == 0:
        return
    for param, grad in params_grads:
        if grad is None:
            continue
        block_id = param.block.idx
        if block_id in block_id_list:
            continue
        block_id_list.append(block_id)
        for op in param.block.program.global_block().ops:
            if (
                op.has_attr("op_namescope")
                and "gradient_clip" in op.attr("op_namescope")
                and op.has_attr('op_role_var')
            ):
                param_name = op.attr('op_role_var')[0]
                if param_name in param_new_grad_name_dict:
                    correct_p_g = [
                        param_name,
                        param_new_grad_name_dict[param_name],
                    ]
                    op._set_attr('op_role_var', correct_p_g)


GradientClipBase = ClipGradBase
GradientClipByValue = ClipGradByValue
GradientClipByNorm = ClipGradByNorm
GradientClipByGlobalNorm = ClipGradByGlobalNorm