import warnings
from collections import defaultdict
from enum import Enum

import numpy as np

from paddle import _C_ops, _legacy_C_ops
from paddle.base import core
from paddle.base.data_feeder import check_type
from paddle.base.dygraph import to_variable
from paddle.base.framework import _dygraph_tracer, dygraph_only
from paddle.framework import in_dynamic_mode

from .auto_cast import amp_global_state


class OptimizerState(Enum):
    INIT = 0
    UNSCALED = 1
    STEPPED = 2


def _refresh_optimizer_state():
    return {"state": OptimizerState.INIT}
r   c                   @   s   e Zd ZdZe							d/dd	Zd
d Zdd Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.S )0	AmpScalera	  
    AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative
    mode. It controls the scaling of the loss and helps avoid numerical overflow.
    The object of this class has seventeen methods: `scale()`, `unscale_()`, `minimize()`, and the `get`/`set` APIs of its parameters.

    `scale()` is used to multiply the loss by a scale ratio.
    `unscale_()` is used to unscale the gradients of parameters, multiplying them by 1/(scale ratio).
    `minimize()` is similar to `optimizer.minimize()`: it performs the parameter update and then updates the loss_scaling.

    Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in
    imperative mode.

    Args:
        enable(bool, optional): Enable loss scaling or not. Default is True.
        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
        incr_ratio(float, optional): The multiplier to use when increasing the loss
                        scaling. Default is 2.0.
        decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
                        the loss scaling. Default is 0.5.
        incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
                                steps with finite gradients. Default is 1000.
        decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
                                    accumulated steps with nan or inf gradients. Default is 2.
        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
    Returns:
        An AmpScaler object.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> import paddle

            >>> data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
            >>> model = paddle.nn.Conv2D(3, 2, 3)
            >>> optimizer = paddle.optimizer.SGD(
            ...         learning_rate=0.01, parameters=model.parameters())
            >>> scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
            >>> data = paddle.to_tensor(data)
            >>> with paddle.amp.amp_guard():
            ...     conv = model(data)
            ...     loss = paddle.mean(conv)
            ...     scaled = scaler.scale(loss)
            ...     scaled.backward()
            ...     scaler.minimize(optimizer, scaled)
    """

    @dygraph_only
    def __init__(
        self,
        enable=True,
        init_loss_scaling=2.0**15,
        incr_ratio=2.0,
        decr_ratio=0.5,
        incr_every_n_steps=1000,
        decr_every_n_nan_or_inf=2,
        use_dynamic_loss_scaling=True,
    ):
        tracer = _dygraph_tracer()
        if not tracer:
            raise ValueError(
                "current_tracer is None, maybe it is not in imperative mode."
            )

        if enable and not (
            tracer._expected_place.is_gpu_place()
            or tracer._expected_place.is_xpu_place()
            or tracer._expected_place.is_custom_place()
        ):
            warnings.warn(
                'AmpScaler can only be enabled on CUDAPlace, XPUPlace and '
                'CustomPlace, current place is %s, so it makes no effect.'
                % tracer._expected_place
            )
            enable = False

        self._enable = enable

        if self._enable:
            assert incr_ratio > 1.0, "The incr_ratio must be > 1.0."
            assert decr_ratio < 1.0, "The decr_ratio must be < 1.0."

            self._init_loss_scaling = init_loss_scaling
            self._incr_ratio = incr_ratio
            self._decr_ratio = decr_ratio
            self._incr_every_n_steps = incr_every_n_steps
            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
            self._incr_count = 0
            self._decr_count = 0
            self._use_dynamic_loss_scaling = use_dynamic_loss_scaling

            # Per-dtype inf/nan flags consumed by check_finite_and_unscale.
            self._found_inf = to_variable(np.array([0]).astype(np.bool_))
            self._temp_found_inf_value_false = to_variable(
                np.array([0]).astype(np.bool_)
            )
            self._temp_found_inf_fp16 = to_variable(
                np.array([0]).astype(np.bool_)
            )
            self._temp_found_inf_bf16 = to_variable(
                np.array([0]).astype(np.bool_)
            )
            self._temp_found_inf_fp32 = to_variable(
                np.array([0]).astype(np.bool_)
            )
            self._scale = to_variable(
                np.array([self._init_loss_scaling]).astype(np.float32)
            )
            self._cache_founf_inf = None
            self._optimizer_states = defaultdict(_refresh_optimizer_state)
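
    # A worked example of the schedule _update() implements below; the numbers
    # are illustrative, not taken from the paddle documentation. With
    # init_loss_scaling=1024, incr_ratio=2.0, decr_ratio=0.5,
    # incr_every_n_steps=1000 and decr_every_n_nan_or_inf=2:
    #
    #     1000 consecutive finite steps:  scale 1024 -> 1024 * 2.0 = 2048
    #     2 accumulated inf/nan steps:    scale 2048 -> 2048 * 0.5 = 1024
    #
    # i.e. the scale ratchets up slowly and backs off quickly on overflow.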
decr_ratioincr_every_n_stepsdecr_every_n_nan_or_infuse_dynamic_loss_scalingZtracerr   r   r   __init__Z   sb   zAmpScaler.__init__c                 C   s\   t |dtjjd | jr$t jdkr$| jr$d| _d| _t	dt j  | js)|S || j
 S )al  
        Multiplies a Tensor by the scale factor and returns scaled outputs.
        If this instance of :class:`AmpScaler` is not enabled, the output is returned unmodified.

        Args:
            var (Tensor):  The Tensor to scale.
        Returns:
            The scaled Tensor or original Tensor.

        Examples:

            .. code-block:: python

                >>> import numpy as np
                >>> import paddle

                >>> data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
                >>> model = paddle.nn.Conv2D(3, 2, 3)
                >>> optimizer = paddle.optimizer.SGD(
                ...         learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
                >>> data = paddle.to_tensor(data)
                >>> with paddle.amp.amp_guard():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                ...     scaled = scaler.scale(loss)
                ...     scaled.backward()
                ...     scaler.minimize(optimizer, scaled)
        varzAmpScaler.scale()Zfloat16Fz^It is not recommended to use dynamic loss scaling for %s, so GradScaler is disable by default.)r   r   eagerZTensorr#   r   Z	amp_dtyper+   r!   r"   r5   r8   rA   r   r   r   scale   s    
zAmpScaler.scalec                 O   s   | j s|j|i |S | jt| }|d tju r| | d\}}t|dr?|d| j	 |j|i |\}}|
d| _n| j	rFd| _n|j|i |\}}d| _| jrZ|   tt| _||fS )a  
        This function is similar to `Optimizer.minimize()`, which performs the parameter update.

        If the scaled gradients of parameters contain NaN or Inf, the parameter update is skipped.
        Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

        Finally, the loss scaling ratio is updated.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.
            args:  Arguments, which will be forwarded to `optimizer.minimize()`.
            kwargs: Keyword arguments, which will be forwarded to `optimizer.minimize()`.

        Examples:

            .. code-block:: python

                >>> import numpy as np
                >>> import paddle

                >>> data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
                >>> model = paddle.nn.Conv2D(3, 2, 3)
                >>> optimizer = paddle.optimizer.SGD(
                ...     learning_rate=0.01,
                ...     parameters=model.parameters()
                ... )
                >>> scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
                >>> data = paddle.to_tensor(data)
                >>> with paddle.amp.amp_guard():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                ...     scaled = scaler.scale(loss)
                ...     scaled.backward()
                ...     scaler.minimize(optimizer, scaled)
        r   )NN_set_auxiliary_var	found_infTF)r#   minimizer7   idr   r   _unscalehasattrrE   r/   _get_auxiliary_varr6   r+   _updater   r   )r8   	optimizerargskwargsoptimizer_stateZoptimize_opsZparams_gradsr   r   r   rG      s$   $


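
    # The found_inf handshake used in minimize() above, as read from this
    # code (it is not separately documented): optimizers exposing
    # _set_auxiliary_var/_get_auxiliary_var receive the inf/nan flag and
    # decide themselves whether to skip the update; older optimizers are
    # simply not called at all when an inf/nan was detected during unscaling.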
zAmpScaler.minimizec           	      C   s  | j sdS | jt| }|d tju rtd|d tju r"tdt|ddr~t|j	d t
r~g }g }g }g }|j	D ]A}|d D ]:}| dur{||  | jtjjjkrb||  qA| jtjjjkrt||  qA||  qAq;n+t rtj|j\}}}ndd	 |jD }d
d	 |D }dd	 |D }dd	 |D }| j| _t|rt|| j|| j t| j| j| _t|rt|| j|| j  t| j| j | _t|rt|| j|| j! t| j| j!| _tj|d< dS )a  
        Unscale the gradients of parameters, multiplying them by 1/(loss scaling ratio).
        If this instance of :class:`GradScaler` is not enabled, the gradients are returned unmodified.
        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.
        Returns:
            The unscaled parameters or original parameters.
        Nr   zMunscale_() has already been called on this optimizer since the last update().z(unscale_() is being called after step()._param_groupsr   paramsc                 S   s    g | ]}|  d ur|  qS N)
_grad_ivar.0paramr   r   r   
<listcomp>C  s
    z&AmpScaler._unscale.<locals>.<listcomp>c                 S       g | ]}|j tjjjkr|qS r   )dtyper   VarDescVarTypeFP16rU   r   r   r   rX   H  
    c                 S   rY   r   )rZ   r   r[   r\   BF16rU   r   r   r   rX   M  r^   c                 S   rY   r   )rZ   r   r[   r\   ZFP32rU   r   r   r   rX   R  r^   )"r#   r7   rH   r   r   RuntimeErrorr   getattr
isinstancerQ   dictrT   appendrZ   r   r[   r\   r]   r_   r   rB   Zget_grads_listsZ_parameter_listr0   r/   lenr   Zcheck_finite_and_unscaler5   r1   r   Z
bitwise_orr2   r3   )	r8   rM   rP   Zparam_gradsZparam_grads_fp16Zparam_grads_bf16Zparam_grads_fp32grouprW   r   r   r   rI     s   	

zAmpScaler._unscalec                 C   s   | j sdS | jr5d| _| jd | _| j| jkr3tdt| jt| jt| j	 | j| j	 | _d| _dS d| _| jd | _| j| j
krN| j| j | _d| _dS )z+
        Updates the loss_scaling.
        """
        if not self._enable:
            return

        if self._cache_founf_inf:
            self._incr_count = 0
            self._decr_count = self._decr_count + 1
            if self._decr_count == self._decr_every_n_nan_or_inf:
                print(
                    'Found inf or nan, current scale is: {}, decrease to: '
                    '{}*{}'.format(
                        float(self._scale),
                        float(self._scale),
                        float(self._decr_ratio),
                    )
                )
                self._scale = self._scale * self._decr_ratio
                self._decr_count = 0
        else:
            self._decr_count = 0
            self._incr_count = self._incr_count + 1
            if self._incr_count == self._incr_every_n_steps:
                self._scale = self._scale * self._incr_ratio
                self._incr_count = 0

    def is_enable(self):
        """
        Enable loss scaling or not.

        Returns:
            bool: True if loss scaling is enabled, otherwise False.
        )r#   rj   r   r   r   	is_enable     zAmpScaler.is_enablec                 C   rk   )z
        Whether to use dynamic loss scaling.

        Returns:
            bool: False if fixed loss scaling is used, True if the loss scaling is updated dynamically.
        )r+   rj   r   r   r   is_use_dynamic_loss_scaling  rm   z%AmpScaler.is_use_dynamic_loss_scalingc                 C   rk   )z
        Return the initial loss scaling factor.

        Returns:
            float:  the initial loss scaling factor.
        )r$   rj   r   r   r   get_init_loss_scaling  rm   zAmpScaler.get_init_loss_scalingc                 C   s&   || _ tt| j gtj| _dS )z
        Set the initial loss scaling factor by `new_init_loss_scaling`.

        Args:
            new_init_loss_scaling(float):  The new_init_loss_scaling used to update the initial loss scaling factor.
        N)r$   r   r,   r-   r.   r4   r5   r8   Znew_init_loss_scalingr   r   r   set_init_loss_scaling  s   
zAmpScaler.set_init_loss_scalingc                 C   rk   )z
        Return the multiplier to use when increasing the loss scaling.

        Returns:
            float:  the multiplier to use when increasing the loss scaling.
        r%   rj   r   r   r   get_incr_ratio  rm   zAmpScaler.get_incr_ratioc                 C   s   |dksJ d|| _ dS )a  
        Set the multiplier to use when increasing the loss scaling by `new_incr_ratio`; `new_incr_ratio` should be > 1.0.

        Args:
            new_incr_ratio(float):  The new_incr_ratio used to update the multiplier to use when increasing the loss scaling.
        r   z!The new_incr_ratio must be > 1.0.Nrr   r8   Znew_incr_ratior   r   r   set_incr_ratio     
zAmpScaler.set_incr_ratioc                 C   rk   )z
        Get the less-than-one-multiplier to use when decreasing the loss scaling.

        Returns:
            float:  the less-than-one-multiplier to use when decreasing the loss scaling.
        r&   rj   r   r   r   get_decr_ratio  rm   zAmpScaler.get_decr_ratioc                 C   s   |dk sJ d|| _ dS )a)  
        Set the less-than-one-multiplier to use when decreasing the loss scaling by `new_decr_ratio`; `new_decr_ratio` should be < 1.0.

        Args:
            new_decr_ratio(float):  The new_decr_ratio used to update the less-than-one-multiplier to use when decreasing the loss scaling.
        r   z!The new_decr_ratio must be < 1.0.Nrw   r8   Znew_decr_ratior   r   r   set_decr_ratio  rv   zAmpScaler.set_decr_ratioc                 C   rk   )a  
        Return the num `n`; the loss scaling increases every `n` consecutive steps with finite gradients.

        Returns:
            int:  the num `n`; the loss scaling increases every `n` consecutive steps with finite gradients.
        r'   rj   r   r   r   get_incr_every_n_steps  rm   z AmpScaler.get_incr_every_n_stepsc                 C   
   || _ dS )a^  
        Set the num `n` by `new_incr_every_n_steps`; the loss scaling increases every `n` consecutive steps with finite gradients.

        Args:
            new_incr_every_n_steps(int):  The new value for the num `n`; the loss scaling increases every `n` consecutive steps with finite gradients.
        Nr{   r8   Znew_incr_every_n_stepsr   r   r   set_incr_every_n_steps     
z AmpScaler.set_incr_every_n_stepsc                 C   rk   )a  
        Return the num `n`; the loss scaling decreases every `n` accumulated steps with nan or inf gradients.

        Returns:
            int:  the num `n`; the loss scaling decreases every `n` accumulated steps with nan or inf gradients.
        r(   rj   r   r   r   get_decr_every_n_nan_or_inf  rm   z%AmpScaler.get_decr_every_n_nan_or_infc                 C   r}   )au  
        Set the num `n` by `new_decr_every_n_nan_or_inf`; the loss scaling decreases every `n` accumulated steps with nan or inf gradients.

        Args:
            new_decr_every_n_nan_or_inf(int):  The new value for the num `n`; the loss scaling decreases every `n` accumulated steps with nan or inf gradients.
        Nr   r8   Znew_decr_every_n_nan_or_infr   r   r   set_decr_every_n_nan_or_inf  r   z%AmpScaler.set_decr_every_n_nan_or_infc              	   C   s4   | j r| j | j| j| j| j| j| j| j	dS i S )a  
        Returns the state of the scaler as a `dict`. If this instance is not enabled, returns an empty dict.

        Returns:
            A dict of scaler includes:
            scale (tensor): The loss scaling factor.
            incr_ratio(float): The multiplier to use when increasing the loss scaling.
            decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling.
            incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients.
            decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients.
            incr_count(int): The number of recent consecutive unskipped steps.
            decr_count(int): The number of recent consecutive skipped steps.
            use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
        """
        if not self._enable:
            return {}
        return {
            "scale": self._scale.numpy(),
            "incr_ratio": self._incr_ratio,
            "decr_ratio": self._decr_ratio,
            "incr_every_n_steps": self._incr_every_n_steps,
            "decr_every_n_nan_or_inf": self._decr_every_n_nan_or_inf,
            "incr_count": self._incr_count,
            "decr_count": self._decr_count,
            "use_dynamic_loss_scaling": self._use_dynamic_loss_scaling,
        }
    def load_state_dict(self, state_dict):
        """
        Loads the scaler state.

        Args:
           state_dict(dict): scaler state.  Should be an object returned from a call to `AmpScaler.state_dict()`.
        Nr   zdThe input state dict is empty, possibly because it was saved from a disabled instance of GradScaler.rD   r;   r<   r=   r>   r   r   r?   )r#   re   r`   r$   r   r,   r-   r.   r4   r5   r%   r&   r'   r(   r)   r*   r+   r8   r   r   r   r   load_state_dict%  s"   





zAmpScaler.load_state_dictN)Tr   r   r   r   r   T)r   r   r   __doc__r
   r@   rD   rG   rI   rL   rl   rn   ro   rq   rs   ru   rx   rz   r|   r   r   r   r   r   r   r   r   r   r   )   s:    0=1Bn				
	
				r   c                       s  e Zd ZdZ							d0 fdd		Z fd
dZ fddZdd Zdd Z fddZ	 fddZ
 fddZ fddZ fddZ fddZ fddZ fd d!Z fd"d#Z fd$d%Z fd&d'Z fd(d)Z fd*d+Z fd,d-Z fd.d/Z  ZS )1
GradScalerak
  
    GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode.
    It controls the scaling of the loss and helps avoid numerical overflow.
    The object of this class has nineteen methods: `scale()`, `unscale_()`, `minimize()`, `step()`, `update()`, and the `get`/`set` APIs of its parameters.

    `scale()` is used to multiply the loss by a scale ratio.
    `unscale_()` is used to unscale the gradients of parameters, multiplying them by 1/(scale ratio).
    `minimize()` is similar to `optimizer.minimize()`: it performs the parameter update and then updates the loss_scaling; it is equivalent to `step()` + `update()`.
    `step()` is similar to `optimizer.step()`, which performs the parameter update.
    `update()` is used to update the loss_scaling.


    Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
    dynamic graph mode.

    Args:
        enable(bool, optional): Enable loss scaling or not. Default is True.
        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 65536.0.
        incr_ratio(float, optional): The multiplier to use when increasing the loss
                        scaling. Default is 2.0.
        decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
                        the loss scaling. Default is 0.5.
        incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
                                steps with finite gradients. Default is 2000.
        decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
                                    accumulated steps with nan or inf gradients. Default is 1.
        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
    Returns:
        A GradScaler object.

    Examples:

        .. code-block:: python

            >>> import paddle

            >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
            >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
            >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
            >>> data = paddle.rand([10, 3, 32, 32])

            >>> with paddle.amp.auto_cast():
            ...     conv = model(data)
            ...     loss = paddle.mean(conv)

            >>> scaled = scaler.scale(loss)  # scale the loss
            >>> scaled.backward()            # do backward
            >>> scaler.minimize(optimizer, scaled)  # update parameters
            >>> optimizer.clear_grad()
    """

    def __init__(
        self,
        enable=True,
        init_loss_scaling=2.0**16,
        incr_ratio=2.0,
        decr_ratio=0.5,
        incr_every_n_steps=2000,
        decr_every_n_nan_or_inf=1,
        use_dynamic_loss_scaling=True,
    ):
        super().__init__(
            enable,
            init_loss_scaling,
            incr_ratio,
            decr_ratio,
            incr_every_n_steps,
            decr_every_n_nan_or_inf,
            use_dynamic_loss_scaling,
        )

    def scale(self, var):
        """
        Multiplies a Tensor by the scale factor and returns scaled outputs.
        If this instance of :class:`GradScaler` is not enabled, the output is returned unmodified.

        Args:
            var (Tensor):  The tensor to scale.
        Returns:
            The scaled tensor or original tensor.

        Examples:

            .. code-block:: python

                >>> import paddle

                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])

                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)

                >>> scaled = scaler.scale(loss)  # scale the loss
                >>> scaled.backward()            # do backward
                >>> scaler.minimize(optimizer, scaled)  # update parameters
                >>> optimizer.clear_grad()
        )r   rD   rC   r   r   r   rD     s   zGradScaler.scalec                    s   t  j|g|R i |S )a  
        This function is similar to `optimizer.minimize()`, which performs the parameter update.

        If the scaled gradients of parameters contain NaN or Inf, the parameter update is skipped.
        Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

        Finally, the loss scaling ratio is updated.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.
            args:  Arguments, which will be forwarded to `optimizer.minimize()`.
            kwargs: Keyword arguments, which will be forwarded to `optimizer.minimize()`.

        Examples:

            .. code-block:: python

                >>> import paddle

                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])

                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)

                >>> scaled = scaler.scale(loss)  # scale the loss
                >>> scaled.backward()            # do backward
                >>> scaler.minimize(optimizer, scaled)  # update parameters
                >>> optimizer.clear_grad()
        )r   rG   )r8   rM   rN   rO   r   r   r   rG     s   "zGradScaler.minimizec                 C   s   | j s| S | jt| }|d tju rtd|d tju r%| | t	|dr<|
d| j |  |d| _n| jrCd| _n|  d| _tj|d< | jsYtt| _dS dS )at  
        This function is similar to `optimizer.step()`, which performs the parameter update.

        If the scaled gradients of parameters contain NaN or Inf, the parameter update is skipped.
        Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.

        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU)
                >>> import paddle
                >>> paddle.device.set_device('gpu')

                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])
                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                >>> scaled = scaler.scale(loss)  # scale the loss
                >>> scaled.backward()            # do backward
                >>> scaler.step(optimizer)       # update parameters
                >>> scaler.update()              # update the loss scaling ratio
                >>> optimizer.clear_grad()
        r   z7step() has already been called since the last update().rE   rF   TFN)r#   stepr7   rH   r   r   r`   r   rI   rJ   rE   r/   rK   r6   r+   r   r   )r8   rM   rP   r   r   r   r     s*   


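
    # step() plus update() decompose minimize() so the scale update can be
    # deferred. A gradient-accumulation sketch (assumed usage built from the
    # docstring examples; `loader`, `model` and the accumulation length `k`
    # are placeholders):
    #
    #     for i, batch in enumerate(loader):
    #         with paddle.amp.auto_cast():
    #             loss = model(batch).mean()
    #         scaler.scale(loss / k).backward()
    #         if (i + 1) % k == 0:
    #             scaler.step(optimizer)   # update parameters
    #             scaler.update()          # update the loss scaling ratio
    #             optimizer.clear_grad()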
zGradScaler.stepc                 C   s&   | j sdS | jr|   tt| _dS )a  
        Updates the loss_scaling.

        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU)
                >>> import paddle

                >>> paddle.device.set_device('gpu')
                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])
                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                >>> scaled = scaler.scale(loss)     # scale the loss
                >>> scaled.backward()               # do backward
                >>> scaler.step(optimizer)          # update parameters
                >>> scaler.update()                 # update the loss scaling ratio
                >>> optimizer.clear_grad()
        N)r#   r+   rL   r   r   r7   rj   r   r   r   update
  s   
zGradScaler.updatec                    r   )aE  
        Unscale the gradients of parameters, multiplying them by 1/(loss scaling ratio).
        If this instance of :class:`GradScaler` is not enabled, the gradients are returned unmodified.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.

        Returns:
            The unscaled parameters or original parameters.

        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU)
                >>> import paddle

                >>> paddle.device.set_device('gpu')
                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])
                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                >>> scaled = scaler.scale(loss)  # scale the loss
                >>> scaled.backward()            # do backward
                >>> scaler.unscale_(optimizer)    # unscale the parameter
                >>> scaler.step(optimizer)
                >>> scaler.update()
                >>> optimizer.clear_grad()
        )r   rI   )r8   rM   r   r   r   unscale_*  s   !zGradScaler.unscale_c                    
   t   S )a  
        Enable loss scaling or not.

        Returns:
            bool: True if loss scaling is enabled, otherwise False.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> enable = scaler.is_enable()
                >>> print(enable)
                True
        )r   rl   rj   r   r   r   rl   M     
zGradScaler.is_enablec                    r   )av  
        Whether to use dynamic loss scaling.

        Returns:
            bool: False if fixed loss scaling is used, True if the loss scaling is updated dynamically.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> use_dynamic_loss_scaling = scaler.is_use_dynamic_loss_scaling()
                >>> print(use_dynamic_loss_scaling)
                True
        )r   rn   rj   r   r   r   rn   h  r   z&GradScaler.is_use_dynamic_loss_scalingc                    r   )a%  
        Return the initial loss scaling factor.

        Returns:
            float:  the initial loss scaling factor.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> init_loss_scaling = scaler.get_init_loss_scaling()
                >>> print(init_loss_scaling)
                1024
        )r   ro   rj   r   r   r   ro     r   z GradScaler.get_init_loss_scalingc                       t  | dS )a  
        Set the initial loss scaling factor by `new_init_loss_scaling`.

        Args:
            new_init_loss_scaling(float):  The new_init_loss_scaling used to update initial loss scaling factor.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_init_loss_scaling())
                1024
                >>> new_init_loss_scaling = 1000
                >>> scaler.set_init_loss_scaling(new_init_loss_scaling)
                >>> print(scaler.get_init_loss_scaling())
                1000
        N)r   rq   rp   r   r   r   rq        z GradScaler.set_init_loss_scalingc                    r   )a=  
        Return the multiplier to use when increasing the loss scaling.

        Returns:
            float:  the multiplier to use when increasing the loss scaling.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> incr_ratio = scaler.get_incr_ratio()
                >>> print(incr_ratio)
                2.0
        )r   rs   rj   r   r   r   rs     r   zGradScaler.get_incr_ratioc                    r   )a  
        Set the multiplier to use when increasing the loss scaling by `new_incr_ratio`; `new_incr_ratio` should be > 1.0.

        Args:
            new_incr_ratio(float):  The new_incr_ratio used to update the multiplier to use when increasing the loss scaling.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_incr_ratio())
                2.0
                >>> new_incr_ratio = 3.0
                >>> scaler.set_incr_ratio(new_incr_ratio)
                >>> print(scaler.get_incr_ratio())
                3.0
        N)r   ru   rt   r   r   r   ru     r   zGradScaler.set_incr_ratioc                    r   )aV  
        Get the less-than-one-multiplier to use when decreasing the loss scaling.

        Returns:
            float:  the less-than-one-multiplier to use when decreasing the loss scaling.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> decr_ratio = scaler.get_decr_ratio()
                >>> print(decr_ratio)
                0.5
        )r   rx   rj   r   r   r   rx     r   zGradScaler.get_decr_ratioc                    r   )a7  
        Set the less-than-one-multiplier to use when decreasing the loss scaling by `new_decr_ratio`; `new_decr_ratio` should be < 1.0.

        Args:
            new_decr_ratio(float):  The new_decr_ratio used to update the less-than-one-multiplier to use when decreasing the loss scaling.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_decr_ratio())
                0.5
                >>> new_decr_ratio = 0.1
                >>> scaler.set_decr_ratio(new_decr_ratio)
                >>> print(scaler.get_decr_ratio())
                0.1
        N)r   rz   ry   r   r   r   rz     r   zGradScaler.set_decr_ratioc                    r   )a  
        Return the num `n`; the loss scaling increases every `n` consecutive steps with finite gradients.

        Returns:
            int:  the num `n`; the loss scaling increases every `n` consecutive steps with finite gradients.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> incr_every_n_steps = scaler.get_incr_every_n_steps()
                >>> print(incr_every_n_steps)
                1000
        )r   r|   rj   r   r   r   r|   .  r   z!GradScaler.get_incr_every_n_stepsc                    r   )a  
        Set the num `n` by `new_incr_every_n_steps`; the loss scaling increases every `n` consecutive steps with finite gradients.

        Args:
            new_incr_every_n_steps(int):  The new value for the num `n`; the loss scaling increases every `n` consecutive steps with finite gradients.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_incr_every_n_steps())
                1000
                >>> new_incr_every_n_steps = 2000
                >>> scaler.set_incr_every_n_steps(new_incr_every_n_steps)
                >>> print(scaler.get_incr_every_n_steps())
                2000
        N)r   r   r~   r   r   r   r   I  r   z!GradScaler.set_incr_every_n_stepsc                    r   )a  
        Return the num `n`; the loss scaling decreases every `n` accumulated steps with nan or inf gradients.

        Returns:
            int:  the num `n`; the loss scaling decreases every `n` accumulated steps with nan or inf gradients.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> decr_every_n_nan_or_inf = scaler.get_decr_every_n_nan_or_inf()
                >>> print(decr_every_n_nan_or_inf)
                2
        )r   r   rj   r   r   r   r   g  r   z&GradScaler.get_decr_every_n_nan_or_infc                    r   )a  
        Set the num `n` by `new_decr_every_n_nan_or_inf`; the loss scaling decreases every `n` accumulated steps with nan or inf gradients.

        Args:
            new_decr_every_n_nan_or_inf(int):  The new value for the num `n`; the loss scaling decreases every `n` accumulated steps with nan or inf gradients.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_decr_every_n_nan_or_inf())
                2
                >>> new_decr_every_n_nan_or_inf = 3
                >>> scaler.set_decr_every_n_nan_or_inf(new_decr_every_n_nan_or_inf)
                >>> print(scaler.get_decr_every_n_nan_or_inf())
                3
        N)r   r   r   r   r   r   r     r   z&GradScaler.set_decr_every_n_nan_or_infc                    r   )a.  
        Returns the state of the scaler as a `dict`. If this instance is not enabled, returns an empty dict.

        Returns:
            A dict of scaler includes:
            scale (tensor): The loss scaling factor.
            incr_ratio(float): The multiplier to use when increasing the loss scaling.
            decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling.
            incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients.
            decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients.
            incr_count(int): The number of recent consecutive unskipped steps.
            decr_count(int): The number of recent consecutive skipped steps.
            use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True.


        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle

                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> scaler_state = scaler.state_dict()
        )r   r   rj   r   r   r   r     s   
"zGradScaler.state_dictc                    r   )a;  
        Loads the scaler state.

        Args:
            state_dict(dict): scaler state.  Should be an object returned from a call to `GradScaler.state_dict()`.

        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle

                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> scaler_state = scaler.state_dict()
                >>> scaler.load_state_dict(scaler_state)
        N)r   r   r   r   r   r   r     s   zGradScaler.load_state_dict)Tr   r   r   r   r   T)r   r   r   r   r@   rD   rG   r   r   r   rl   rn   ro   rq   rs   ru   rx   rz   r|   r   r   r   r   r   __classcell__r   r   r   r   r   B  s:    5 $< #$r   )r!   collectionsr   enumr   r   r,   Zpaddler   r   Zpaddle.baser   Zpaddle.base.data_feederr   Zpaddle.base.dygraphr   Zpaddle.base.frameworkr	   r
   Zpaddle.frameworkr   Z	auto_castr   r   r   r   r   r   r   r   r   <module>   s&       