import warnings

from paddle import _legacy_C_ops
from paddle.base import framework
from paddle.base.framework import in_dygraph_mode
from paddle.optimizer import Optimizer


class LarsMomentumOptimizer(Optimizer):
    r"""
    Momentum optimizer with LARS support

    The update equations are as follows:

    .. math::

        & local\_learning\_rate = learning\_rate * lars\_coeff * \\
          \frac{||param||}{||gradient|| + lars\_weight\_decay * ||param|| + epsilon}

        & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param)

        & param = param - velocity
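
    For intuition, one LARS step for a single layer looks roughly like the
    plain NumPy sketch below (illustrative only; the real update is performed
    by the fused ``lars_momentum`` operator, and the variable names here are
    hypothetical):

    .. code-block:: python

        >>> import numpy as np
        >>> param = np.array([0.5, -0.5]); grad = np.array([0.1, 0.2])
        >>> velocity = np.zeros_like(param)
        >>> lr, mu, lars_coeff, decay, eps = 0.001, 0.9, 0.001, 0.0005, 0.0
        >>> # the local learning rate scales with the param/grad norm ratio
        >>> local_lr = lr * lars_coeff * np.linalg.norm(param) / (
        ...     np.linalg.norm(grad) + decay * np.linalg.norm(param) + eps)
        >>> velocity = mu * velocity + local_lr * (grad + decay * param)
        >>> param = param - velocity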

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        momentum (float): Momentum factor.
        lars_coeff (float): Defines how much we trust the layer to change its weights. Default is 0.001.
        lars_weight_decay (float): Weight decay coefficient for decaying using LARS. Default is 0.0005.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two methods: \
             :ref:`api_paddle_regularizer_L1Decay` , :ref:`api_paddle_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_paddle_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy; it must be an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
        exclude_from_weight_decay (list[str], optional): Name strings of layers which will be excluded from lars weight decay. Default is None.
        epsilon (float, optional): Epsilon added to the denominator to avoid division by zero when calculating the local learning rate. Default is 0.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \
            before updating. Often chosen to be `1.0/batch_size`. Default is 1.0.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import numpy as np

            >>> paddle.enable_static()
            >>> np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
            >>> inp = paddle.static.data(
            ...     name="inp", shape=[2, 2], dtype='float32')
            >>> out = paddle.static.nn.fc(inp, size=3)
            >>> out = paddle.sum(out)
            >>> optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
            >>> optimizer.minimize(out)

            >>> exe = paddle.static.Executor(paddle.CPUPlace())
            >>> exe.run(paddle.static.default_startup_program())
            >>> exe.run(
            ...     feed={"inp": np_inp},
            ...     fetch_list=[out.name])
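            >>> # ``exclude_from_weight_decay`` skips LARS weight decay for any
            >>> # parameter whose name contains one of the given substrings
            >>> # (hypothetical names shown); such parameters are updated with
            >>> # lars_weight_decay = 0:
            >>> optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer(
            ...     learning_rate=0.001, momentum=0.9,
            ...     lars_weight_decay=0.0005,
            ...     exclude_from_weight_decay=["batch_norm", ".b_0"])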
    """

    _velocity_acc_str = "velocity"

    def __init__(
        self,
        learning_rate,
        momentum,
        lars_coeff=0.001,
        lars_weight_decay=0.0005,
        parameter_list=None,
        regularization=None,
        grad_clip=None,
        name=None,
        exclude_from_weight_decay=None,
        epsilon=0,
        multi_precision=False,
        rescale_grad=1.0,
    ):
        assert learning_rate is not None
        assert momentum is not None
        super().__init__(
            learning_rate=learning_rate,
            parameters=parameter_list,
            weight_decay=regularization,
            grad_clip=grad_clip,
            name=name,
        )
        self.type = "lars_momentum"
        self._momentum = momentum
        self._lars_coeff = float(lars_coeff)
        self._lars_weight_decay = float(lars_weight_decay)
        self._epsilon = float(epsilon)
        if exclude_from_weight_decay is None:
            self._exclude_from_weight_decay = []
        else:
            self._exclude_from_weight_decay = exclude_from_weight_decay
        self._multi_precision = multi_precision
        self._rescale_grad = float(rescale_grad)
        self._master_weights = {}
zLarsMomentumOptimizer.__init__c                 C   st   t |tjsJ |D ]-}| jr"| |jr"| |}| | j| q
| |jr0| js0t	
d | | j| q
d S )NzAccumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Lars optimizer.)
isinstancer   Blockr   _is_dtype_fp16_or_bf16dtypeZ_create_master_weightZ_add_accumulator_velocity_acc_strwarningswarn)r   blockr   pZmaster_pr$   r$   r%   _create_accumulators{   s   

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
        _lars_weight_decay = self._lars_weight_decay
        param_name = param_and_grad[0].name
        # Parameters whose names match an exclude_from_weight_decay entry
        # are updated without LARS weight decay.
        if len(self._exclude_from_weight_decay) > 0:
            for name in self._exclude_from_weight_decay:
                if name in param_name:
                    _lars_weight_decay = 0.0
                    break

        velocity_acc = self._get_accumulator_master(
            self._velocity_acc_str, param_and_grad[0]
        )
        lr = self._create_param_lr(param_and_grad)

        find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
            param_and_grad[0].dtype
        )
        master_weight = (
            self._master_weights[param_and_grad[0].name]
            if find_master
            else None
        )

        attrs = {
            "mu": self._momentum,
            "lars_coeff": self._lars_coeff,
            "lars_weight_decay": [_lars_weight_decay],
            "multi_precision": find_master,
            "epsilon": self._epsilon,
            "rescale_grad": self._rescale_grad,
        }

        inputs = {
            "Param": param_and_grad[0],
            "Grad": param_and_grad[1],
            "Velocity": velocity_acc,
            "LearningRate": lr,
        }

        outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc}

        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight

        if in_dygraph_mode():
            tmp, tmp2 = _legacy_C_ops.lars_momentum(
                [param_and_grad[0]],
                [param_and_grad[1]],
                [velocity_acc],
                [lr],
                [param_and_grad[0]],
                [velocity_acc],
                "mu",
                self._momentum,
                "lars_coeff",
                self._lars_coeff,
                "lars_weight_decay",
                [_lars_weight_decay],
                "multi_precision",
                find_master,
                "epsilon",
                self._epsilon,
                "rescale_grad",
                self._rescale_grad,
            )
        else:
            # create the momentum optimize op
            momentum_op = block.append_op(
                type=self.type,
                inputs=inputs,
                outputs=outputs,
                attrs=attrs,
                stop_gradient=True,
            )

            return momentum_op