import warnings

import paddle
from paddle import _C_ops
from paddle.framework import in_dynamic_or_pir_mode
from paddle.regularizer import L2Decay

from ..base import core, framework
from .optimizer import Optimizer

__all__ = []


class Momentum(Optimizer):
    r"""

    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nesterov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad   param = param - (gradient + mu * velocity) * learning\_rate

        & else:

        &\quad   param = param - learning\_rate * velocity
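
    For intuition only, one non-Nesterov step is equivalent to the following
    minimal sketch in plain Python (the scalar values are made up for
    illustration; the real update runs on whole tensors inside a fused
    operator):

    .. code-block:: python

        >>> mu, learning_rate = 0.9, 0.1          # momentum factor and lr
        >>> param, grad, velocity = 1.0, 0.5, 0.0
        >>> velocity = mu * velocity + grad       # accumulate velocity
        >>> param = param - learning_rate * velocity
        >>> print(param, velocity)
        0.95 0.5
        >>> # With use_nesterov=True the parameter update would instead be:
        >>> # param = param - (grad + mu * velocity) * learning_rate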

    Parameters:

        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
        momentum (float): Momentum factor. The default value is 0.9.
        parameters (list|tuple, optional): List|Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. And you can specify different options for \
            different parameter groups such as the learning rate, weight decay, etc, \
            then the parameters are list of dict. Note that the learning_rate in parameter groups \
            represents the scale of base learning_rate. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value as coeff of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
        rescale_grad (float, optional): Multiply the gradient with ``rescale_grad`` before updating. \
            Often chosen to be ``1.0/batch_size``.
        use_multi_tensor (bool, optional): Whether to use the multi-tensor strategy to update all parameters at once. Default is False.
        name (str, optional): The default value is None. Normally there is no need for user
                to set this property. For more information, please refer to
                :ref:`api_guide_Name` .

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
            >>> linear = paddle.nn.Linear(10, 10)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> momentum = paddle.optimizer.Momentum(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
            >>> loss.backward()
            >>> momentum.step()
            >>> momentum.clear_grad()
            >>> momentum.step()
            >>> momentum.clear_grad()

            >>> # Note that the learning_rate of linear_2 is 0.01.
            >>> linear_1 = paddle.nn.Linear(10, 10)
            >>> linear_2 = paddle.nn.Linear(10, 10)
            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            >>> out = linear_1(inp)
            >>> out = linear_2(out)
            >>> loss = paddle.mean(out)
            >>> momentum = paddle.optimizer.Momentum(
            ...     learning_rate=0.1,
            ...     parameters=[{
            ...         'params': linear_1.parameters()
            ...     }, {
            ...         'params': linear_2.parameters(),
            ...         'weight_decay': 0.001,
            ...         'learning_rate': 0.1
            ...     }],
            ...     weight_decay=0.01,
            ...     momentum=0.9
            ... )
            >>> loss.backward()
            >>> momentum.step()
            >>> momentum.clear_grad()

    """

    _velocity_acc_str = "velocity"

    def __init__(
        self,
        learning_rate=0.001,
        momentum=0.9,
        parameters=None,
        use_nesterov=False,
        weight_decay=None,
        grad_clip=None,
        multi_precision=False,
        rescale_grad=1.0,
        use_multi_tensor=False,
        name=None,
    ):
        if learning_rate is None:
            raise ValueError("learning_rate is not set")
        if momentum is None:
            raise ValueError("momentum is not set")

        predicate = lambda regular: isinstance(regular, (L2Decay, float))
        if isinstance(parameters, list) and isinstance(parameters[0], dict):
            # Record the regularization method/coeff for every parameter
            # group so that per-group weight decay can be fused into the
            # momentum op later.
            for param_group in parameters:
                decay = (
                    param_group['weight_decay']
                    if 'weight_decay' in param_group
                    else weight_decay
                )
                reg_method, reg_coeff = self._update_regularization(decay)
                param_group['regularization_method'] = reg_method
                param_group['regularization_coeff'] = reg_coeff
                py_regular = None if predicate(decay) else decay
                param_group['weight_decay'] = py_regular

        py_regular = None if predicate(weight_decay) else weight_decay
        super().__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=py_regular,
            grad_clip=grad_clip,
            name=name,
        )
        self.type = "momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)
        (
            self._regularization_method,
            self._regularization_coeff,
        ) = self._update_regularization(weight_decay)
        self._multi_precision = multi_precision
        self._rescale_grad = rescale_grad
        self._master_weights = {}

        self._default_dict = {
            'momentum': momentum,
            'use_nesterov': use_nesterov,
            'rescale_grad': rescale_grad,
            'regularization_method': self._regularization_method,
            'regularization_coeff': self._regularization_coeff,
        }
        self._use_multi_tensor = use_multi_tensor
        if self._use_multi_tensor:
            self._param_dict = self._create_multi_tensor_dict()
            self._velocity_dict = self._create_multi_tensor_dict()
            self._master_weight_dict = self._create_multi_tensor_dict()
            self._master_weight_dict['FP32_LODTensor'] = None
            self._regularization_method_dict = self._create_multi_tensor_dict()
            self._regularization_coeff_dict = self._create_multi_tensor_dict()

    def _update_regularization(self, weight_decay):
        reg_method = ""
        reg_coeff = 0.0

        if isinstance(weight_decay, L2Decay):
            reg_method = "l2_decay"
            reg_coeff = weight_decay._coeff
        if isinstance(weight_decay, float):
            reg_method = "l2_decay"
            reg_coeff = weight_decay
        return reg_method, reg_coeff

    def _create_accumulators(self, block, parameters):
        '''
        if framework.in_dynamic_mode():
            return
        '''
        assert isinstance(block, (framework.Block, paddle.pir.Block))

        if isinstance(parameters, dict):
            parameters = self._update_param_group(parameters)

        for p in parameters:
            if p.name in self._already_create_accumulater:
                continue
            if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
                master_p = self._create_master_weight(p)
                self._add_accumulator(self._velocity_acc_str, master_p)
                self._already_create_accumulater.add(p.name)
                continue
            if (
                self._is_dtype_fp16_or_bf16(p.dtype)
                and not self._multi_precision
            ):
                warnings.warn(
                    "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
                    "Consider using multi_precision=True option of the Momentum optimizer."
                )
            self._add_accumulator(self._velocity_acc_str, p)
            self._already_create_accumulater.add(p.name)

    def _create_regularization_of_grad(self, param, grad, regularization=None):
        """Create and add backward regularization Operators

        Function helper of append_regularization_ops.
        """
        # If ParamAttr is set to L2Decay, we skip doing regularization here,
        # and instead fuse L2Decay with momentum in _append_optimize_op below.
        if hasattr(param, 'regularizer') and isinstance(
            param.regularizer, L2Decay
        ):
            return grad
        return super()._create_regularization_of_grad(
            param, grad, regularization
        )

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, (framework.Block, paddle.pir.Block))
        if isinstance(param_and_grad, dict):
            param_and_grad = self._update_param_group(param_and_grad)

        velocity_acc = self._get_accumulator_master(
            self._velocity_acc_str, param_and_grad[0]
        )
        lr = self._create_param_lr(param_and_grad)

        # For fusion of momentum and l2decay
        param = param_and_grad[0]
        regularization_method = self._regularization_method
        regularization_coeff = self._regularization_coeff
        if hasattr(param, 'regularizer'):
            # we skip param's l2decay before, so fuse it with momentum here.
            if isinstance(param.regularizer, L2Decay):
                regularization_method = "l2_decay"
                regularization_coeff = param.regularizer._coeff
            # the param's regularization has been done before, so avoid
            # doing l2decay in momentum.
            elif param.regularizer is not None:
                regularization_method = ""
                regularization_coeff = 0.0

        find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
            param_and_grad[0].dtype
        )
        master_weight = (
            self._master_weights[param_and_grad[0].name]
            if find_master
            else None
        )

        if in_dynamic_or_pir_mode():
            if isinstance(param_and_grad, dict):
                self._update_regularization(param_and_grad['weight_decay'])
            return _C_ops.momentum_(
                param_and_grad[0],
                param_and_grad[1],
                velocity_acc,
                lr,
                master_weight,
                self._momentum,
                self._use_nesterov,
                regularization_method,
                regularization_coeff,
                find_master,
                self._rescale_grad,
            )

        attrs = {
            "mu": self._momentum,
            "use_nesterov": self._use_nesterov,
            "regularization_method": regularization_method,
            "regularization_coeff": regularization_coeff,
            "multi_precision": find_master,
            "rescale_grad": self._rescale_grad,
        }
        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
            "Velocity": [velocity_acc],
            "LearningRate": [lr],
        }
        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc],
        }
        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight

        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True,
        )
        return momentum_op

    def _multi_tensor_init(self, target_block, parameters, param_group_idx):
        """
        All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, bfloat16, float32).
        This function will be overridden in the corresponding optimizer file.

        Args:
            target_block: the block in which the loss tensor is present
            parameters: list of parameter tensors for the optimizer
        """
        self._create_accumulators(target_block, parameters)
        for param in parameters:
            velocity_acc = self._get_accumulator_master(
                self._velocity_acc_str, param
            )
            regularization_method = self._regularization_method
            regularization_coeff = self._regularization_coeff
            if hasattr(param, 'regularizer'):
                # we skip param's l2decay before, so fuse it with momentum here.
                if isinstance(param.regularizer, L2Decay):
                    regularization_method = "l2_decay"
                    regularization_coeff = param.regularizer._coeff
                elif param.regularizer is not None:
                    regularization_method = ""
                    regularization_coeff = 0.0

            if param.dtype == paddle.float32:
                self._param_dict['FP32_LODTensor'][param_group_idx].append(
                    param
                )
                self._velocity_dict['FP32_LODTensor'][param_group_idx].append(
                    velocity_acc
                )
                # fp32 needs no master weight
                self._regularization_method_dict['FP32_LODTensor'][
                    param_group_idx
                ].append(regularization_method)
                self._regularization_coeff_dict['FP32_LODTensor'][
                    param_group_idx
                ].append(regularization_coeff)
            elif self._is_dtype_fp16_or_bf16(param.dtype):
                self._param_dict['FP16_LODTensor'][param_group_idx].append(
                    param
                )
                self._velocity_dict['FP16_LODTensor'][param_group_idx].append(
                    velocity_acc
                )
                if self._multi_precision:
                    self._master_weight_dict['FP16_LODTensor'][
                        param_group_idx
                    ].append(self._master_weights[param.name])
                else:
                    self._master_weight_dict['FP16_LODTensor'] = None
                self._regularization_method_dict['FP16_LODTensor'][
                    param_group_idx
                ].append(regularization_method)
                self._regularization_coeff_dict['FP16_LODTensor'][
                    param_group_idx
                ].append(regularization_coeff)
            else:
                raise ValueError(
                    "Now multi_tensor_momentum only support fp32, fp16 or bf16 parameters and grad is LOD_TENSOR."
                )

    def _append_optimize_multi_tensor_op(
        self, target_block, parameters_and_grads, param_group_idx
    ):
        """
        For Multi Tensor, append optimize merged_operator to block.
        """
        assert isinstance(target_block, framework.Block)

        grad_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
        lr_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}

        if isinstance(parameters_and_grads, list):
            for param_and_grad in parameters_and_grads:
                if param_and_grad[1] is None:
                    continue
                if param_and_grad[0].stop_gradient is False:
                    if (
                        param_and_grad[0].dtype == paddle.float32
                        and param_and_grad[1].type
                        == core.VarDesc.VarType.LOD_TENSOR
                    ):
                        grad_dict['FP32_LODTensor'].append(param_and_grad[1])
                        lr = self._create_param_lr(param_and_grad)
                        lr_dict['FP32_LODTensor'].append(lr)
                    elif (
                        self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
                        and param_and_grad[1].type
                        == core.VarDesc.VarType.LOD_TENSOR
                    ):
                        grad_dict['FP16_LODTensor'].append(param_and_grad[1])
                        lr = self._create_param_lr(param_and_grad)
                        lr_dict['FP16_LODTensor'].append(lr)
        else:
            for param_and_grad in parameters_and_grads['params']:
                if param_and_grad[1] is None:
                    continue
                if param_and_grad[0].stop_gradient is False:
                    param_grad_dict = {}
                    param_grad_dict['params'] = param_and_grad
                    param_grad_dict.update(
                        {
                            k: v
                            for k, v in parameters_and_grads.items()
                            if k != 'params'
                        }
                    )
                    param_and_grad = self._update_param_group(param_grad_dict)
                    if (
                        param_and_grad[0].dtype == paddle.float32
                        and param_and_grad[1].type
                        == core.VarDesc.VarType.LOD_TENSOR
                    ):
                        grad_dict['FP32_LODTensor'].append(param_and_grad[1])
                        lr = self._create_param_lr(param_and_grad)
                        lr_dict['FP32_LODTensor'].append(lr)
                    elif (
                        self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
                        and param_and_grad[1].type
                        == core.VarDesc.VarType.LOD_TENSOR
                    ):
                        grad_dict['FP16_LODTensor'].append(param_and_grad[1])
                        lr = self._create_param_lr(param_and_grad)
                        lr_dict['FP16_LODTensor'].append(lr)

        multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
        for key in multi_tensor_list:
            if len(self._param_dict[key][param_group_idx]) > 0:
                find_master = self._multi_precision and key == 'FP16_LODTensor'

                master_weight = self._master_weight_dict[key]
                master_weight = (
                    master_weight[param_group_idx]
                    if master_weight is not None
                    else None
                )

                if in_dynamic_or_pir_mode():
                    found_inf = self._get_auxiliary_var('found_inf')
                    if found_inf:
                        if isinstance(
                            found_inf, (core.eager.Tensor, paddle.pir.OpResult)
                        ):
                            self._set_auxiliary_var('found_inf', True)
                    else:
                        if isinstance(
                            found_inf, (core.eager.Tensor, paddle.pir.OpResult)
                        ):
                            self._set_auxiliary_var('found_inf', False)
                        _, _, _ = _C_ops.merged_momentum_(
                            self._param_dict[key][param_group_idx],
                            grad_dict[key],
                            self._velocity_dict[key][param_group_idx],
                            lr_dict[key],
                            master_weight,
                            self._momentum,
                            self._use_nesterov,
                            self._regularization_method_dict[key][
                                param_group_idx
                            ],
                            self._regularization_coeff_dict[key][
                                param_group_idx
                            ],
                            find_master,
                            self._rescale_grad,
                        )
                else:
                    inputs = {
                        "Param": self._param_dict[key][param_group_idx],
                        "Grad": grad_dict[key],
                        "Velocity": self._velocity_dict[key][param_group_idx],
                        "LearningRate": lr_dict[key],
                    }
                    outputs = {
                        "ParamOut": self._param_dict[key][param_group_idx],
                        "VelocityOut": self._velocity_dict[key][
                            param_group_idx
                        ],
                    }
                    attrs = {
                        "mu": self._momentum,
                        "use_nesterov": self._use_nesterov,
                        "regularization_method": (
                            self._regularization_method_dict[key][
                                param_group_idx
                            ]
                        ),
                        "regularization_coeff": (
                            self._regularization_coeff_dict[key][
                                param_group_idx
                            ]
                        ),
                    }
                    if find_master:
                        inputs["MasterParam"] = self._master_weight_dict[key][
                            param_group_idx
                        ]
                        outputs["MasterParamOut"] = self._master_weight_dict[
                            key
                        ][param_group_idx]
                        attrs["multi_precision"] = find_master
                    target_block.append_op(
                        type="merged_momentum",
                        inputs=inputs,
                        outputs=outputs,
                        attrs=attrs,
                        stop_gradient=True,
                    )
        return None

    def _update_param_group(self, parameters):
        self._momentum = parameters.get(
            'momentum', self._default_dict['momentum']
        )
        self._use_nesterov = parameters.get(
            'use_nesterov', self._default_dict['use_nesterov']
        )
        self._rescale_grad = parameters.get(
            'rescale_grad', self._default_dict['rescale_grad']
        )
        self._regularization_method = parameters.get(
            'regularization_method',
            self._default_dict['regularization_method'],
        )
        self._regularization_coeff = parameters.get(
            'regularization_coeff',
            self._default_dict['regularization_coeff'],
        )
        parameters = parameters.get('params')
        return parameters