from paddle import _C_ops
from paddle.base.executor import global_scope

from ..base import core, framework
from ..base.framework import Variable
from .optimizer import Optimizer

__all__ = []


class Lamb(Optimizer):
    r"""
    LAMB (Layer-wise Adaptive Moments optimizer for Batch training) Optimizer.

    The LAMB optimizer is designed to scale up the batch size of training without losing
    accuracy. It supports adaptive element-wise updating and accurate layer-wise
    correction. For more information, please refer to `Large Batch Optimization for
    Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .

    The updating of parameters follows:

    ..  math::

        m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t

        v_t &= \beta_2 v_{t - 1}  + (1 - \beta_2)g_t^2

        m_t &= \frac{m_t}{1 - \beta_1^t}

        v_t &= \frac{v_t}{1 - \beta_2^t}

        r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon}

        w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1})


    where :math:`m` is the 1st moment, :math:`v` is the 2nd moment, :math:`\eta` is the
    learning rate, and :math:`\lambda` is the LAMB weight decay rate.

    Args:
        learning_rate (float|Variable, optional): the learning rate used to update parameters. \
            Can be a float value or a Variable with data type float32. Default 0.001.
        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. Note that the base
            optimizer's ``weight_decay`` is not used by Lamb and is always kept as None; decay is
            controlled entirely by ``lamb_weight_decay``.
        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
            Default 0.9.
        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
            Default 0.999.
        epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
        parameters (Iterable, optional): Iterable of ``Variable`` objects (the parameters) to update \
            in order to minimize ``loss``. This argument is required in dygraph mode. You can also \
            specify different options for different parameter groups, such as the learning rate and \
            weight decay, by passing a list of dicts instead. Note that the learning_rate of a parameter \
            group is a scale factor applied to the base learning_rate. \
            The default value is None in static graph mode, in which case all parameters are updated.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_base_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_base_clip_ClipGradByNorm` ,
            :ref:`api_paddle_base_clip_ClipGradByValue` ). If you want better convergence, it is recommended
            to use :ref:`api_paddle_base_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping.
        exclude_from_weight_decay_fn (function, optional): A function that takes a parameter as input and
            returns True if weight decay should be skipped for that parameter (see the second example below).
            Default None.
        always_adapt (bool, optional): whether to always apply layer-wise LR adaptation. By default,
            adaptation is skipped for parameters that are excluded from weight decay; if ``always_adapt``
            is True, adaptation is always enabled. Default False.
        name (str|None, optional): For detailed information, please refer to
            :ref:`api_guide_Name` . Normally there is no need to set this; it defaults to None.
    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1)
            >>> linear = paddle.nn.Linear(10, 10)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> lamb = paddle.optimizer.Lamb(learning_rate=0.002, beta1=0.9, beta2=0.85, parameters=linear.parameters(), lamb_weight_decay=0.01)
            >>> loss.backward()
            >>> lamb.step()
            >>> lamb.clear_grad()
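
        The clipping and weight-decay hooks can be combined. The snippet below is a minimal
        sketch (it reuses the ``linear`` layer and ``inp`` tensor from the example above, and
        assumes that bias parameters should not be decayed):

        .. code-block:: python

            >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            >>> lamb = paddle.optimizer.Lamb(
            ...     learning_rate=0.002,
            ...     parameters=linear.parameters(),
            ...     lamb_weight_decay=0.01,
            ...     grad_clip=clip,
            ...     exclude_from_weight_decay_fn=lambda p: 'bias' in p.name,
            ... )
            >>> loss = paddle.mean(linear(inp))
            >>> loss.backward()
            >>> lamb.step()
            >>> lamb.clear_grad()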

    moment1moment2beta1_pow_accbeta2_pow_accMbP?{Gz??+?ư>NFc                    s   |d usJ |d usJ |d usJ |d usJ t  j||d ||d d| _|| _|| _|| _|| _|| _|||||d| _i | _	i | _
|	| _|
| _d S )N)learning_rate
parametersweight_decay	grad_clipnameZlamb)beta1beta2epsilonlamb_weight_decayexclude_from_weight_decay_fn)super__init__type_beta1_beta2_epsilon_lamb_weight_decay_exclude_from_weight_decay_fn_default_dict_master_weights_used_master_weights_multi_precisionalways_adapt)selfr   r   r   r   r   r   r   r   multi_precisionr*   r   	__class__ V/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/optimizer/lamb.pyr   e   s6   
zLamb.__init__c                 C   sz   |d u rt  }|| }| j|}|d ur7|| }| | ks)J | | ks3J ||fS d }||fS N)r   Zfind_varZ
get_tensorr(   getZ_dtypeshape)r+   r   scopeZp_tZmaster_nameZ
master_p_tr/   r/   r0   _get_parameter   s   zLamb._get_parameterc                 C   s   t |tjsJ t |tr| |}|D ]0}|j| jv rq| jr8| |j	r8| 
|}| | | j|j q| | | j|j qd S r1   )
isinstancer   Blockdict_update_param_groupr   Z_already_create_accumulaterr)   _is_dtype_fp16_or_bf16dtypeZ_create_master_weight_add_moments_powsadd)r+   blockr   pZmaster_pr/   r/   r0   _create_accumulators   s   




zLamb._create_accumulatorsc              	   C   s   |j }| |rtjjj}| j| j||d | j| j||d | j| j	||t
| jtr-dn| jdgtjjjdd | j| j||t
| jtrGdn| jdgtjjjdd d S )N)r;   r   r   cpu)r   paramr;   Z
fill_valuer3   r    Zdevicer   )r;   r:   r   ZVarDescZVarTypeZFP32Z_add_accumulator_moment1_acc_str_moment2_acc_str_beta1_pow_acc_strr6   r!   r   Z
LOD_TENSOR_beta2_pow_acc_strr"   )r+   r?   Z	acc_dtyper/   r/   r0   r<      s6   




zLamb._add_moments_powsc                 C   s  t |tjsJ t |tr| |}d|j_| | j|d }| | j	|d }| | j
|d }| | j|d }| jd urI| |d rId}n| j}| |}| jo[| |d j}	|d j}
|	ro| j|
 }|j| j|
< nd }t rt|d |d ||||||d || j| j| j| j|	 d S |d |d |||||d}|d ||||d}| j| j| j|| j|	d}|	r||d< ||d	< | d
}|r||d< |j| j|||dd}|S )NTr   g        r   )ParamZGradZLearningRateZMoment1ZMoment2ZBeta1PowZBeta2Pow)ZParamOutZ
Moment1OutZ
Moment2OutZBeta1PowOutZBeta2PowOut)r   r   r   r   r*   r,   ZMasterParamZMasterParamOut	found_infZ
SkipUpdate)r    inputsoutputsattrsZstop_gradient)r6   r   r7   r8   r9   programZ	_use_lambZ_get_accumulator_masterrC   rD   rE   rF   r%   r$   Z_create_param_lrr)   r:   r;   r   r'   r(   Zin_dygraph_moder   Zlamb_r!   r"   r#   r*   Z_get_auxiliary_varZ	append_opr    )r+   r>   Zparam_and_gradr   r   r   r   r   lrZfind_masterZp_nameZmaster_weightrI   rJ   rK   rH   Zlamb_opr/   r/   r0   _append_optimize_op   s   











	
zLamb._append_optimize_opc                 C   sr   | d| jd | _| d| jd | _| d| jd | _| d| jd | _| d| jd | _| d}|S )Nr   r   r   r   r   params)r2   r&   r!   r"   r#   r$   r%   )r+   r   r/   r/   r0   r9   6  s   

zLamb._update_param_group)r   r   r   r   r   NNNFFNr1   )__name__
__module____qualname____doc__rC   rD   rE   rF   r   r5   r@   r<   rN   r9   __classcell__r/   r/   r-   r0   r
      s.    F
,gr
   N)Zpaddler   Zpaddle.base.executorr   baser   r   Zbase.frameworkr   Z	optimizerr	   __all__r
   r/   r/   r/   r0   <module>   s   