import warnings

from paddle import _C_ops

from ..base import framework
from ..base.dygraph import no_grad
from ..framework import in_dynamic_mode
from .optimizer import Optimizer

__all__ = []


class Adadelta(Optimizer):
    r"""
    **Notes: This API does not support sparse parameter optimization.**

    Adadelta Optimizer. Please refer to this for details:
    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.

    The update is done as follows:

    .. math::

        E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2

        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }

        E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2
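
    One step of the formulas above, written element-wise, can be sketched in plain
    NumPy as below. This is illustration only: the array names are ad hoc, and the
    optimizer itself additionally scales the resulting step by the base
    ``learning_rate`` argument described under Args.

    .. code-block:: python

        >>> import numpy as np
        >>> param = np.array([1.0, 2.0])
        >>> grad = np.array([0.1, -0.2])
        >>> rho, epsilon = 0.95, 1.0e-6
        >>> # running averages kept by the optimizer, one pair per parameter
        >>> avg_squared_grad = np.zeros_like(param)
        >>> avg_squared_update = np.zeros_like(param)
        >>> # E(g_t^2)
        >>> avg_squared_grad = rho * avg_squared_grad + (1 - rho) * grad**2
        >>> # the adaptive factor written as learning_rate in the formulas above
        >>> scale = np.sqrt((avg_squared_update + epsilon) / (avg_squared_grad + epsilon))
        >>> delta = -scale * grad
        >>> # E(dx_t^2) and the parameter move
        >>> avg_squared_update = rho * avg_squared_update + (1 - rho) * delta**2
        >>> param = param + delta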

    Args:
        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
        epsilon (float, optional): A small float value for numerical stability. The default value is 1.0e-6.
        rho (float, optional): A floating point value indicating the decay rate. The default value is 0.95.
        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. You can also specify different options \
            (such as the learning rate or weight decay) for different parameter groups by passing \
            a list of dicts. Note that the learning_rate in a parameter group \
            represents a scale applied to the base learning_rate. \
            The default value is None in static graph mode, in which case all parameters are updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value as the coefficient of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient
            clipping. A short usage sketch follows this argument list.
        name (str, optional): The default value is None. Normally there is no need for users
            to set this property. For more information, please refer to
            :ref:`api_guide_Name` .
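
    A gradient clipping strategy is passed through ``grad_clip``. The snippet below is a
    minimal sketch using :ref:`api_paddle_nn_ClipGradByGlobalNorm`; the other two clipping
    classes listed above are passed the same way.

    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(10, 10)
        >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
        >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        >>> adadelta = paddle.optimizer.Adadelta(
        ...     learning_rate=0.1,
        ...     parameters=linear.parameters(),
        ...     grad_clip=clip)
        >>> loss = paddle.mean(linear(inp))
        >>> loss.backward()
        >>> adadelta.step()
        >>> adadelta.clear_grad()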

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
            >>> linear = paddle.nn.Linear(10, 10)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
            >>> out.backward()
            >>> adadelta.step()
            >>> adadelta.clear_grad()

            >>> # Note that the learning_rate of linear_2 is 0.01.
            >>> linear_1 = paddle.nn.Linear(10, 10)
            >>> linear_2 = paddle.nn.Linear(10, 10)
            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            >>> out = linear_1(inp)
            >>> out = linear_2(out)
            >>> loss = paddle.mean(out)
            >>> adadelta = paddle.optimizer.Adadelta(
            ...     learning_rate=0.1,
            ...     parameters=[{
            ...         'params': linear_1.parameters()
            ...     }, {
            ...         'params': linear_2.parameters(),
            ...         'weight_decay': 0.001,
            ...         'learning_rate': 0.1,
            ...     }],
            ...     weight_decay=0.01)
            >>> out.backward()
            >>> adadelta.step()
            >>> adadelta.clear_grad()

    Z_avg_squared_gradZ_avg_squared_updateMbP?ư>ffffff?Nc                    st   |d u rt d|d u rt d|d u rt dt j|||||d d| _i | _d| _|| _|| _||d| _d S )Nzlearning_rate is not set.zepsilon is not set.zrho is not set.)learning_rate
parametersweight_decay	grad_clipnameFZadadelta)epsilonrho)	
ValueErrorsuper__init___multi_precision_master_weightstype_epsilon_rho_default_dict)selfr   r   r   r   r   r   r   	__class__ Z/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/optimizer/adadelta.pyr   p   s*   
zAdadelta.__init__c                 C   s   t |tjs
tdt |tr|d}|D ]P}|j| jv rq| jrC| 	|j
rC| |}| | j| | | j| | j|j q| 	|j
rQ| jsQtd | | j| | | j| | j|j qd S )N)block is not instance of framework.Block.paramszAccumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Lars optimizer.)
isinstancer   Block	TypeErrordictgetr   Z_already_create_accumulaterr   _is_dtype_fp16_or_bf16dtypeZ_create_master_weightZ_add_accumulator_avg_squared_grad_acc_str_avg_squared_update_acc_straddwarningswarn)r   blockr   pZmaster_pr    r    r!   _create_accumulators   s4   



zAdadelta._create_accumulatorsc           
      C   s@  t |tr
| |}| | j|d }| | j|d }| jo&| |d j}|r1| j	|d j
 nd }t rbt  t|d |d ||| ||| j| j|	 W d    d S 1 s[w   Y  d S t |tjsltd|d |d ||| |d}|d ||d}|r||d< ||d< |j| j||| j| j|dd	d
}	|	S )Nr   r   r"   )ParamZGradZAvgSquaredGradZAvgSquaredUpdateZLearningRate)ZParamOutZAvgSquaredGradOutZAvgSquaredUpdateOutZMasterParamZMasterParamOut)r   r   Zmulti_precisionT)r   inputsoutputsattrsZstop_gradient)r$   r'   _update_param_groupZ_get_accumulator_masterr+   r,   r   r)   r*   r   r   r   r   r   Z	adadelta_Z_create_param_lrr   r   r   r%   r&   Z	append_opr   )
r   r0   Zparam_and_gradZavg_squared_grad_accZavg_squared_update_accZfind_masterZmaster_weightr4   r5   Zadadelta_opr    r    r!   _append_optimize_op   st   





zAdadelta._append_optimize_opc                 C   s6   | d| jd | _| d| jd | _| d}|S )Nr   r   r#   )r(   r   r   r   )r   r   r    r    r!   r7      s   
zAdadelta._update_param_group)r
   r   r   NNNN)__name__
__module____qualname____doc__r+   r,   r   r2   r8   r7   __classcell__r    r    r   r!   r	      s    Q!Cr	   )r.   Zpaddler   baser   Zbase.dygraphr   r   Z	optimizerr   __all__r	   r    r    r    r!   <module>   s   