import contextlib
import copy
import weakref

import paddle
from paddle import framework
from paddle.autograd import PyLayer
from paddle.base.framework import EagerParamBase
from paddle.distributed.fleet.meta_parallel.parallel_layers.random import (
    get_rng_state_tracker,
)
from paddle.framework import core, in_dynamic_mode

from ..utils.log_util import logger

__all__ = []


def _varbase_help(param):
    state = copy.deepcopy(param.__dict__)
    new_param = EagerParamBase(
        shape=param.shape, dtype=param.dtype, name=param.name, **state
    )
    param._share_buffer_to(new_param)
    return new_param


def detach_variable(inputs):
    out = []
    for inp in inputs:
        # pass through everything that is neither a Tensor nor a tuple of Tensors
        if not isinstance(inp, core.eager.Tensor) and (
            type(inp) is not tuple or not isinstance(inp[0], core.eager.Tensor)
        ):
            out.append(inp)
            continue

        if isinstance(inp, EagerParamBase):
            out.append(_varbase_help(inp))
            continue

        if type(inp) is tuple:
            detach_inp = []
            for i in inp:
                # detach every tensor inside the tuple
                assert isinstance(i, core.eager.Tensor)
                if isinstance(i, EagerParamBase):
                    detach_inp.append(_varbase_help(i))
                else:
                    tmp_i = i.detach()
                    tmp_i.stop_gradient = i.stop_gradient
                    detach_inp.append(tmp_i)
            out.append(tuple(detach_inp))
            continue

        x = inp.detach()
        x.stop_gradient = inp.stop_gradient
        out.append(x)
    return tuple(out)


def check_recompute_necessary(inputs):
    necessary_for_each_input = []
    for input_ in inputs:
        if isinstance(input_, (core.eager.Tensor, paddle.Tensor)):
            necessary_for_each_input.append(input_.stop_gradient)
        elif type(input_) is tuple:
            for i in input_:
                if isinstance(i, (core.eager.Tensor, paddle.Tensor)):
                    necessary_for_each_input.append(i.stop_gradient)
    if all(necessary_for_each_input):
        logger.warning(
            "[Recompute]: None of the inputs to current recompute block need grad, "
            "therefore there is NO need to recompute this block in backward !"
        )


@contextlib.contextmanager
def swith_rng_state_tracker(rng_state, tracker):
    orig_rng_state = paddle.get_rng_state()
    orig_rng_tracker = get_rng_state_tracker().get_states_tracker()
    paddle.set_rng_state(rng_state)
    get_rng_state_tracker().set_states_tracker(tracker)
    try:
        yield
    finally:
        paddle.set_rng_state(orig_rng_state)
        get_rng_state_tracker().set_states_tracker(orig_rng_tracker)


class RecomputeFunction(PyLayer):
    @staticmethod
    def forward(ctx, run_function, preserve_rng_state, *args, **kwargs):
        # store what is needed to rerun the forward pass in backward()
        ctx.run_function = run_function
        ctx.preserve_rng_state = preserve_rng_state
        ctx.kwargs = kwargs

        # save tensor inputs for backward, and keep placeholders for them in ctx.inputs
        ctx.inputs = []
        ctx.tensor_indices = []
        ctx.duplicate_tensor = [False for _ in range(len(args))]
        tensor_inputs = []
        for i, arg in enumerate(args):
            if paddle.is_tensor(arg):
                tensor_inputs.append(arg)
                ctx.tensor_indices.append(i)
                ctx.inputs.append(None)
            elif type(arg) is tuple:
                is_tensors = [paddle.is_tensor(a) for a in arg]
                if all(is_tensors):
                    # the argument is a tuple of tensors
                    tensors_stop_gradient = [a.stop_gradient for a in arg]
                    if not all(tensors_stop_gradient) and any(
                        tensors_stop_gradient
                    ):
                        raise ValueError(
                            "Recompute receive a tuple containing tensor holds different stop gradient."
                        )
                    tensor_inputs.append(arg)
                    ctx.tensor_indices.append(i)
                    ctx.duplicate_tensor[i] = True
                    ctx.inputs.append(None)
                elif any(is_tensors):
                    raise ValueError(
                        "Recompute receive a tuple containing tensor and non-tensor at same time."
                    )
                else:
                    ctx.inputs.append(arg)
            else:
                ctx.inputs.append(arg)
        ctx.save_for_backward(*tensor_inputs)

        # record the RNG state so that dropout etc. are reproduced during recomputation
        if ctx.preserve_rng_state:
            ctx.fw_rng_state = paddle.get_rng_state()
            ctx.fwd_rng_state_tracker = (
                get_rng_state_tracker().get_states_tracker()
            )

        # save the AMP config so that the recomputation runs under the same auto-cast mode
        tracer = framework._dygraph_tracer()
        ctx.is_fw_autocast = (
            False if tracer._amp_level == core.AmpLevel.O0 else True
        )
        if tracer._amp_level == core.AmpLevel.O2:
            ctx.amp_level = 'O2'
        elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0):
            ctx.amp_level = 'O1'
        else:
            raise ValueError(f"unsupported amp level: {tracer._amp_level}")

        if tracer._amp_dtype == 'float16':
            ctx.amp_dtype = 'float16'
        elif tracer._amp_dtype in ('bfloat16', 'float32'):
            ctx.amp_dtype = 'bfloat16'
        else:
            raise ValueError(f"unsupported amp dtype: {tracer._amp_dtype}")

        ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()

        with paddle.no_grad():
            outputs = run_function(*args, **kwargs)
        return outputs

    @staticmethod
    def backward(ctx, *args):
        with paddle.base.dygraph.guard():
            # restore the saved tensor inputs at their original positions
            inputs = list(ctx.inputs)
            tensor_indices = ctx.tensor_indices
            duplicate_tensor = ctx.duplicate_tensor
            tensors = ctx.saved_tensor()
            for i, idx in enumerate(tensor_indices):
                inputs[idx] = tensors[i]

            tracer = framework._dygraph_tracer()
            tracer._has_grad = True

            # rerun the forward pass under the recorded RNG state and AMP config
            if ctx.preserve_rng_state:
                with swith_rng_state_tracker(
                    ctx.fw_rng_state, ctx.fwd_rng_state_tracker
                ):
                    with paddle.amp.auto_cast(
                        enable=ctx.is_fw_autocast,
                        custom_white_list=ctx.amp_white_list,
                        custom_black_list=ctx.amp_black_list,
                        level=ctx.amp_level,
                        dtype=ctx.amp_dtype,
                    ):
                        detached_inputs = detach_variable(tuple(inputs))
                        outputs = ctx.run_function(
                            *detached_inputs, **ctx.kwargs
                        )
            else:
                with paddle.amp.auto_cast(
                    enable=ctx.is_fw_autocast,
                    custom_white_list=ctx.amp_white_list,
                    custom_black_list=ctx.amp_black_list,
                    level=ctx.amp_level,
                    dtype=ctx.amp_dtype,
                ):
                    detached_inputs = detach_variable(tuple(inputs))
                    outputs = ctx.run_function(*detached_inputs, **ctx.kwargs)

            if isinstance(outputs, core.eager.Tensor):
                outputs = (outputs,)
            assert len(outputs) == len(args)

            # run backward() only on the outputs that require grad
            forward_outputs_with_grad = []
            backward_inputs_with_grad = []
            for i in range(len(outputs)):
                if (
                    isinstance(outputs[i], core.eager.Tensor)
                    and not outputs[i].stop_gradient
                ):
                    forward_outputs_with_grad.append(outputs[i])
                    backward_inputs_with_grad.append(args[i])

            if len(forward_outputs_with_grad) == 0:
                raise RuntimeError(
                    "none of output has requires_grad=True, this recompute() is not necessary"
                )

            with paddle.amp.auto_cast(enable=False):
                paddle.autograd.backward(
                    forward_outputs_with_grad, backward_inputs_with_grad
                )

            # collect the gradients of the (detached) tensor inputs
            grads = []
            for idx, inp in enumerate(detached_inputs):
                if isinstance(inp, core.eager.Tensor):
                    grads.append(inp._grad_ivar())
                elif type(inp) is tuple and duplicate_tensor[idx]:
                    if all(i.stop_gradient for i in inp):
                        grads.append(None)
                    else:
                        grads.append(tuple(i._grad_ivar() for i in inp))

            if in_dynamic_mode():
                grads = tuple(grads)
            else:
                grads = list(grads)
            return grads


def _recompute_without_reentrant(
    function, preserve_rng_state=True, *args, **kwargs
):
    """
    recompute without reentrant, that means use hook to implement the recompute function rather than re-entrant autograd.
    """
    if preserve_rng_state:
        cur_device = paddle.get_device()
        if 'gpu:' in cur_device:
            fw_cuda_rng_state = paddle.get_cuda_rng_state()
        elif 'xpu:' in cur_device:
            fw_cuda_rng_state = paddle.get_rng_state()
        elif (
            cur_device.split(':')[0]
            in paddle.device.get_all_custom_device_type()
        ):
            fw_cuda_rng_state = paddle.get_rng_state(cur_device)
        else:
            raise RuntimeError(
                "Recompute with RNG perserve is not support current device: {}.".format(
                    cur_device
                )
            )
        fwd_cuda_rng_state_tracker = (
            get_rng_state_tracker().get_states_tracker()
        )

    tracer = framework._dygraph_tracer()
    is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True
    if tracer._amp_level == core.AmpLevel.O2:
        amp_level = 'O2'
    elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0):
        amp_level = 'O1'

    if tracer._amp_dtype == 'float16':
        amp_dtype = 'float16'
    elif tracer._amp_dtype in ('bfloat16', 'float32'):
        amp_dtype = 'bfloat16'

    amp_white_list, amp_black_list = tracer._get_amp_op_list()

    class Intermediate_Holder:
        pass

    storage = weakref.WeakKeyDictionary()
    holder_list = []

    def pack(x):
        # during the first forward pass only remember a placeholder for every saved tensor
        res = Intermediate_Holder()
        holder_list.append(weakref.ref(res))
        return res

    def unpack(x):
        # on the first retrieval in backward, rerun the forward pass once and fill `storage`
        unpack_counter = 0
        if len(storage) == 0:

            def inner_pack(inner_x):
                nonlocal unpack_counter
                unpack_counter += 1

                if holder_list[unpack_counter - 1]() is None:
                    return

                tmp_tensor = core.eager.Tensor(
                    inner_x.dtype,
                    inner_x.shape,
                    inner_x.name + "cpy",
                    core.VarDesc.VarType.LOD_TENSOR,
                    inner_x.persistable,
                )
                inner_x._share_buffer_to(tmp_tensor)
                storage[holder_list[unpack_counter - 1]()] = tmp_tensor
                return

            def inner_unpack(inner_x):
                raise Exception("An unexcepted backward called on a tensor!")

            if preserve_rng_state:
                with swith_rng_state_tracker(
                    fw_cuda_rng_state, fwd_cuda_rng_state_tracker
                ):
                    with paddle.set_grad_enabled(True):
                        with paddle.amp.auto_cast(
                            enable=is_fw_autocast,
                            custom_white_list=amp_white_list,
                            custom_black_list=amp_black_list,
                            level=amp_level,
                            dtype=amp_dtype,
                        ):
                            with paddle.autograd.saved_tensors_hooks(
                                inner_pack, inner_unpack
                            ):
                                unused_outputs = function(*args, **kwargs)
            else:
                with paddle.set_grad_enabled(True):
                    with paddle.amp.auto_cast(
                        enable=is_fw_autocast,
                        custom_white_list=amp_white_list,
                        custom_black_list=amp_black_list,
                        level=amp_level,
                        dtype=amp_dtype,
                    ):
                        with paddle.autograd.saved_tensors_hooks(
                            inner_pack, inner_unpack
                        ):
                            unused_outputs = function(*args, **kwargs)

        if x not in storage:
            raise Exception(
                "Not supported to retrieve a tensor saved by autograd multiple times that is no need to recompute."
            )
        return storage[x]

    with paddle.autograd.saved_tensors_hooks(pack, unpack):
        outputs = function(*args, **kwargs)

    return outputs


def recompute(function, *args, **kwargs):
    """
    recompute intermediate activations to save memory.

    Parameters:
        function(paddle.nn.Layer): layer or sequence of layers that describes part of the forward pass of the model
              whose intermediate activations will be released to save memory in forward stage and will be recomputed
              in backward stage for gradient calculation.
        *args(Tensor): inputs to the function.
        **kwargs(Dict): Kwargs should only contain two kinds of key-value pairs: pairs that belong to function's own
                        keyword arguments, and the optional control keys 'preserve_rng_state' and 'use_reentrant'.
                        The key 'preserve_rng_state' indicates whether to save the forward RNG state. If it is True,
                        the saved RNG state is restored when the forward pass is recomputed during backpropagation;
                        its default value is True. The key 'use_reentrant' selects which recompute implementation is
                        used: 'use_reentrant=True' means the PyLayer-based implementation, 'use_reentrant=False' means
                        the hook-based implementation (a short sketch of this variant is appended at the end of the
                        Examples section below); its default value is True.
    Returns:
        Output of function on args.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:DISTRIBUTED, env:GPU)
            >>> import paddle
            >>> from paddle.distributed.fleet.utils import recompute
            >>> import random
            >>> paddle.seed(2023)
            >>> def get_fc_block(block_idx, input_size, is_last=False):
            ...     block_name = "block_" + str(block_idx)
            ...     block = paddle.nn.Sequential(
            ...         (block_name + "_fc_0", paddle.nn.Linear(input_size, input_size, bias_attr=False)),
            ...         (block_name + "_dropout", paddle.nn.Dropout(p=0.5)),
            ...         (block_name + "_relu_1", paddle.nn.ReLU()),
            ...         (block_name + "_fc_1", paddle.nn.Linear(input_size, input_size, bias_attr=False)),
            ...         (block_name + "_relu_2", paddle.nn.ReLU()),
            ...     )
            ...     if is_last:
            ...         block.add_sublayer(
            ...             block_name + "_fc_2",
            ...             paddle.nn.Linear(
            ...                 input_size, 1, bias_attr=False
            ...             )
            ...         )
            ...     else:
            ...         block.add_sublayer(
            ...             block_name + "_fc_2",
            ...             paddle.nn.Linear(input_size, input_size, bias_attr=False)
            ...         )
            ...     return block

            >>> class Naive_fc_net(paddle.nn.Layer):
            ...     def __init__(self, input_size=10,
            ...                 recompute_blocks=[1, 3],
            ...                 recompute_kwargs={}):
            ...         super().__init__()
            ...         self.recompute_blocks = recompute_blocks
            ...         self.recompute_kwargs = recompute_kwargs
            ...         self.runfunc0 = get_fc_block(0, input_size, is_last=False)
            ...         self.runfunc1 = get_fc_block(1, input_size, is_last=False)
            ...         self.runfunc2 = get_fc_block(2, input_size, is_last=False)
            ...         self.runfunc3 = get_fc_block(3, input_size, is_last=False)
            ...         self.runfunc4 = get_fc_block(4, input_size, is_last=True)
            ...         self.total_func = [self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, self.runfunc4]
            ...     def forward(self, inputs):
            ...         nums = len(self.total_func)
            ...         for i in range(nums):
            ...             if i in self.recompute_blocks:
            ...                 inputs = recompute(self.total_func[i], inputs, **{"preserve_rng_state": True})
            ...             else:
            ...                 inputs = self.total_func[i](inputs)
            ...         return inputs

            >>> def run_model(cuda_state, recompute_block=[], recompute_kwargs={}):
            ...     gen = paddle.seed(10)
            ...     gen.manual_seed(10)
            ...     random.seed(10)
            ...     if cuda_state:
            ...         paddle.set_cuda_rng_state(cuda_state)
            ...     batch_size, input_size = 1, 10
            ...     model = Naive_fc_net(
            ...         input_size,
            ...         recompute_blocks=recompute_block,
            ...         recompute_kwargs=recompute_kwargs)
            ...     optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
            ...     loss_ = []
            ...     param_ = []
            ...     grad_ = []
            ...     for _ in range(5):
            ...         x = paddle.rand(shape=[batch_size, input_size], dtype="float32")
            ...         y_pred = model(x)
            ...         loss = y_pred.mean()
            ...         loss_.append(loss.item())
            ...         loss.backward()
            ...         optimizer.step()
            ...         param_.append(model.parameters()[9])
            ...         grad_.append(model.parameters()[3]._grad_ivar())
            ...         optimizer.clear_grad()
            ...     return loss_, param_, grad_

            >>> cuda_state = paddle.get_cuda_rng_state()
            >>> # without recompute
            >>> loss_ref, param_ref, grad_ref = run_model(
            ...     cuda_state, recompute_block=[]
            ... )

            >>> loss, param, grad = run_model(cuda_state, recompute_block=[1, 2])
            >>> print("normal_loss: {}, recompute_loss: {}".format(loss_ref, loss))
            >>> # The result of the recompute_loss should be the same as the normal_loss.
            normal_loss: [0.0018744759727269411, 0.0, 0.035971127450466156, 0.0, 0.0], recompute_loss: [0.0018744759727269411, 0.0, 0.035971127450466156, 0.0, 0.0]
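
            >>> # A minimal extra sketch (not part of the original sample): calling recompute
            >>> # directly with use_reentrant=False selects the hook-based, non-reentrant variant.
            >>> # It reuses the get_fc_block helper defined above.
            >>> block = get_fc_block(5, 10, is_last=True)
            >>> x = paddle.rand(shape=[1, 10], dtype="float32")
            >>> x.stop_gradient = False
            >>> y = recompute(block, x, use_reentrant=False)
            >>> y.mean().backward()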

    """
    # strip the control kwargs before they reach `function`
    preserve = kwargs.pop('preserve_rng_state', True)
    use_reentrant = kwargs.pop('use_reentrant', True)

    if kwargs and use_reentrant:
        raise ValueError(
            "Error, if you want to send kwargs(dict parameter) to function, please set use_reentrant=False."
        )

    if framework._dygraph_tracer()._has_grad:
        check_recompute_necessary(args)

    if use_reentrant:
        return RecomputeFunction.apply(function, preserve, *args)
    else:
        return _recompute_without_reentrant(
            function, preserve, *args, **kwargs
        )


def recompute_sequential(ctx, functions, *args, **kwargs):
    """
    recompute intermediate activations to save memory for 'Sequential' models. Use 'ctx' to transmit some context parameters; it is similar to the 'recompute_hybrid' API.

    Parameters:
        ctx(dict): includes the 'segments' and 'preserve_rng_state' keys. The key 'segments' (int, default 1) is the number of chunks to create in the model;
                   the key 'preserve_rng_state' (bool, optional, default=True) indicates whether to save the forward rng. If it is True, the last forward rng value will be
                   restored when the forward recalculation of backpropagation is performed.
        functions(paddle.nn.Sequential): layer or sequence of layers that describes part of the forward pass of the model
              whose intermediate activations will be released to save memory in forward stage and will be recomputed
              in backward stage for gradient calculation.
        *args(Tensor): inputs(tuple) to the function.
        **kwargs(Dict): inputs(dict) to the function.

    Returns:
        Output of function on args and kwargs.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
            >>> import paddle
            >>> from paddle.incubate.distributed.fleet import recompute_sequential
            >>> input = paddle.ones(shape=[8, 10])
            >>> model = paddle.nn.Sequential(paddle.nn.Linear(10, 10), paddle.nn.Linear(10, 2))
            >>> output = recompute_sequential({'segments' : 1}, model, input)


    """
    segments = ctx.get('segments', 1)
    preserve_rng_state = ctx.get('preserve_rng_state', True)

    def _run_func(begin, end, funcs):
        def do_run(input):
            for i in range(begin, end + 1):
                input = funcs[i](input)
            return input

        return do_run

    if isinstance(functions, paddle.nn.Sequential):
        functions = list(functions.children())

    segment_size = len(functions) // segments

    end = -1
    for begin in range(0, segment_size * (segments - 1), segment_size):
        end = begin + segment_size - 1
        args = recompute(
            _run_func(begin, end, functions),
            *args,
            preserve_rng_state=preserve_rng_state,
            **kwargs,
        )
    return _run_func(end + 1, len(functions) - 1, functions)(*args)