o
    "j+                     @   s   d dl Z d dl mZ d dlmZ d dlmZ ddlmZ ddlm	Z	 dd	l
mZmZmZ g Zd
d Zdd ZG dd deZdd ZdS )    N)	framework)PyLayer)core   )get_rng_state_tracker)utils   )check_recompute_necessarydetach_variableswith_rng_state_trackerc           	      C   sz   |j }|j}|dk r| S t| }|dksJ d|| dks'J d|||  }|| }|| }|| }||| S )Nr   r   zcan't recompute zero elementzHThe capacity of the activation ({}) cannot be divisible by mp_degree({}))nranksrankpaddleZnumelformatZflatten_)	tensormp_group	mp_degreemp_rankZtensor_numeldataZ	part_sizestartend r   t/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/fleet/recompute/recompute_hybrid.py_split_activation   s    
r   c                 C   s`   |j }|j}|dk r| S t| j}|d  |j 9  < t|| j}|j| 	 |}|
  |S )Nr   r   )r   r   listshaper   emptydtypeZprocess_groupZ
all_gathercudawait)r   r   r   r   Ztensor_shapeouttaskr   r   r   _merge_activation6   s   
r"   c                   @   s(   e Zd ZdZedd Zedd ZdS )_HPRecomputeFunctionaz  
    Compared with paddle.distributed.fleet.utils.recompute, there are the following differences:
    1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type.
    2. Offload support for activation
    3. Support MP segmentation of activation to further reduce cuda memory
    4. Adapt to the random state of MP
    c                 O   s,  || _ || _t | _t  | _|| _|| _	|| _
g | _g | _g | _g }t }	dt v sIdt v sI|	dd tj v sIJ d|	 dt }
|
jtjjkrVdnd| _|
jtjjkrdd	| _n|
jtjjtjjfv rsd
| _ntd|
j |
j| _|
 \| _ | _!t"  ||i |}W d    n1 sw   Y  t#|D ]U\}}t$|r|j%}|r| j&|j' t(|) |* }|r|+ n|}n|r|+ n|}||_%|&| | j&| | j&d  t, r|r| -| q| j&| q| j.|  t$|r||g7 }|S ||7 }t/|S )Nzgpu:zxpu::r   z2Recompute with RNG is not support current device: .FTO2O1zunsupported amp level: )0run_functionkwargsr   Zget_rng_statefwd_rng_stater   Zget_states_trackerfwd_rng_state_trackerr   offload	partitioninputstensor_indicestensor_shapesZ
get_devicesplitZdeviceZget_all_custom_device_typer   _dygraph_tracerZ
_amp_levelr   ZAmpLevelZO0is_fw_autocastr&   	amp_levelr'   
ValueErrorZ
_amp_dtype	amp_dtypeZ_get_amp_op_listamp_white_listamp_black_listZno_grad	enumerate	is_tensorstop_gradientappendr   r   detachclonecpuZin_dynamic_modeZmark_non_differentiableZsave_for_backwardtuple)ctxr(   all_outputsr   r,   r-   argsr)   Ztensor_inputsZ
cur_devicetraceroutputsiargstater   r   r   forwardN   st   







z_HPRecomputeFunction.forwardc              
   G   sB  t jj  t| j}| j}| j}t|  }t j	
 j}t|D ]2\}}| jrE|| j}	t|| | j || ||< |	|| _| jrO|| |n|| ||< q#t }
d|
_t| j| jJ | jrt jj| j| j| j| j | j!d t"t#|}| j$|i | j%}W d    n1 sw   Y  nt"t#|}| j$|i | j%}W d    n1 sw   Y  t&|t'j(j)r|f}t*|t*|ksJ g }g }t+t*|D ]}t&|| t'j(j)r|| js|,||  |,||  qt*|dkrt-dt j./|| t#dd |D }|W  d    S 1 sw   Y  d S )NT)enableZcustom_white_listZcustom_black_listlevelr   r   zInone of output has stop_gradient=False, this recompute() is not necessaryc                 s   s&    | ]}t |tjjr| V  qd S )N)
isinstancer   eagerTensorZ
_grad_ivar).0inpr   r   r   	<genexpr>   s    
z0_HPRecomputeFunction.backward.<locals>.<genexpr>)0r   baseZdygraphguardr   r.   r/   r0   Zsaved_tensordistributedZParallelEnv	device_idr9   r-   r;   r"   r   r=   Zreshape_r,   r   r   r2   	_has_gradr   r*   r+   r3   ampZ	auto_castr7   r8   r4   r6   r
   r@   r(   r)   rL   r   rM   rN   lenranger<   RuntimeErrorZautogradbackward)rA   rC   r.   r/   r0   ZtensorsrU   rF   idxrH   rD   Zdetached_inputsrE   Zforward_outputs_with_gradZbackward_inputsZgradsr   r   r   r[      s   



&z_HPRecomputeFunction.backwardN)__name__
__module____qualname____doc__staticmethodrI   r[   r   r   r   r   r#   E   s    
_r#   c           	      O   s   |  dd}|dusJ d|  dd}|  dd}t jr#t| g }tj|||||g|R i | t|dkr?|d S |D ]}t	|rPt
|sPd	|_qAt|S )
ah  
    recompute intermediate activations to save the memory in hybrid parallel scene.
    # NODTE(shenliang03)The current hybrid parallel recompute has limitations.
    # It cannot handle the following situations:
    # 1. The calculation output of recompute, there are tensors that do not require gradients.
    # 2. The forward output tensor has no gradient. This problem can be solved temporarily by detach().
    # 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor

    Parameters:
        ctx(dict): include 'mp_group', 'offload', and 'partition' keys. the key 'mp_group' (Group), represents the avtivations are splitted
                   in which group. the key 'offload' (bool, optional, default=False), represents whether to offload to cpu. the key 'partition' (bool, optional, default=False),
                   represents whether to split activations in the mp_group.
        function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model
              whose intermediate activations will be released to save memory in forward stage and will be recomputed
              in backward stage for gradient calculation.
        *args(Tensor): inputs(tuple) to the function.

        **kwargs(Dict): inputs(dict) to the function.

    Returns:
        Output of function on args and kwargs.

    r   Nz8ctx must contains mp_group and mp_group can not be None.r,   Fr-   r   r   T)getr   r2   rV   r	   r#   applyrX   r   r:   r   Zis_float_tensorr;   r@   )	rA   functionrC   r)   r   r,   r-   rB   outputr   r   r   recompute_hybrid   s.   


rf   )r   r   Zpaddle.autogradr   Zpaddle.frameworkr   Z$meta_parallel.parallel_layers.randomr   Zmeta_parallel.pp_utilsr   Z	recomputer	   r
   r   __all__r   r"   r#   rf   r   r   r   r   <module>   s    6