o
     jI                     @   st  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	 ddl
mZmZ g dZddddZejZdad	d
 ZG dd dZe Zdd Z	d5ddZdd Zdd Zdd Zdd Zdd Zdd Zedd  Zd!d" Zd#d$ Zd%d& Z e	e	'					'd6d(d)Z!G d*d+ d+Z"d,d- Z#e						.	d7d/d0Z$	'					'd6d1d2Z%						.	d7d3d4Z&dS )8    N)core)_dygraph_tracerdygraph_only)signature_safe_contextmanager   )
black_list
white_list)ZFLAGS_cudnn_exhaustive_searchZFLAGS_conv_workspace_size_limitZ(FLAGS_cudnn_batchnorm_spatial_persistenti  c                   C      t S N)_g_amp_state_ r   r   U/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/amp/auto_cast.py	amp_state)   s   r   c                   @      e Zd Zdd Zdd ZdS )AMPGlobalStatec                 C   s   g | _ d| _d| _d| _d S )NFfloat32)model_parametersuse_master_grad$already_register_final_backward_hook	amp_dtype)selfr   r   r   __init__/   s   
zAMPGlobalState.__init__c                 C   s   || j |< d S r
   )__dict__)r   namevalr   r   r   __setattr__5   s   zAMPGlobalState.__setattr__N)__name__
__module____qualname__r   r   r   r   r   r   r   .   s    r   c                   C   r	   r
   )_amp_global_stater   r   r   r   amp_global_state<   s   r    O1float16c                 C   s   |dkrt  }t  }||fS tt | | }tt | | }| r3|r3| D ]
}||v r2tdq(| rH| D ]}||v rB|| || q7|r]|D ]}||v rW|| || qL||fS )zF
    Update black and white list according to users' custom list.
    O0z+Custom white list overlap custom black list)setcopyr   r   
ValueErrorremoveadd)custom_white_listcustom_black_listleveldtype_white_list_black_listZop_namer   r   r   _update_listB   s0   

r/   c                  C   s$   t  } | r| jtjjkrdS dS dS )zE
    Judge whether current code block is in `amp_guard` context.
    TF)r   
_amp_levelr   AmpLevelr!   tracerr   r   r   _in_amp_guarda   s   r4   c                  C   s   t  } | o| jtjjkS r
   )r   r0   r   r1   O2r2   r   r   r   _in_pure_fp16_guardo   s   r6   c                  C   s   t jj } | d dkS )z8
    Judge whether current gpu support float16 amp.
    r      )paddledevicecudaget_device_capability)propr   r   r   _is_gpu_float16_supportedt   s   r=   c                  C   sR   t jj } t j }|dur|dkrt|dd dk}nd}| d dko(|S )z9
    Judge whether current gpu support bfloat16 amp.
    NFalse.r      F   )r8   r9   r:   r;   versionintsplit)r<   cuda_versionZcuda_version_checkr   r   r   _is_gpu_bfloat16_supported|   s   
rF   c                 C   s~   d}| j s	d}|S t| tjjtjjtjjtjjtjjfr!d}|S | j	dks;|dkr=t| tjj
tjjtjjtjjfr=d}|S )NFTr"   )_cast_to_low_precison
isinstancer8   nnZ	BatchNormZBatchNorm1DZBatchNorm2DZBatchNorm3DZSyncBatchNormZ_dtypeZ	LayerNormZInstanceNorm1DZInstanceNorm2DZInstanceNorm3D)layerr,   need_keep_fp32r   r   r   rK      s8   

rK   c                 C   s  g }g }d}|d u rg }nGt |tjjr|g}n<t |tr(t|tjjr(|g}n,t |trP|D ]}t |tjjr>|| q/t|tjjrK|| q/t|nt|t	t
|D ]}|| jddD ]}d|_qdqZt|}t	t
| D ]}| | jddD ]
}t ||rd|_qqud S )Nz^excluded_layers must be either a nn.Layer instance/type or a list of nn.Layer instances/types.TZinclude_selfF)rH   r8   rI   Layertype
issubclasslistappend	TypeErrorrangelen	sublayersrG   tuple)modelsexcluded_layersZexcluded_layers_instancesZexcluded_layers_typeserror_messageitemidxrJ   r   r   r   set_excluded_layers   sD   


r\   c                 C   s   t | | tt| D ]3}| | jddD ](}t||rq|dkr5t|tjjj	tjjj
fr5|j|d q|j|ddd qq| S )NTrL   r"   )r,   F)r,   Zinclude_sublayersZfloating_only)r\   rS   rT   rU   rK   rH   r8   ZincubaterI   ZFusedFeedForwardZFusedMultiHeadAttentionZ_amp_decorateZ_to_impl)rW   r,   rX   r[   rJ   r   r   r   amp_initialize   s$   


r]   c                 C   sB   | D ]}t |tjjstdt|t |tjrtdqd S )NzRCurrent train mode is pure fp16, models should be paddle.nn.Layer, but receive {}.zFor distributed AMP training, you should first use paddle.amp.decorate() to decotate origin model, and then call paddle.DataParallel get distributed model.)rH   r8   rI   rM   RuntimeErrorformatrN   ZDataParallel)rW   modelr   r   r   check_models   s   ra   c                 C   s$   ddl m}m} t| tjj||fS )Nr   DygraphShardingOptimizerDygraphShardingOptimizerV2)Upaddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizerrc   rd   rH   r8   	optimizerZ	Optimizer)rf   rc   rd   r   r   r   _is_valid_optimizer   s   rg   c                 C   s(   | D ]}t |stdt|qd S )Nz}Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or DygraphShardingOptimizer, but receive {}.)rg   r^   r_   rN   )
optimizersrf   r   r   r   check_optimizers  s   ri   Tc              
   c   s   t  }t}|a| }|dvrtd| }| r"|dvr"tdt }|s+td| rF|j sF|j sF|j	 sFt
d|j  d} | r|j rX|dkrXt
d	 d} |j	 rh|dkrht
d
 d} |j r|dkrt stjj }	t
dtjj |	d |	d f  d} n'|dkrt stjj }	tj }
t
dtjj |	d |	d |
f  d} |}|t _|dkrtj}n|dkrtj}n|dkrtj}n|dkrtj}t||||\}}| stj}d}t jrt jsdd }tj | dt _|r*|j!}||_!|" \}}|#|| |j$}||_$|tjkr*|j%}||_%z$dV  W |rK|a||_!|#|| ||_$|tjkrM||_%dS dS dS |ri|a||_!|#|| ||_$|tjkri||_%w )a	  
    Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
    If enabled, the input data type (float32 or float16) of each operator is decided
    by autocast algorithm for better performance.

    Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
    imperative mode. It is used together with `decorator` to achieve Pure fp16 in imperative mode.

    Args:
        enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
        custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support
             fp16 calculation and are considered numerically-safe and performance-critical. These ops
             will be converted to fp16.
        custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
             calculation and are considered numerically-dangerous and whose effects may also be
             observed in downstream ops. These ops will not be converted to fp16.
        level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list;
             O2 represent Pure fp16, all operators parameters and input data will be casted to fp16, except operators in black_list, don't support fp16 kernel and batchnorm. Default is O1(amp)
        dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'.


    Examples:

        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle

            >>> data = paddle.uniform([10, 3, 32, 32], paddle.float32, -1, 1)
            >>> conv2d = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
            >>> conv2d = paddle.amp.amp_decorate(models=conv2d, level='O2')
            >>> with paddle.amp.amp_guard():
            ...     conv = conv2d(data)
            ...     print(conv.dtype)
            >>> # doctest: +SKIP("This has diff in xdoctest env")
            paddle.float16
            >>> # doctest: -SKIP
            ...
            >>> with paddle.amp.amp_guard(enable=False):
            ...     conv = conv2d(data)
            ...     print(conv.dtype)
            >>> # doctest: +SKIP("This has diff in xdoctest env")
            paddle.float32
            >>> # doctest: -SKIP
    )r#   ODr!   r5   z!level should be O0, OD, O1 or O2.r"   bfloat16z7If enable amp, dtype should be 'float16' or 'bfloat16'.;current_tracer is None, maybe it is not in imperative mode.zramp_guard can only be enabled on CUDAPlace, XPUPlace, and CustomPlace, current place is %s, so it makes no effect.Frl   z"XPUPlace only support float16 amp.z%CustomPlace only support float16 amp.r"   zFor float16, amp only support NVIDIA GPU with Compute Capability 7.0 or higher, current GPU is: %s, with Compute Capability: %d.%d.r   r   zFor bfloat16, amp only support NVIDIA GPU with Compute Capability 8.0 or higher and CUDA Version 11.0 or higher, current GPU is: %s, with Compute Capability: %d.%d, current CUDA Version is: %s.rj   r!   r5   r#   r   c                   S   s   t jt j dt _d S )NF)r   eagerZset_master_gradsr    r   r   r   r   r   r   master_grad_hook  s   z#amp_guard.<locals>.master_grad_hookTN)&localsr   upperr&   lowerr   _expected_placeis_gpu_placeis_xpu_placeis_custom_placewarningswarnr=   r8   r9   r:   r;   Zget_device_namerF   rB   r    r   	AMP_LEVELrj   r!   r5   r#   r/   r   r   r   rn   Z_add_backward_final_hookr0   Z_get_amp_op_listZ_set_amp_op_listZ
_amp_dtypeZ_use_promote)enabler)   r*   r+   r,   use_promoter   Zoriginal_stater3   r<   rE   r   Z	amp_levelr-   r.   ro   Zoriginal_amp_levelZoriginal_white_listZoriginal_black_listZoriginal_amp_dtypeZoriginal_use_promoter   r   r   	amp_guard  s   7




	
r|   c                   @   r   )StateDictHookc                 C   s
   || _ d S r
   )_save_dtype)r   
save_dtyper   r   r   r     s   
zStateDictHook.__init__c                 C   s>   |D ]}|| }t |rt || j}|j|_|||< qd S r
   )r8   Zis_floating_pointcastr~   r   )r   Z
state_dictkeyparamZparam_appliedr   r   r   __call__  s   
zStateDictHook.__call__N)r   r   r   r   r   r   r   r   r   r}     s    r}   c                 C   s@   ddl m}m} t| ||fr| jn| } t| dr|| _d S d S )Nr   rb   _multi_precision)re   rc   rd   rH   Z
_inner_opthasattrr   )rf   Zmulti_precisionrc   rd   r   r   r   _set_multi_precision  s   

r   Fc                 C   s  |dvrt d|dvrt d|dkr|du r| S | |fS t }|s't d|j s@|j s@|j s@|du r<| S | |fS |j rS|dkrS|du rO| S | |fS |j rf|dkrf|du rb| S | |fS |j r|d	krrt ry|dkrt s|du r| S | |fS d
}	t| t	j
jrd
}	| g} t|  nt| trt|  d}	ntdt| ||d |durd
}
t|rd
}
|g}t| nt|trt| d}
ntd|d
u}|D ]}t|| q|rdt _tt| D ]}t j| |   q|dur&|dvr	t d| tt| D ]}| | jddD ]
}|t| qq|	r=|dur;|
r5| |fS | |d fS | S |durS|
rK| d |fS | d |d fS | d S )ay  
    Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing.
    When level is O2(pure fp16), the decorate will cast all parameters of models to FP16, except BatchNorm, InstanceNorm and LayerNorm.

    Commonly, it is used together with `amp_guard` to achieve Pure fp16 in imperative mode.

    Args:
        models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None.
        optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None.
        level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing;
             O2 represent Pure fp16/bf16, the decorator will cast all parameters of models to FP16/BF16, except BatchNorm, InstanceNorm and LayerNorm. Default is O1(amp)
        dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'.
        master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None.
        save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, bfloat16, float32, float64 or None.
             The save_dtype will not change model parameters dtype, it just change the state_dict dtype. When save_dtype is None, the save dtype is same as model dtype. Default is None.

    Examples:

        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> # Demo1: single model and optimizer:
            >>> import paddle
            >>> paddle.device.set_device('gpu')

            >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
            >>> optimizer = paddle.optimizer.SGD(parameters=model.parameters())

            >>> model, optimizer = paddle.amp.amp_decorate(models=model, optimizers=optimizer, level='O2')

            >>> data = paddle.rand([10, 3, 32, 32])

            >>> with paddle.amp.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
            ...     output = model(data)
            ...     print(output.dtype)
            paddle.float16

            >>> # Demo2: multi models and optimizers:
            >>> model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
            >>> optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())

            >>> models, optimizers = paddle.amp.amp_decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')

            >>> data = paddle.rand([10, 3, 32, 32])

            >>> with paddle.amp.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
            ...     output = models[0](data)
            ...     output2 = models[1](data)
            ...     print(output.dtype)
            ...     print(output2.dtype)
            paddle.float16
            paddle.float16

            >>> # Demo3: optimizers is None:
            >>> model3 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
            >>> optimizer3 = paddle.optimizer.Adam(parameters=model2.parameters())

            >>> model = paddle.amp.amp_decorate(models=model3, level='O2')

            >>> data = paddle.rand([10, 3, 32, 32])

            >>> with paddle.amp.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
            ...     output = model(data)
            ...     print(output.dtype)
            paddle.float16
    )r!   r5   zYlevel should be O1 or O2, O1 represent AMP train mode, O2 represent Pure fp16 train mode.rk   z'dtype only support float16 or bfloat16.r!   Nrm   rl   r"   FTz9models must be either a single model or a list of models.)rW   r,   rX   zEoptimizers must be either a single optimizer or a list of optimizers.)r"   rl   r   Zfloat64zSsave_dtype can only be float16 float32 or float64, but your input save_dtype is %s.rL   r   )r&   r   rs   rt   ru   rv   r=   rF   rH   r8   rI   rM   ra   rP   rR   r]   rg   ri   r   r    r   rS   rT   r   extend
parametersrU   Zregister_state_dict_hookr}   )rW   rh   r+   r,   master_weightr   master_gradrX   r3   Zmodels_is_listZoptimizers_is_listZuse_multi_precisionoptr[   rJ   r   r   r   amp_decorate  s   M









r   c                 C   s   t | |||||S )a  
    Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
    If enabled, the input data type (float32, float16 or bfloat16) of each operator is decided
    by autocast algorithm for better performance.

    Commonly, it is used together with `GradScaler` and `decorator` to achieve Auto-Mixed-Precision in
    imperative mode.

    Args:
        enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
        custom_white_list(set|list|tuple, optional): A default white list is already set. Usually there is no need to set custom white list.
             The set of ops should be considered numerically-safe and performance-critical. These ops will be converted to float16/bfloat16.
        custom_black_list(set|list|tuple, optional): A default black list is already set. You can set a custom black list according to the model.
             The set of ops are considered numerically-dangerous and whose effects may also be observed in downstream ops. These ops will not be
             converted to float16/bfloat16.
        level(str, optional): Auto mixed precision level. Accepted values are "O1", "O2" and "OD": At the O1 level, operators in the white list
             will use float16/bfloat16 inputs for calculations, and operators in the black list will use float32 inputs for calculations. At the O2
             level, model's parameters will be casted to float16/bfloat16 by using `decorator`, and operators that have all float16/bfloat16 inputs
             will be converted to float16/bfloat16, and that have any float32 input will be converted to float32. For the OD level, operators in
             default white list will compute in float16/bfloat16, and the others will compute in float32. Default is O1.
        dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'.
        use_promote(bool, optional): Whether to promotes to fp32 when op has any float32 inputs. It is only supported when amp level is O2. Default is True.

    Examples:

        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle

            >>> conv2d = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
            >>> data = paddle.rand([10, 3, 32, 32])

            >>> with paddle.amp.auto_cast():
            ...     conv = conv2d(data)
            ...     print(conv.dtype)
            >>> # doctest: +SKIP("This has diff in xdoctest env")
            paddle.float16
            >>> # doctest: -SKIP

            >>> with paddle.amp.auto_cast(enable=False):
            ...     conv = conv2d(data)
            ...     print(conv.dtype)
            >>> # doctest: +SKIP("This has diff in xdoctest env")
            paddle.float32
            >>> # doctest: -SKIP

            >>> with paddle.amp.auto_cast(custom_black_list={'conv2d'}):
            ...     conv = conv2d(data)
            ...     print(conv.dtype)
            >>> # doctest: +SKIP("This has diff in xdoctest env")
            paddle.float32
            >>> # doctest: -SKIP

            >>> a = paddle.rand([2, 3])
            >>> b = paddle.rand([2, 3])
            >>> with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}):
            ...     c = a + b
            ...     print(c.dtype)
            >>> # doctest: +SKIP("This has diff in xdoctest env")
            paddle.float16
            >>> # doctest: -SKIP

            >>> with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'):
            ...     d = a + b
            ...     print(d.dtype)
            >>> # doctest: +SKIP("This has diff in xdoctest env")
            paddle.float16
            >>> # doctest: -SKIP

    )r|   )rz   r)   r*   r+   r,   r{   r   r   r   	auto_cast  s   Or   c              	   C   s   t | |||||||S )aU  
    Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing.
    When level is O2(pure float16/bfloat16), the decorate will cast all parameters of models to float16/bfloat16, except BatchNorm, InstanceNorm and LayerNorm.

    Commonly, it is used together with `auto_cast` to achieve Pure float16/bfloat16 in imperative mode.

    Args:
        models(Layer|list of Layer): The defined models by user, models must be either a single model or a list of models. Default is None.
        optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None.
        level(str, optional): Auto mixed precision level. Accepted values are 'O1' and 'O2': O1 represent mixed precision, the decorator will do nothing;
             O2 represent Pure float16/bfloat16, the decorator will cast all parameters of models to float16/bfloat16, except BatchNorm, InstanceNorm and LayerNorm. Default is O1(amp)
        dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'.
        master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None.
        save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, bfloat16, float32, float64 or None.
             The save_dtype will not change model parameters dtype, it just change the state_dict dtype. When save_dtype is None, the save dtype is same as model dtype. Default is None.
        master_grad(bool, optional): For level='O2', whether to use float32 weight gradients for calculations such as gradient clipping, weight decay, and weight updates. If master_grad is enabled, the weight
             gradients will be float32 dtype after the backpropagation. Default is False, there is only float16 weight gradients.
        excluded_layers(Layer|list of Layer, optional): Specify the layers not to be decorated. The weights of these layers will always keep float32 when level is O2. `excluded_layers` can be specified as
             an Layer instance/type or a list of Layer instances/types. Default is None, the weights of the whole model will be casted to float16 or bfloat16.

    Examples:

        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> # Demo1: single model and optimizer:
            >>> import paddle
            >>> paddle.device.set_device('gpu')

            >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
            >>> optimizer = paddle.optimizer.SGD(parameters=model.parameters())

            >>> model, optimizer = paddle.amp.decorate(models=model, optimizers=optimizer, level='O2')

            >>> data = paddle.rand([10, 3, 32, 32])

            >>> with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
            ...     output = model(data)
            ...     print(output.dtype)
            paddle.float16

            >>> # Demo2: multi models and optimizers:
            >>> model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
            >>> optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())

            >>> models, optimizers = paddle.amp.decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')

            >>> data = paddle.rand([10, 3, 32, 32])

            >>> with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
            ...     output = models[0](data)
            ...     output2 = models[1](data)
            ...     print(output.dtype)
            ...     print(output2.dtype)
            paddle.float16
            paddle.float16

            >>> # Demo3: optimizers is None:
            >>> model3 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
            >>> optimizer3 = paddle.optimizer.Adam(parameters=model3.parameters())

            >>> model = paddle.amp.decorate(models=model3, level='O2')

            >>> data = paddle.rand([10, 3, 32, 32])

            >>> with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
            ...     output = model(data)
            ...     print(output.dtype)
            paddle.float16

    )r   )rW   rh   r+   r,   r   r   r   rX   r   r   r   decorate  s   Qr   )r!   r"   )TNNr!   r"   T)Nr!   r"   NNFN)'r%   rw   r8   Zpaddle.baser   Zpaddle.base.frameworkr   r   Zpaddle.base.wrapped_decoratorr   Z	amp_listsr   r   ZAMP_RELATED_FLAGSZAMP_RELATED_FLAGS_SETTINGr1   ry   r   r   r   r   r    r/   r4   r6   r=   rF   rK   r\   r]   ra   rg   ri   r|   r}   r   r   r   r   r   r   r   r   <module>   s   
$#

 H J
V