o
    #jK                     @   s   d dl Z d dlm  mZ d dl mZmZ d dlmZ d dl	m
Z
 dadadae
dddZd	d
 Z				dddZdd Zdd Z			ddddddddZ							dddZ					dddZdS )    N)_C_opsin_dynamic_mode)LayerHelper)signature_safe_contextmanagerFTc                 c   sB    t }t }t}| a |a|azdV  W |a |a|adS |a |a|aw )z
    With the sdp_kernel context manager, different algorithm implementations can
    be selected for scaled_dot_product_attention.
    N)g_enable_mathg_enable_mem_efficientg_enable_flash)Zenable_mathZenable_flashZenable_mem_efficientZoriginal_enable_mathZoriginal_enable_flashZoriginal_enable_mem_efficient r	   e/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/nn/functional/flash_attention.py
sdp_kernel   s   r   c                 C   s*   t | d}d|_t j|dd}d|_|S )Ng     T   )Zdiagonal)paddleZ	full_likeZstop_gradientZtriu)xmaskr	   r	   r
   get_triangle_upper_mask1   s
   r           c                 C   s   | j d }t| g d} t|g d}t|g d}tj| |d  |dd}|s1t|}	nt }
d|
v rGt|}|| }t|}	ntj	|}	|dkrZtj
|	||dd	}	t|	|}t|g d}||rn|	fS d
fS )z
    This is a basic implementation of scaled dot product attention composed of
    combinations of fundamental components.
    )r      r      g      T)r   yZtranspose_yZxpur   Zupscale_in_train)trainingmodeN)shaper   Z	transposematmulFsoftmax
get_devicer   ZincubateZ softmax_mask_fuse_upper_triangledropout)querykeyvaluedropout_ratecausalreturn_softmaxr   head_dimproductweightsplacer   outr	   r	   r
   _math_attention9   s,   
r)   c                 C   s   | dkrdS dS )N   
flash_attnmem_efficientr	   )r$   r	   r	   r
   _select_sdp_cudad   s   r-   c                 C   s   t  }tdu rd|vrdS t| S tdu r"tdu r"tdu r"tdtdu r6tdu r0tdu r0dS d|vr6dS tdu rBtdu rBt| S tdu rHdS dS )	z
    There are currently three different implementation options available for
    scaled dot product attention, and the chosen approach depends on whether it
    is determined by the sdp_kernel configuration or specified through input values.
    NZgpumathFz@No available backend for scaled_dot_product_attention was found.Tr+   r,   )r   r   r   r-   r   r   AssertionError)r$   r'   r	   r	   r
   _select_sdpk   s*   r0    )fixed_seed_offsetrng_namer   namec                C   s6  | j d }
t|
}|dkrwt r,t| |||d|||| |
\}}}}||r)|fS dfS tdi t }|jdd}||}||}|t	j
}|t	j}| |||d}||||d}|jd|||||| |dd	 ||rt|fS dfS |d
krddlm} || ||d|d|d}|dfS t| ||||||dS )a4  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API is only support inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float61 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float61 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float61 or bfloat16.
        dropout(float): The dropout ratio.
        causal(bool): Whether enable causal mode.
        return_softmax(bool): Whether to return softmax.
        fixed_seed_offset(Tensor, optional): With fixed seed, offset for dropout mask.
        training(bool): Whether it is in the training phase.
        rng_name(str): The name to select Generator.
        name(str, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        out(Tensor): The attention tensor.
                    4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
        softmax(Tensor): The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> paddle.seed(2023)
            >>> q = paddle.rand((1, 128, 2, 16))

            >>> output = paddle.nn.functional.flash_attention.flash_attention(q, q, q, 0.9, False, False)
            >>> print(output)
            (Tensor(shape=[1, 128, 2, 16], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[[[0.34992966, 0.34456208, 0.45826620, ..., 0.39883569,
                0.42132431, 0.39157745],
               [0.76687670, 0.65837246, 0.69117945, ..., 0.82817286,
                0.76690865, 0.71485823]],
              ...,
              [[0.71662450, 0.57275224, 0.57053083, ..., 0.48108247,
                0.53336465, 0.54540104],
               [0.59137970, 0.51350880, 0.50449550, ..., 0.38860250,
                0.40526697, 0.60541755]]]]), None)

    r   r+   NqZinput_param_name)r5   kvr2   r(   r   softmax_lseseed_offsetr   r"   r#   is_testr3   typeinputsoutputsattrsr,   r   )memory_efficient_attention)Z	attn_biaspscaler   )r!   r"   r#   r   r+   )r   r0   r   r   r+   r   localsinput_dtype"create_variable_for_type_inferencer   float32int64	append_opZ-paddle.incubate.nn.memory_efficient_attentionrC   r)   )r   r   r    r   r"   r#   r2   r3   r   r4   r$   Zsdp_func_nameresult_attentionresult_softmax_helperdtyper(   r   r:   r;   r@   rA   rC   outputr	   r	   r
   flash_attention   s   
N

	rS   c                 C   s   t  r"t| |||||d|||||	|
| |\}}||
r|fS dfS td	i t }|jdd}||}||}|tj}|tj	}| |||||d}||||d}|j
d|||||||	|
| |dd ||
ro|fS dfS )
ag  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API is only support inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        3-D tensor with shape:
                        [total_seq_len, num_heads, head_dim].
                        The dtype can be float61 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        3-D tensor with shape:
                        [total_seq_len, num_heads, head_dim].
                        The dtype can be float61 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        3-D tensor with shape:
                        [total_seq_len, num_heads, head_dim].
                        The dtype can be float61 or bfloat16.
        cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index query.
        cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
                        used to index key and value.
        max_seqlen_q(int): Maximum sequence length of query in the batch.
        max_seqlen_k(int): Maximum sequence length of key/value in the batch.
        scale(float): The scaling of QK^T before applying softmax.
        dropout(float): The dropout ratio.
        causal(bool): Whether enable causal mode.
        return_softmax(bool): Whether to return softmax.
        fixed_seed_offset(Tensor, optional): With fixed seed, offset for dropout mask.
        rng_name(str): The name to select Generator.
        training(bool): Whether it is in the training phase.
        name(str, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        out(Tensor): The attention tensor.
                    4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.
        softmax(Tensor): The softmax tensor. None if return_softmax is False.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> paddle.seed(2023)
            >>> q = paddle.rand((2, 128, 8, 16), dtype='float16')
            >>> cu = paddle.arange(0, 384, 128, dtype='int32')
            >>> qq = paddle.reshape(q, [256, 8, 16])
            >>> output = paddle.nn.functional.flash_attention.flash_attn_unpadded(qq, qq, qq, cu, cu, 128, 128, 0.25, 0.0, False, False)

    Nflash_attn_unpaddedr5   r6   )r5   r7   r8   cu_seqlens_qcu_seqlens_kr2   r9   )max_seqlen_qmax_seqlen_krE   r   r"   r#   r=   r3   r>   )rT   )r   r   rT   r   rG   rH   rI   r   rJ   rK   rL   )r   r   r    rU   rV   rW   rX   rE   r   r"   r#   r2   r3   r   r4   rM   rN   rP   rQ   r(   r   r:   r;   r@   rA   r	   r	   r
   rT   *  sn   N

	rT   c                 C   s   |du rt | ||||\}}	|S t r.d}
d}d}t| |||
||||| |
\}}	}	}	|S tdi t }|jdd}||}||}|tj	}|tj
}| |||d}||||d	}|jd||||d| dd
d |S )a  
    The equation is:

    .. math::

        result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V

    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
    The dimensions of the three parameters are the same.
    ``d`` represents the size of the last dimension of the three parameters.

    Warning:
        This API only supports inputs with dtype float16 and bfloat16.

    Args:
        query(Tensor): The query tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float61 or bfloat16.
        key(Tensor): The key tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float61 or bfloat16.
        value(Tensor): The value tensor in the Attention module.
                        4-D tensor with shape:
                        [batch_size, seq_len, num_heads, head_dim].
                        The dtype can be float61 or bfloat16.
        attn_mask(Tensor,optional): A float mask of the same type as query,
                        key, value that is added to the attention score.
        dropout_p(float): The dropout ratio.
        is_causal(bool): Whether enable causal mode.
        training(bool): Whether it is in the training phase.
        name(str, optional): The default value is None. Normally there is no need for user
                        to set this property. For more information, please refer to
                        :ref:`api_guide_Name`.

    Returns:
        out(Tensor): The attention tensor.
                    4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim].
                    The dtype can be float16 or bfloat16.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('bfloat need V100 compile')
            >>> import paddle
            >>> q = paddle.rand((1, 128, 2, 16), dtype=paddle.bfloat16)
            >>> output = paddle.nn.functional.scaled_dot_product_attention(q, q, q, None, 0.9, False)
            >>> print(output)
            >>> # doctest: -SKIP
    N)NFr1   r+   r5   r6   )r5   r7   r8   	attn_maskr9   r<   r>   rF   )rS   r   r   r+   r   rG   rH   rI   r   rJ   rK   rL   )r   r   r    rY   Z	dropout_pZ	is_causalr   r4   r(   rO   r2   r#   r3   rP   rQ   r   r:   r;   r@   rA   r	   r	   r
   scaled_dot_product_attention  sh   >

rZ   )FTT)r   FFT)r   FF)r   FFNr1   TN)Nr   FTN)r   Zpaddle.nn.functionalnnZ
functionalr   r   r   Zpaddle.base.layer_helperr   Zpaddle.base.wrapped_decoratorr   r   r   r   r   r   r)   r-   r0   rS   rT   rZ   r	   r	   r	   r
   <module>   sV   
+' &
 