from paddle import _C_ops
from paddle.framework import LayerHelper, in_dynamic_mode


def masked_multihead_attention(
    x,
    cache_kv=None,
    bias=None,
    src_mask=None,
    cum_offsets=None,
    sequence_lengths=None,
    rotary_tensor=None,
    beam_cache_offset=None,
    qkv_out_scale=None,
    out_shift=None,
    out_smooth=None,
    seq_len=1,
    rotary_emb_dims=0,
    use_neox_rotary_style=False,
    compute_dtype="default",
    out_scale=-1,
    quant_round_type=1,
    quant_max_bound=127.0,
    quant_min_bound=-127.0,
):
    r"""
    Masked Multi-head attention for text summarization.
    This is a fused operator that computes masked multi-head attention in the transformer model architecture.
    This operator only supports running on GPU.

    Args:
        x (Tensor): The input tensor, a 2-D tensor of shape [batch_size, 3 * num_head * head_dim] holding the fused query, key and value projections.
        cache_kv (Tensor): The cache structure tensor for the generation model. Its shape is [2, batch_size, num_head, max_seq_len, head_dim].
        bias (Tensor, optional): The bias tensor. Its shape is [3, num_head, head_dim].
        src_mask (Tensor, optional): The src_mask tensor. Its shape is [batch_size, 1, 1, sequence_length].
        cum_offsets (Tensor, optional): The cumulative offsets tensor, used to index the input.
        sequence_lengths (Tensor, optional): The sequence_lengths tensor, used to index the input. Its shape is [batch_size, 1].
        rotary_tensor (Tensor, optional): The rotary_tensor tensor. The dtype must be float. Its shape is [batch_size, 1, 1, sequence_length, head_dim].
        beam_cache_offset (Tensor, optional): The beam_cache_offset tensor. Its shape is [batch_size, beam_size, max_seq_len + max_dec_len].
        qkv_out_scale (Tensor, optional): The qkv_out_scale tensor, used in quant. Its shape is [3, num_head, head_dim].
        out_shift (Tensor, optional): The out_shift tensor, used in quant.
        out_smooth (Tensor, optional): The out_smooth tensor, used in quant.
        seq_len (int, optional): The sequence length, used to determine the length of the input. Default 1.
        rotary_emb_dims (int, optional): The number of rotary embedding dimensions. Default 0.
        use_neox_rotary_style (bool, optional): A flag indicating whether neox_rotary_style is needed or not. Default False.
        compute_dtype (str, optional): The compute dtype, used to indicate the actual dtype of the input when `x` carries int32 quantized data; one of 'bf16', 'fp16' or 'fp32'. Default 'default'.
        out_scale (float, optional): The output scale, used in quant. Default -1.
        quant_round_type (int, optional): The rounding type, used in quant. Default 1.
        quant_max_bound (float, optional): The upper clipping bound, used in quant. Default 127.0.
        quant_min_bound (float, optional): The lower clipping bound, used in quant. Default -127.0. See the note below for how these attributes are assumed to combine.
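
    Note:
        The following is a minimal sketch of how the quant attributes are
        conventionally combined (an assumption based on Paddle's usual
        fake-quant behavior, not taken from this operator's kernel):

        .. code-block:: python

            import numpy as np

            def fake_quant(x, out_scale, quant_max_bound, quant_min_bound):
                # quant_round_type == 1 is assumed to mean "round half away
                # from zero"; type 0 would round half to even instead.
                scaled = x * out_scale
                rounded = np.where(scaled >= 0,
                                   np.floor(scaled + 0.5),
                                   np.ceil(scaled - 0.5))
                return np.clip(rounded, quant_min_bound, quant_max_bound)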

    Returns:
        tuple: If `beam_cache_offset` is provided, returns the tuple
        (output, cache_kv_out, beam_cache_offset_out), where `output` is the result of the
        masked multi-head attention and `cache_kv_out` is updated in place, sharing storage
        with the input `cache_kv`. Otherwise, returns the tuple (output, cache_kv_out).

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> import paddle.incubate.nn.functional as F
            >>> paddle.device.set_device('gpu')

            >>> # input: [batch_size, 3 * num_head * dim_head]
            >>> x = paddle.rand(shape=(2, 3 * 32 * 128), dtype="float32")

            >>> # src_mask: [batch_size, 1, 1, sequence_length]
            >>> src_mask = paddle.rand(shape=(2, 1, 1, 10), dtype="float32")

            >>> # cache_kv: [2, batch_size, num_head, max_seq_len, dim_head]
            >>> cache_kv = paddle.rand(shape=(2, 2, 32, 64, 128), dtype="float32")

            >>> out, cache_kv_out = F.masked_multihead_attention(
            ...     x, src_mask=src_mask, cache_kv=cache_kv)

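            The variant below is a sketch (not part of the original example) showing
            the three-tuple return when `beam_cache_offset` is supplied; the int32
            dtype and the zero-initialized offsets are assumptions:

            >>> # beam_cache_offset: [batch_size, beam_size, max_seq_len + max_dec_len]
            >>> beam_cache_offset = paddle.zeros(shape=(2, 1, 74), dtype="int32")
            >>> out, cache_kv_out, beam_offset_out = F.masked_multihead_attention(
            ...     x, src_mask=src_mask, cache_kv=cache_kv,
            ...     beam_cache_offset=beam_cache_offset)
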
    masked_multihead_attentionZint32Zbf16Zuint16Zfp16Zfloat16Zfp32Zfloat32)dtypexcache_kvNbiassrc_maskcum_offsetssequence_lengthsrotary_tensorFbeam_cache_offsetTintqkv_out_scale	out_shift
out_smooth)outZcache_kv_outZbeam_cache_offset_out)seq_lenrotary_emb_dimsuse_neox_rotary_stylecompute_dtype	out_scalequant_round_typequant_max_boundquant_min_bound)typeinputsoutputsattrs)r
   )r   r   Zmasked_multihead_attention_r   localsr   Z"create_variable_for_type_inferenceZ	append_op)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    helperr   r   r"   Zbeam_cache_offset_flagr#    r'   y/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/incubate/nn/functional/masked_multihead_attention.pyr
      s   J
r
   )NNNNNNNNNNr   r   Fr   r   r   r   r	   )Zpaddler   Zpaddle.frameworkr   r   r
   r'   r'   r'   r(   <module>   s*   