import numpy as np

import paddle
from paddle.base import core
from paddle.base.core import VarDesc
from paddle.base.dygraph import no_grad
from paddle.base.framework import convert_np_dtype_to_dtype_
from paddle.framework import in_dynamic_mode
from paddle.incubate.nn import functional as incubate_f
from paddle.nn import Layer
from paddle.nn.initializer import Constant
from paddle.nn.layer.transformer import (
    _convert_attention_mask,
    _convert_param_attr_to_list,
)


def _set_var_distributed(var):
    if var is None:
        return

    var.is_distributed = True

    if not in_dynamic_mode():
        # NOTE: use current_block and find_var_recursive to support while_loop
        startup_block = paddle.static.default_startup_program().current_block()
        main_block = paddle.static.default_main_program().current_block()
        startup_block._find_var_recursive(var.name).is_distributed = True
        main_block._find_var_recursive(var.name).is_distributed = True


def _to_dtype(t, dtype):
    # This function is a prune of Layer._transform to fix fused ops under
    # amp.decorate(O2).
    if not paddle.is_floating_point(t):
        return t

    if type(dtype) is not VarDesc.VarType:
        dtype = convert_np_dtype_to_dtype_(dtype)

    if t.place.is_gpu_place():
        size_dtype = core.size_of_dtype(dtype)
        # The GPU minimum memory allocation unit is 256 bytes; estimate the
        # space `t` will occupy, with a 1.2 coefficient to avoid the OOM that
        # may occur when free memory is just barely enough.
        waiting_alloc_memory = (
            ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
        )
        gpu_memory_available = core.gpu_memory_available()
        if gpu_memory_available < waiting_alloc_memory:
            # Copy the tensor to CPU and release its GPU memory.
            t_used = t._copy_to(paddle.CPUPlace(), False)
            t._clear()
        else:
            t_used = t
    else:
        t_used = t

    # Cast to the target dtype.
    if dtype is not None and dtype != t_used.dtype:
        with paddle.base.framework._dygraph_place_guard(place=t_used.place):
            t_casted = t_used.cast(dtype=dtype)
    else:
        t_casted = t_used

    # Share the casted data back into the original tensor.
    dst_tensor = t.value().get_tensor()
    src_tensor = t_casted.value().get_tensor()
    dst_tensor._share_data_with(src_tensor)

    return t


class FusedBiasDropoutResidualLayerNorm(Layer):
    """
    Applies fused_bias_dropout_residual_layer_norm operation.

    Parameters:
        embed_dim (int): The expected feature size in the input and output.
        dropout_rate (float, optional): The dropout probability used on attention
            weights to drop some attention targets for the dropout after attention.
            0 for no dropout. Default 0.5.
        bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.
            Default: None, which means the default bias parameter property is used.
            If it is set to False, this layer will not have trainable bias parameter.
            See usage for details in :code:`ParamAttr`.
        epsilon (float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-05.

    Examples:

        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')
            >>> # input: [batch_size, seq_len, embed_dim]
            >>> x = paddle.rand((2, 4, 128))
            >>> # residual: [batch_size, seq_len, embed_dim]
            >>> residual = paddle.rand((2, 4, 128))
            >>> fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128)
            >>> output = fused_bias_dropout_residual_ln(x, residual)
            >>> print(output.shape)
            [2, 4, 128]
    """

    def __init__(
        self,
        embed_dim,
        dropout_rate=0.5,
        weight_attr=None,
        bias_attr=None,
        epsilon=1e-05,
        name=None,
    ):
        super().__init__()
        assert (
            embed_dim > 0
        ), f"Expected embed_dim to be greater than 0, but received {embed_dim}"
        self._dtype = self._helper.get_default_dtype()
        self._bias_attr = bias_attr
        self._weight_attr = weight_attr
        self.embed_dim = embed_dim
        self.linear_bias = self.create_parameter(
            shape=[embed_dim],
            attr=self._bias_attr,
            dtype=self._dtype,
            is_bias=True,
        )
        self.ln_scale = self.create_parameter(
            attr=self._weight_attr,
            shape=[embed_dim],
            default_initializer=Constant(value=1.0),
        )
        self.ln_bias = self.create_parameter(
            attr=self._bias_attr, shape=[embed_dim], is_bias=True
        )
        self.dropout_rate = dropout_rate
        self._epsilon = epsilon
        self.name = name

    def forward(self, x, residual):
        """
        Applies fused_bias_dropout_residual_layer_norm operation.

        Parameters:
            x (Tensor): The input tensor. It is a tensor with shape
                `[batch_size, seq_len, embed_dim]`. The data type should be
                float32 or float64.
            residual (Tensor): The residual tensor. It is a tensor with the
                same shape as `x`. The data type should be float32 or float64.

        Returns:
            Tensor|tuple: It is a tensor that has the same shape and data type \
                as `x`.
        """
        out = incubate_f.fused_bias_dropout_residual_layer_norm(
            x=x,
            residual=residual,
            bias=self.linear_bias,
            ln_scale=self.ln_scale,
            ln_bias=self.ln_bias,
            dropout_rate=self.dropout_rate,
            ln_epsilon=self._epsilon,
            training=self.training,
            mode='upscale_in_train',
            name=self.name,
        )
        return out

    def extra_repr(self):
        name_str = f', name={self.name}' if self.name else ''
        return 'embed_dim={}, dropout_rate={}, epsilon={}, dtype={}{}'.format(
            self.embed_dim,
            self.dropout_rate,
            self._epsilon,
            self._dtype,
            name_str,
        )


class FusedMultiHeadAttention(Layer):
    """
    Attention maps queries and a set of key-value pairs to outputs, and
    Multi-Head Attention performs multiple parallel attentions to jointly attend
    to information from different representation subspaces.
    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
    for more details.

    Parameters:
        embed_dim (int): The expected feature size in the input and output.
        num_heads (int): The number of heads in multi-head attention.
        dropout_rate (float, optional): The dropout probability used on attention
            weights to drop some attention targets for the dropout after attention.
            0 for no dropout. Default 0.5.
        attn_dropout_rate (float, optional): The dropout probability used on attention
            weights to drop some attention targets for the dropout in attention.
            0 for no dropout. Default 0.5.
        kdim (int, optional): The feature size in key. If None, assumed equal to
            `embed_dim`. Default None.
        vdim (int, optional): The feature size in value. If None, assumed equal to
            `embed_dim`. Default None.
        normalize_before (bool, optional): Indicate whether it is pre_layer_norm
            (True) or post_layer_norm architecture (False). Default False.
        need_weights (bool, optional): Indicate whether to return the attention
            weights. Now, only False is supported. Default False.
        qkv_weight_attr(ParamAttr, optional): To specify the weight parameter property
            for QKV projection computation. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        qkv_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
            for QKV projection computation. The `False` value means the corresponding layer
            would not have trainable bias parameter. Default: None, which means the
            default bias parameter property is used. See usage for details in :code:`ParamAttr`.
        linear_weight_attr(ParamAttr, optional): To specify the weight parameter property
            for linear projection computation. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        linear_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
            for linear projection computation. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        pre_ln_scale_attr(ParamAttr, optional): To specify the weight parameter property
            for pre_layer_norm computation. Otherwise, all layers both use it as
            `attr` to create parameters. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        pre_ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
            for pre_layer_norm computation. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ln_scale_attr(ParamAttr, optional): To specify the weight parameter property
            for post_layer_norm computation. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
            for post_layer_norm computation. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        epsilon (float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-05.
        nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel.
        ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel.
        transpose_qkv_wb (bool, optional): Support input qkv matmul weight shape as
            [hidden_size, 3 * hidden_size] and qkv matmul bias shape as [3 * hidden_size].
            Will transpose the weight to [3, num_head, head_dim, hidden_size] and transpose bias to
            [3, num_head, hidden_size] in the fused_attention_op. Only supported on GPU for now.
            The default value is False, which means no transpose is applied to qkv_w and qkv_b.
        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.

    Examples:

        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')
            >>> # input: [batch_size, sequence_length, embed_dim]
            >>> query = paddle.rand((2, 4, 128))
            >>> # self attention mask: [batch_size, num_heads, query_len, query_len]
            >>> attn_mask = paddle.rand((2, 2, 4, 4))
            >>> multi_head_attn = paddle.incubate.nn.FusedMultiHeadAttention(128, 2)
            >>> output = multi_head_attn(query, None, None, attn_mask=attn_mask)
            >>> print(output.shape)
            [2, 4, 128]
    r"   NFr#   r   c                    s6  t    |dksJ d| |dksJ d| || _| j | _|| _|| _|| _|| _	|| | _
|| _|| _|| _| j
| |ksIJ d|du sQJ d|| dksYJ || | _	|| _| jrx|d| j	 | j
 g}d| j	 | j
 g}nd| j	| j
|g}d| j	| j
g}| j||	| jdd| _| j||
| jd	d| _| j| j	| j
 |g|| jdd| _| j|g|| jd	d| _|d
kr|dksJ t| j t| j t| j |r| j||gtddd| _| j||gd	d| _d | _d | _nd | _d | _| j||gtddd| _| j||gd	d| _|| _|| _|| _d S )Nr   r$   2Expected nhead to be greater than 0, but received (embed_dim must be divisible by num_headsFz&Only support need_weight is False now.   r%   Tr   rS   r(   r)   r*   r,   )r-   r.   normalize_beforer/   r0   r1   r8   _ring_idr2   	num_headshead_dimkdimvdimneed_weightstranspose_qkv_wbr3   
qkv_weightqkv_biaslinear_weightr4   r   r	   pre_ln_scalepre_ln_biasr5   r6   r7   attn_dropout_rater   )r9   r2   rY   r7   rd   r[   r\   rW   r]   qkv_weight_attrqkv_bias_attrlinear_weight_attrlinear_bias_attrpre_ln_scale_attrpre_ln_bias_attrln_scale_attrln_bias_attrr<   nranksring_idr^   r   Zqkv_wight_shapeZqkv_bias_shaper=   r   r   r.     s   








z FusedMultiHeadAttention.__init__c                 C   s   |dur
t ||j}tjdi d|d| jd| jd| jd| jd| jd| j	d	| j
d
| jd| jd| jd|d|d| jd| jd| jd| jd| jd| jd| jd| j}|S )a  
        Applies multi-head attention to map queries and a set of key-value pairs
        to outputs.

        Parameters:
            query (Tensor): The queries for multi-head attention. It is a
                tensor with shape `[batch_size, query_length, embed_dim]`. The
                data type should be float32 or float64.
            key (Tensor, optional): The keys for multi-head attention. It is
                a tensor with shape `[batch_size, key_length, kdim]`. The
                data type should be float32 or float64. If None, use `query` as
                `key`. Default None.
            value (Tensor, optional): The values for multi-head attention. It
                is a tensor with shape `[batch_size, value_length, vdim]`.
                The data type should be float32 or float64. If None, use `query` as
                `value`. Default None.
            attn_mask (Tensor, optional): A tensor used in multi-head attention
                to prevents attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
                When the data type is bool, the unwanted positions have `False`
                values and the others have `True` values. When the data type is
                int, the unwanted positions have 0 values and the others have 1
                values. When the data type is float, the unwanted positions have
                `-INF` values and the others have 0 values. It can be None when
                nothing wanted or needed to be prevented attention to. Default None.
            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
                Now, only None is supported. Default None.

        Returns:
            Tensor|tuple: It is a tensor that has the same shape and data type \
                as `query`, representing attention output.
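
        Examples:
            A minimal sketch of building a boolean causal mask for `attn_mask`;
            the sizes below are illustrative, not required by the API:

            .. code-block:: python

                >>> # doctest: +SKIP('illustrative mask-construction sketch')
                >>> import paddle
                >>> batch_size, n_head, seq_len = 2, 2, 4
                >>> # lower-triangular matrix: position i may attend to j <= i
                >>> causal = paddle.tril(paddle.ones((seq_len, seq_len))).astype('bool')
                >>> attn_mask = causal.reshape((1, 1, seq_len, seq_len)).expand(
                ...     (batch_size, n_head, seq_len, seq_len))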
        """
        if attn_mask is not None:
            # Support bool or int mask
            attn_mask = _convert_attention_mask(attn_mask, query.dtype)

        out = incubate_f.fused_multi_head_attention(
            x=query,
            qkv_weight=self.qkv_weight,
            linear_weight=self.linear_weight,
            pre_layer_norm=self.normalize_before,
            pre_ln_scale=self.pre_ln_scale,
            pre_ln_bias=self.pre_ln_bias,
            ln_scale=self.ln_scale,
            ln_bias=self.ln_bias,
            pre_ln_epsilon=self._epsilon,
            qkv_bias=self.qkv_bias,
            linear_bias=self.linear_bias,
            cache_kv=cache,
            attn_mask=attn_mask,
            dropout_rate=self.dropout_rate,
            attn_dropout_rate=self.attn_dropout_rate,
            ln_epsilon=self._epsilon,
            training=self.training,
            ring_id=self._ring_id,
            num_heads=self.num_heads,
            transpose_qkv_wb=self.transpose_qkv_wb,
            name=self.name,
        )
        return out

    def extra_repr(self):
        name_str = f', name={self.name}' if self.name else ''
        return (
            'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, '
            'epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, '
            'dtype={}{}'.format(
                self.embed_dim,
                self.num_heads,
                self.dropout_rate,
                self.attn_dropout_rate,
                self._epsilon,
                self.kdim,
                self.vdim,
                self.normalize_before,
                self.need_weights,
                self._dtype,
                name_str,
            )
        )

    def _amp_decorate(self, dtype):
        # Tmp fix, will use Layer.amp_decorate to replace this.
        # Keep layer-norm parameters in float32; cast the rest to `dtype`.
        layer_norm_params_id = []
        if self.normalize_before:
            layer_norm_params_id.append(id(self.pre_ln_scale))
            layer_norm_params_id.append(id(self.pre_ln_bias))
        else:
            layer_norm_params_id.append(id(self.ln_scale))
            layer_norm_params_id.append(id(self.ln_bias))

        for key, param in self._parameters.items():
            if id(param) in layer_norm_params_id:
                continue
            if param is not None:
                with no_grad():
                    param_applied = _to_dtype(param, dtype)

        self._dtype = dtype


class FusedFeedForward(Layer):
    """
    Parameters:
        d_model (int): The expected feature size in the input and output.
        dim_feedforward (int): The hidden layer size.
        dropout_rate (float, optional): The dropout probability used in pre-process
            and post-process. Default 0.1
        epsilon (float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-05.
        activation (str, optional): The activation function. Default relu.
        act_dropout_rate (float, optional): The dropout probability after activation.
            If None, use the value of `dropout_rate`. Default None
        normalize_before (bool, optional): Indicate whether to put layer normalization
            into preprocessing or postprocessing. Default False
        linear1_weight_attr(ParamAttr, optional): To specify the weight parameter property
            for FFN first linear. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        linear1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
            for FFN first linear. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        linear2_weight_attr(ParamAttr, optional): To specify the weight parameter property
            for FFN second linear. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        linear2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
            for FFN second linear. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ln1_scale_attr(ParamAttr, optional): To specify the weight parameter property
            for FFN pre_layer_norm. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ln1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
            for FFN pre_layer_norm. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ln2_scale_attr(ParamAttr, optional): To specify the weight parameter property
            for FFN post_layer_norm. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ln2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
            for FFN layer_norm. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel.
        ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel.
        name (str, optional): The default value is None.  Normally there is no need for user to set
            this property. For more information, please refer to :ref:`api_guide_Name`.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> from paddle.incubate.nn import FusedFeedForward
            >>> paddle.device.set_device('gpu')

            >>> fused_feedforward_layer = FusedFeedForward(8, 8)
            >>> x = paddle.rand((1, 8, 8))
            >>> out = fused_feedforward_layer(x)
            >>> print(out.shape)
            [1, 8, 8]
    皙?r#   reluNFr   rS   c                    s  t    |dksJ d| |dksJ d|| j | _|| _|| dks,J || }|| _|| _|d u r<|n|| _	|| _
|| _|| _|| _| j||g|| jdd| _| j|g|	| jdd| _| j||g|
| jdd| _| j|g|| jdd| _|dkr|dksJ t| j t| j t| j |r| j|g|dtd	d
| _| j|g|dd| _d | _d | _nd | _d | _| j|g|dtd	d
| _| j|g|dd| _|| _d S )Nr   4Expected d_model to be greater than 0, but received >Expected dim_feedforward to be greater than 0, but received {}Fr%   Tr   rS   r(   r   r&   r'   r+   r   r&   r'   )r-   r.   rJ   r/   r0   r1   _d_model_dim_feedforward_dropout_rate_act_dropout_rate_act_method_normalize_beforer8   rX   r3   _linear1_weight_linear1_bias_linear2_weight_linear2_biasr   r	   
_ln1_scale	_ln1_bias
_ln2_scale	_ln2_biasr   )r9   d_modeldim_feedforwardr7   r<   
activationact_dropout_raterW   linear1_weight_attrlinear1_bias_attrlinear2_weight_attrlinear2_bias_attrZln1_scale_attrZln1_bias_attrZln2_scale_attrZln2_bias_attrrm   rn   r   r=   r   r   r.   4  s   






zFusedFeedForward.__init__c                 C   sT   t j|| j| j| j| j| j| j| j| j	| j
| j| j| j| j| j| j| j| jd}|S )N)	Zdropout1_rateZdropout2_rater   Zln1_epsilonZln2_epsilonro   rC   rn   r   )rE   Zfused_feedforwardr   r   r   r   r   r   r   r   r   r   r   r8   r   rC   rX   r   )r9   srcrs   rF   r   r   r   rG     s*   zFusedFeedForward.forwardc                 C   s@   | j r	d| j  nd}d| j| j| j| j| j| j| j| j	|	S )NrH   rI   zd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{})
r   rJ   r   r   r   r8   r   r   r   r1   rK   r   r   r   rL     s   zFusedFeedForward.extra_reprc              	   C   rt   ru   )r   rv   rw   r   r   r   r   rx   ry   r   r    r1   rz   r   r   r   r|     r}   zFusedFeedForward._amp_decorate)r   r#   r   NFNNNNNNNNr   rS   Nru   r~   r   r   r=   r   r     s,    A
kr   c                       s:   e Zd ZdZ							d
 fdd	Zddd	Z  ZS )FusedTransformerEncoderLayera  

    FusedTransformerEncoderLayer is composed of two sub-layers: self (multi-head)
    attention and a feedforward network. Before and after each sub-layer, pre-process
    and post-process steps are applied to the input and output accordingly. If
    `normalize_before` is True, the pre-process is layer normalization and the
    post-process includes dropout and residual connection. Otherwise, there is no
    pre-process, and the post-process includes dropout, residual connection, and
    layer normalization.

    Parameters:
        d_model (int): The expected feature size in the input and output.
        nhead (int): The number of heads in multi-head attention(MHA).
        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
        dropout_rate (float, optional): The dropout probability used in pre-process
            and post-process of MHA and FFN sub-layers. Default 0.1
        activation (str, optional): The activation function in the feedforward
            network. Default relu.
        attn_dropout_rate (float, optional): The dropout probability used
            in MHA to drop some attention target. If None, use the value of
            `dropout`. Default None
        act_dropout_rate (float, optional): The dropout probability used after FFN
            activation. If None, use the value of `dropout`. Default None
        normalize_before (bool, optional): Indicate whether to put layer normalization
            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
            normalization and the post-process includes dropout and residual connection.
            Otherwise, there is no pre-process, and the post-process includes dropout,
            residual connection, and layer normalization. Default False
        weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.
            If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for
            MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN.
            Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
            Default: None, which means the default weight parameter property is used.
            See usage for details in :code:`ParamAttr` .
        bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.
            If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for
            MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
            Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
            The `False` value means the corresponding layer would not have trainable
            bias parameter. See usage for details in :code:`ParamAttr` . Default: None,
            which means the default bias parameter property is used.


    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> from paddle.incubate.nn import FusedTransformerEncoderLayer
            >>> paddle.device.set_device('gpu')

            >>> # encoder input: [batch_size, src_len, d_model]
            >>> enc_input = paddle.rand((2, 4, 128))
            >>> # self attention mask: [batch_size, n_head, src_len, src_len]
            >>> attn_mask = paddle.rand((2, 2, 4, 4))
            >>> encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
            >>> enc_output = encoder_layer(enc_input, attn_mask)
            >>> print(enc_output.shape)
            [2, 4, 128]

    """

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward,
        dropout_rate=0.1,
        activation="relu",
        attn_dropout_rate=None,
        act_dropout_rate=None,
        normalize_before=False,
        weight_attr=None,
        bias_attr=None,
    ):
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super().__init__()
        assert (
            d_model > 0
        ), f"Expected d_model to be greater than 0, but received {d_model}"
        assert (
            nhead > 0
        ), f"Expected nhead to be greater than 0, but received {nhead}"
        assert dim_feedforward > 0, (
            "Expected dim_feedforward to be greater than 0, "
            f"but received {dim_feedforward}"
        )
        attn_dropout_rate = (
            dropout_rate if attn_dropout_rate is None else attn_dropout_rate
        )
        act_dropout_rate = (
            dropout_rate if act_dropout_rate is None else act_dropout_rate
        )
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)

        self.fused_attn = FusedMultiHeadAttention(
            d_model,
            nhead,
            dropout_rate=dropout_rate,
            attn_dropout_rate=attn_dropout_rate,
            normalize_before=self.normalize_before,
            qkv_weight_attr=weight_attrs[0],
            qkv_bias_attr=bias_attrs[0],
            linear_weight_attr=weight_attrs[0],
            linear_bias_attr=bias_attrs[0],
            pre_ln_scale_attr=weight_attrs[0],
            pre_ln_bias_attr=bias_attrs[0],
            ln_scale_attr=weight_attrs[0],
            ln_bias_attr=bias_attrs[0],
        )

        self.ffn = FusedFeedForward(
            d_model,
            dim_feedforward,
            dropout_rate=dropout_rate,
            activation=activation,
            act_dropout_rate=act_dropout_rate,
            normalize_before=self.normalize_before,
            linear1_weight_attr=weight_attrs[1],
            linear1_bias_attr=bias_attrs[1],
            linear2_weight_attr=weight_attrs[1],
            linear2_bias_attr=bias_attrs[1],
        )

    def forward(self, src, src_mask=None, cache=None):
        """

        Applies a Transformer encoder layer on the input.

        Parameters:
            src (Tensor): The input of Transformer encoder layer. It is
                a tensor with shape `[batch_size, sequence_length, d_model]`.
                The data type should be float32 or float64.
            src_mask (Tensor, optional): A tensor used in multi-head attention
                to prevents attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
                When the data type is bool, the unwanted positions have `False`
                values and the others have `True` values. When the data type is
                int, the unwanted positions have 0 values and the others have 1
                values. When the data type is float, the unwanted positions have
                `-INF` values and the others have 0 values. It can be None when
                nothing wanted or needed to be prevented attention to. Default None.
            cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.
                See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is
                only used for inference and should be None for training. Default
                None.

        Returns:
            Tensor|tuple, It is a tensor that has the same shape and data type \
                as `enc_input`, representing the output of Transformer encoder \
                layer. Or a tuple if `cache` is not None, except for encoder \
                layer output, the tuple includes the new cache which is same \
                as input `cache` argument but `incremental_cache` has an \
                incremental length. See `MultiHeadAttention.gen_cache` and \
                `MultiHeadAttention.forward` for more details.

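        Examples:
            A minimal sketch of building a boolean padding mask for `src_mask`,
            assuming the last position of the second sequence is padding; the
            sizes below are illustrative only:

            .. code-block:: python

                >>> # doctest: +SKIP('illustrative mask-construction sketch')
                >>> import paddle
                >>> batch_size, src_len = 2, 4
                >>> valid = paddle.ones((batch_size, 1, 1, src_len))
                >>> valid[1, :, :, -1] = 0  # mask out the final position
                >>> # broadcastable to [batch_size, n_head, src_len, src_len]
                >>> src_mask = valid.astype('bool')
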
        """
        src_mask = _convert_attention_mask(src_mask, src.dtype)
        if cache is None:
            attn_out = self.fused_attn(src, attn_mask=src_mask)
        else:
            attn_out, incremental_cache = self.fused_attn(
                src, attn_mask=src_mask, cache=cache
            )

        ffn_out = self.ffn(attn_out)

        return ffn_out if cache is None else (ffn_out, incremental_cache)


class FusedTransformer(Layer):
    """
    A Transformer model composed of an instance of `TransformerEncoder` and an
    instance of `TransformerDecoder`. While the embedding layer and output layer
    are not included.

    Please refer to `Attention is all you need <http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf>`_ ,
    and see `TransformerEncoder` and `TransformerDecoder` for more details.

    Users can configure the model architecture with the corresponding parameters.
    Note the usage of `normalize_before`, which indicates where to apply layer
    normalization (in pre-process or post-process of multi-head attention or FFN);
    some Transformer-like models differ on this, such as
    `BERT <https://arxiv.org/abs/1810.04805>`_ and `GPT2 <https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf>`_ .
    The default architecture here places layer normalization in post-process and
    applies another layer normalization on the output of last encoder/decoder layer.

    Parameters:
        d_model (int, optional): The expected feature size in the encoder/decoder input
            and output. Default 512
        nhead (int, optional): The number of heads in multi-head attention(MHA). Default 8
        num_encoder_layers (int, optional): The number of layers in encoder. Default 6
        num_decoder_layers (int, optional): The number of layers in decoder. Default 6
        dim_feedforward (int, optional): The hidden layer size in the feedforward network(FFN). Default 2048
        dropout (float, optional): The dropout probability used in pre-process
            and post-process of MHA and FFN sub-layers. Default 0.1
        activation (str, optional): The activation function in the feedforward
            network. Default relu.
        attn_dropout (float, optional): The dropout probability used
            in MHA to drop some attention target. If None, use the value of
            `dropout`. Default None
        act_dropout (float, optional): The dropout probability used after FFN
            activation. If None, use the value of `dropout`. Default None
        normalize_before (bool, optional): Indicate whether to put layer normalization
            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
            normalization and the post-process includes dropout and residual connection.
            Otherwise, there is no pre-process, and the post-process includes dropout,
            residual connection, and layer normalization. Default False
        weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.
            If it is a list/tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3,
            `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]`
            would be used as `weight_attr` for cross attention of `TransformerDecoder`,
            and `weight_attr[2]` would be used as `weight_attr` for linear in FFN.
            If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention
            and cross attention and `weight_attr[1]` would be used as `weight_attr` for
            linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr`
            for self attention, cross attention and linear in FFN. Otherwise,
            the three sub-layers all use it as `weight_attr` to create parameters.
            Default: None, which means the default weight parameter property is used.
            See usage for details
            in :code:`ParamAttr` .
        bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.
            If it is a list/tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3,
            `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]`
            would be used as `bias_attr` for cross attention of `TransformerDecoder`,
            and `bias_attr[2]` would be used as `bias_attr` for linear in FFN.
            If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention
            and cross attention and `bias_attr[1]` would be used as `bias_attr` for
            linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr`
            for self attention, cross attention and linear in FFN. Otherwise,
            the three sub-layers all use it as `bias_attr` to create parameters.
            The `False` value means the corresponding layer would not have trainable
            bias parameter. See usage for details in :code:`ParamAttr` .
            Default: None, which means the default bias parameter property is used.
        custom_encoder (Layer, optional): If custom encoder is provided, use it as the encoder.
            Default None
        custom_decoder (Layer, optional): If custom decoder is provided, use it as the decoder.
            Default None

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.nn import Transformer

            >>> # src: [batch_size, tgt_len, d_model]
            >>> enc_input = paddle.rand((2, 4, 128))
            >>> # tgt: [batch_size, src_len, d_model]
            >>> dec_input = paddle.rand((2, 6, 128))
            >>> # src_mask: [batch_size, n_head, src_len, src_len]
            >>> enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
            >>> # tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
            >>> dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
            >>> # memory_mask: [batch_size, n_head, tgt_len, src_len]
            >>> cross_attn_mask = paddle.rand((2, 2, 6, 4))
            >>> transformer = Transformer(128, 2, 4, 4, 512)
            >>> output = transformer(enc_input,
            ...                      dec_input,
            ...                      enc_self_attn_mask,
            ...                      dec_self_attn_mask,
            ...                      cross_attn_mask)
            >>> print(output.shape)
            [2, 6, 128]
    """

    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        attn_dropout=None,
        act_dropout=None,
        normalize_before=False,
        weight_attr=None,
        bias_attr=None,
        custom_encoder=None,
        custom_decoder=None,
    ):
        super().__init__()
        raise NotImplementedError()

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None):
        raise NotImplementedError()


class FusedMultiTransformer(Layer):
    """
    FusedMultiTransformer is composed of multiple transformer layers, each of which
    contains two sub-layers: self (multi-head) attention and a feedforward network.
    The function of one transformer layer is consistent with the following pseudo code:

    .. code-block:: python

        >>> # doctest: +SKIP('This is not an example')
        >>> if pre_layer_norm:
        ...     out = layer_norm(x)
        ...     out = qkv_linear(out) + qkv_bias
        ... else:
        ...     out = qkv_linear(x) + qkv_bias
        >>> out = transpose(out, perm=[2, 0, 3, 1, 4])
        >>> # extract q, k and v from out.
        >>> q = out[0:1, ::]
        >>> k = out[1:2, ::]
        >>> v = out[2:3, ::]
        >>> out = q * k^t
        >>> out = attn_mask + out
        >>> out = softmax(out)
        >>> out = dropout(out)
        >>> out = out * v
        >>> out = transpose(out, perm=[0, 2, 1, 3])
        >>> out = linear(out)
        >>> if pre_layer_norm:
        ...     out = x + dropout(out + bias)
        ... else:
        ...     out = layer_norm(x + dropout(out + bias))

        >>> residual = out
        >>> if pre_layer_norm:
        ...     out = ffn_layer_norm(out)
        >>> out = ffn1_linear(out)
        >>> out = dropout(activation(out + ffn1_bias))
        >>> out = ffn2_linear(out)
        >>> out = residual + dropout(out + ffn2_bias)
        >>> if not pre_layer_norm:
        ...     out = ffn_layer_norm(out)

    Parameters:
        embed_dim (int): The expected feature size in the input and output.
        num_heads (int): The number of heads in multi-head attention(MHA).
        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
        dropout_rate (float, optional): The dropout probability used in pre-process
            and post-process of MHA and FFN sub-layers. Default 0.0
        activation (str, optional): The activation function in the feedforward
            network. Default "gelu".
        normalize_before (bool, optional): Indicate whether to put layer normalization
            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
            normalization and the post-process includes dropout and residual connection.
            Otherwise, there is no pre-process, and the post-process includes dropout,
            residual connection, and layer normalization. Default True
        ln_scale_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property
            for Attention layer_norm. For Attention layer_norm weight, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ln_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property
            for Attention layer_norm. For Attention layer_norm bias, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        qkv_weight_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property
            for Attention qkv computation. For Attention qkv weight, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        qkv_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property
            for Attention qkv computation. For Attention qkv bias, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        linear_weight_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property
            for Attention linear. For Attention linear weight, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        linear_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property
            for Attention linear computation. For Attention linear bias, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ffn_ln_scale_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property
            for FFN layer_norm. For FFN layer_norm weight, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ffn_ln_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property
            for FFN layer_norm. For FFN layer_norm bias, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ffn1_weight_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property
            for FFN first linear. For FFN first linear weight, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ffn1_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property
            for FFN first linear. For FFN first linear bias, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ffn2_weight_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property
            for FFN second linear. For FFN second linear weight, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. Default: None, which means the default weight
            parameter property is used. See usage for details in :code:`ParamAttr`.
        ffn2_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property
            for FFN second linear. For FFN second linear bias, if it is a list/tuple, `attrs[0]`
            would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as
            `attr` for transformer layer 1, etc. Otherwise, all layers both use it as
            `attr` to create parameters. The `False` value means the corresponding layer would
            not have trainable bias parameter. Default: None, which means the default bias
            parameter property is used. See usage for details in :code:`ParamAttr`.
        epsilon (float, optional): Small float value added to denominator of the layer_norm to
            avoid dividing by zero. Default: 1e-05.
        num_layers (int, optional): The number of layers of the transformer. If `qkv_weight_attrs`
            is a list or tuple, the number of layers is obtained from `qkv_weight_attrs`. num_layers
            only takes effect when `qkv_weight_attrs` is not a list or tuple. Default: -1.
        nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using mp.
        trans_qkvw (bool, optional): Whether to transpose for weights of qkv.
            If true, the shape of the qkv weights should be [3, num_head, dim_head, dim_embed].
            Otherwise the shape of weights of qkv should be [dim_embed, 3, num_head, dim_head]. Default: True.
        ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using mp.
        name (str, optional): The default value is None.  Normally there is no need for user to set
            this property. For more information, please refer to :ref:`api_guide_Name`.

    Examples:

        .. code-block:: python

            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> from paddle.incubate.nn import FusedMultiTransformer
            >>> paddle.device.set_device('gpu')

            >>> # encoder input: [batch_size, src_len, d_model]
            >>> enc_input = paddle.rand((2, 4, 128))
            >>> # self attention mask: [batch_size, 1, src_len, src_len]
            >>> attn_mask = paddle.rand((2, 1, 4, 4))
            >>> encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1)
            >>> enc_output = encoder_layers(enc_input, attn_mask)
            >>> print(enc_output.shape)
            [2, 4, 128]
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        dim_feedforward,
        dropout_rate=0.0,
        activation="gelu",
        normalize_before=True,
        ln_scale_attrs=None,
        ln_bias_attrs=None,
        qkv_weight_attrs=None,
        qkv_bias_attrs=None,
        linear_weight_attrs=None,
        linear_bias_attrs=None,
        ffn_ln_scale_attrs=None,
        ffn_ln_bias_attrs=None,
        ffn1_weight_attrs=None,
        ffn1_bias_attrs=None,
        ffn2_weight_attrs=None,
        ffn2_bias_attrs=None,
        epsilon=1e-05,
        num_layers=-1,
        nranks=1,
        trans_qkvw=True,
        ring_id=-1,
        name=None,
    ):
        super().__init__()

        assert (
            embed_dim > 0
        ), f"Expected embed_dim to be greater than 0, but received {embed_dim}"
        assert (
            num_heads > 0
        ), f"Expected nhead to be greater than 0, but received {num_heads}"
        assert dim_feedforward > 0, (
            "Expected dim_feedforward to be greater than 0, "
            f"but received {dim_feedforward}"
        )

        self.normalize_before = normalize_before
        self._dtype = self._helper.get_default_dtype()
        self._epsilon = epsilon
        self._trans_qkvw = trans_qkvw
        self._ring_id = ring_id

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == embed_dim
        ), "embed_dim must be divisible by num_heads"

        # tensor model parallel
        if nranks > 1:
            assert ring_id != -1
        assert num_heads % nranks == 0
        assert dim_feedforward % nranks == 0
        num_heads = num_heads // nranks
        dim_feedforward = dim_feedforward // nranks

        if isinstance(qkv_weight_attrs, (list, tuple)):
            num_layers = len(qkv_weight_attrs)
        assert num_layers > 0

        self.ln_scales, self.ln_biases = [], []
        self.qkv_weights, self.qkv_biases = [], []
        self.linear_weights, self.linear_biases = [], []
        self.ffn_ln_scales, self.ffn_ln_biases = [], []
        self.ffn1_weights, self.ffn1_biases = [], []
        self.ffn2_weights, self.ffn2_biases = [], []

        def get_attr(attrs, idx):
            if isinstance(attrs, (list, tuple)):
                assert len(attrs) == num_layers
                return attrs[idx]
            return attrs

        for i in range(num_layers):
            ln_scale_attr = get_attr(ln_scale_attrs, i)
            ln_bias_attr = get_attr(ln_bias_attrs, i)
            qkv_weight_attr = get_attr(qkv_weight_attrs, i)
            qkv_bias_attr = get_attr(qkv_bias_attrs, i)
            linear_weight_attr = get_attr(linear_weight_attrs, i)
            linear_bias_attr = get_attr(linear_bias_attrs, i)
            ffn_ln_scale_attr = get_attr(ffn_ln_scale_attrs, i)
            ffn_ln_bias_attr = get_attr(ffn_ln_bias_attrs, i)
            ffn1_weight_attr = get_attr(ffn1_weight_attrs, i)
            ffn1_bias_attr = get_attr(ffn1_bias_attrs, i)
            ffn2_weight_attr = get_attr(ffn2_weight_attrs, i)
            ffn2_bias_attr = get_attr(ffn2_bias_attrs, i)

            ln_scale = self.create_parameter(
                attr=ln_scale_attr,
                shape=[embed_dim],
                default_initializer=Constant(value=1.0),
            )
            ln_bias = self.create_parameter(
                attr=ln_bias_attr, shape=[embed_dim], is_bias=True
            )
            qkv_weight = self.create_parameter(
                shape=(
                    [3, num_heads, self.head_dim, embed_dim]
                    if trans_qkvw
                    else [embed_dim, 3, num_heads, self.head_dim]
                ),
                attr=qkv_weight_attr,
                dtype=self._dtype,
                is_bias=False,
            )
            qkv_bias = self.create_parameter(
                shape=[3, num_heads, self.head_dim],
                attr=qkv_bias_attr,
                dtype=self._dtype,
                is_bias=True,
            )
            linear_weight = self.create_parameter(
                shape=[num_heads * self.head_dim, embed_dim],
                attr=linear_weight_attr,
                dtype=self._dtype,
                is_bias=False,
            )
            linear_bias = self.create_parameter(
                shape=[embed_dim],
                attr=linear_bias_attr,
                dtype=self._dtype,
                is_bias=True,
            )

            ffn_ln_scale = self.create_parameter(
                shape=[embed_dim],
                attr=ffn_ln_scale_attr,
                is_bias=False,
                default_initializer=Constant(1.0),
            )
            ffn_ln_bias = self.create_parameter(
                shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True
            )
            ffn1_weight = self.create_parameter(
                shape=[embed_dim, dim_feedforward],
                attr=ffn1_weight_attr,
                dtype=self._dtype,
                is_bias=False,
            )
            ffn1_bias = self.create_parameter(
                shape=[dim_feedforward],
                attr=ffn1_bias_attr,
                dtype=self._dtype,
                is_bias=True,
            )
            ffn2_weight = self.create_parameter(
                shape=[dim_feedforward, embed_dim],
                attr=ffn2_weight_attr,
                dtype=self._dtype,
                is_bias=False,
            )
            ffn2_bias = self.create_parameter(
                shape=[embed_dim],
                attr=ffn2_bias_attr,
                dtype=self._dtype,
                is_bias=True,
            )

            # tensor model parallel
            if nranks > 1:
                # column parallel
                _set_var_distributed(qkv_weight)
                _set_var_distributed(qkv_bias)
                _set_var_distributed(ffn1_weight)
                _set_var_distributed(ffn1_bias)
                # row parallel
                _set_var_distributed(linear_weight)
                _set_var_distributed(ffn2_weight)

            self.ln_scales.append(ln_scale)
            self.ln_biases.append(ln_bias)
            self.qkv_weights.append(qkv_weight)
            self.qkv_biases.append(qkv_bias)
            self.linear_weights.append(linear_weight)
            self.linear_biases.append(linear_bias)
            self.ffn_ln_scales.append(ffn_ln_scale)
            self.ffn_ln_biases.append(ffn_ln_bias)
            self.ffn1_weights.append(ffn1_weight)
            self.ffn1_biases.append(ffn1_bias)
            self.ffn2_weights.append(ffn2_weight)
            self.ffn2_biases.append(ffn2_bias)

        self.dropout_rate = dropout_rate
        self.activation = activation
        self.name = name

    def forward(
        self,
        src,
        attn_mask=None,
        caches=None,
        pre_caches=None,
        rotary_embs=None,
        rotary_emb_dims=0,
        seq_lens=None,
        time_step=None,
    ):
        r"""
        Applies multi transformer layers on the input.

        Parameters:
            src (Tensor): The input of Transformer layers. It is
                a tensor with shape `[batch_size, sequence_length, d_model]`.
                The data type should be float16 or float32.
            attn_mask (Tensor, optional): A tensor used in multi-head attention
                to prevents attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                `[batch_size, 1, sequence_length, sequence_length]`. It can be
                None when nothing wanted or needed to be prevented attention to.
                Default None.
            caches (list(Tensor)|tuple(Tensor), optional): The cache structure
                tensors for the inference generation model. It is only used for
                inference and should be None for training. The shape is
                `[2, batch_size, num_head, max_seq_len, head_dim]`. Default None.
            pre_caches (list(Tensor)|tuple(Tensor), optional): The prefix caches
                for the generation model. The shape is `[2, bsz, num\_head, cache\_len, head\_dim]`. Default None.
            rotary_embs (Tensor, optional): The RoPE embs for the rotary computation. The shape is `[2, bsz, 1, seq\_len, head\_dim]`. Default None.
            rotary_emb_dims (int, optional): The rotary_emb_dims of rotary computation, and it is 0 when rotary_embs is None,
                1 when rotary_embs is not None and pos_extra_ids is None, 2 when rotary_embs and pos_extra_ids are both not None. Default 0.
            seq_lens (Tensor, optional): The sequence lengths of this batch. The shape is `[bsz]`. Default None.
            time_step (Tensor, optional): The time step tensor for the generation
                model. It is used in the decode stage to represent the time step,
                that is, the real seq_len of CacheKV. The shape is `[1]`, must be
                in CPUPlace. Default None.

        Returns:
            Tensor|tuple: If `caches` is None, return a tensor that has
            the same shape and data type as `src`, representing the output
            of the Transformer layers. If `caches` is not None, return the
            tuple (output, caches), where `output` is the output of the
            Transformer layers and `caches` is updated in place from the
            input `caches`.
        """
        if caches is not None:
            assert len(caches) == len(self.qkv_weights)

        out = incubate_f.fused_multi_transformer(
            src,
            self.ln_scales,
            self.ln_biases,
            self.qkv_weights,
            self.qkv_biases,
            self.linear_weights,
            self.linear_biases,
            self.ffn_ln_scales,
            self.ffn_ln_biases,
            self.ffn1_weights,
            self.ffn1_biases,
            self.ffn2_weights,
            self.ffn2_biases,
            pre_layer_norm=self.normalize_before,
            epsilon=self._epsilon,
            cache_kvs=caches,
            pre_caches=pre_caches,
            rotary_embs=rotary_embs,
            time_step=time_step,
            seq_lens=seq_lens,
            attn_mask=attn_mask,
            dropout_rate=self.dropout_rate,
            rotary_emb_dims=rotary_emb_dims,
            activation=self.activation,
            training=self.training,
            mode='upscale_in_train',
            trans_qkvw=self._trans_qkvw,
            ring_id=self._ring_id,
            name=self.name,
        )
        return out