import os

import paddle
from paddle.autograd import PyLayer
from paddle.base import core
from paddle.distributed import fleet
from paddle.nn import functional as F

from ....communication.reduce import ReduceOp, _get_reduce_op
from ...base import topology as tp
from ...utils.log_util import logger
from . import mp_ops
from .random import get_rng_state_tracker

__all__ = []


def is_fused_matmul_bias_supported():
    return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue')


def is_fused_linear_param_grad_add_supported():
    if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():
        return hasattr(paddle._C_ops, 'fused_linear_param_grad_add')
    return False


class VocabParallelEmbedding(paddle.nn.Layer):
    """Embedding mp parallelized in the vocabulary dimension.
    This class is used for splitting the embedding table across the mp group.

    Args:
        num_embeddings(int): The size of the dictionary of embeddings.
        embedding_dim(int): The size of each embedding vector.
        weight_attr(ParamAttr|None): To specify the weight parameter property. Default: None, which means the
            default weight parameter property is used. See usage for details in :ref:`api_paddle_ParamAttr`. In addition,
            user-defined or pre-trained word vectors can be loaded with the :attr:`weight_attr` parameter.
            The local word vector needs to be transformed into numpy format, and the shape of local word
            vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_paddle_nn_initializer_Assign`
            is used to load custom or pre-trained word vectors. See code example for details.
        mp_group(Group): The tensor parallel group.
        name(str, optional): For detailed information, please refer
            to :ref:`api_guide_Name`. Usually there is no need to set this
            parameter; it is None by default.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> from paddle.distributed import fleet

            >>> class SimpleMPNet(paddle.nn.Layer):
            ...     def __init__(self, vocab_size, hidden_size, inner_size, output_size):
            ...         super().__init__()
            ...         self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
            ...             hidden_size,
            ...             inner_size,
            ...             gather_output=False,
            ...             has_bias=True)
            ...         self.linear2 = fleet.meta_parallel.RowParallelLinear(
            ...             inner_size,
            ...             hidden_size,
            ...             input_is_parallel=True,
            ...             has_bias=True)
            ...         self.linear3 = paddle.nn.Linear(hidden_size, output_size)
            ...         self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
            ...                         vocab_size,
            ...                         hidden_size)
            ...     def forward(self, x):
            ...         x = self.embedding(x)
            ...         x = self.linear1(x)
            ...         x = self.linear2(x)
            ...         x = self.linear3(x)
            ...         return x

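        The example above assumes the hybrid parallel environment has already
        been initialized. A minimal sketch of that setup (the parallel degrees
        below are assumptions for illustration, not defaults):

        .. code-block:: python

            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
            >>> from paddle.distributed import fleet
            >>> strategy = fleet.DistributedStrategy()
            >>> strategy.hybrid_configs = {
            ...     "dp_degree": 1,
            ...     "mp_degree": 2,  # assumed 2-way tensor parallelism
            ...     "pp_degree": 1,
            ... }
            >>> fleet.init(is_collective=True, strategy=strategy)
            >>> model = SimpleMPNet(
            ...     vocab_size=1024, hidden_size=512, inner_size=2048, output_size=512)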
    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        weight_attr=None,
        mp_group=None,
        name=None,
    ):
        super().__init__()

        self.model_parallel_group = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group()
            if mp_group is None
            else mp_group
        )
        self.world_size = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size()
            if mp_group is None
            else mp_group.nranks
        )
        self.rank = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank()
            if mp_group is None
            else mp_group.rank
        )

        self.origin_num_embeddings = num_embeddings
        self.is_mp = self.world_size > 1

        assert (
            num_embeddings % self.world_size == 0
        ), "The length of the vocabulary must be divisible by the parallelism degree of MP"

        per_part_size = num_embeddings // self.world_size

        self.vocab_start_index = self.rank * per_part_size
        self._dtype = self._helper.get_default_dtype()
        self._size = [per_part_size, embedding_dim]
        self._weight_attr = weight_attr
        self._name = name
        self.num_embeddings = num_embeddings

        if self.is_mp and paddle.in_dynamic_mode():
            with get_rng_state_tracker().rng_state():
                self.weight = self.create_parameter(
                    attr=self._weight_attr,
                    shape=self._size,
                    dtype=self._dtype,
                    is_bias=False,
                )
        else:
            self.weight = self.create_parameter(
                attr=self._weight_attr,
                shape=self._size,
                dtype=self._dtype,
                is_bias=False,
            )

        self.weight.is_distributed = True if self.is_mp else False
        if self.weight.is_distributed:
            # the embedding table is sharded along axis 0 (the vocab axis)
            self.weight.split_axis = 0

    def forward(self, x):
        if self.is_mp:
            # look up only the local vocabulary shard, then sum the partial
            # results across the model parallel group
            output_parallel = mp_ops._c_lookup_table(
                self.weight,
                x,
                start_index=self.vocab_start_index,
                vocab_size=self.num_embeddings,
                name=self._name,
            )
            output = mp_ops._mp_allreduce(
                output_parallel,
                group=self.model_parallel_group,
                use_calc_stream=True,
                use_model_parallel=True,
            )
        else:
            output = F.embedding(
                x,
                weight=self.weight,
                padding_idx=None,
                sparse=False,
                name=self._name,
            )
        return output


_raise_cuda_env_unset_warning = True


class InnerOverlapLinear(paddle.autograd.PyLayer):
    @staticmethod
    def forward(
        ctx,
        x,
        weight,
        bias,
        fuse_matmul_bias,
        mp_async_allreduce,
        mp_skip_c_identity,
        mp_fused_linear_param_grad_add,
        model_parallel_group,
    ):
        ctx.save_for_backward(x, weight, bias)
        ctx.model_parallel_group = model_parallel_group
        ctx.mp_fused_linear_param_grad_add = mp_fused_linear_param_grad_add
        if mp_skip_c_identity is False:
            x = paddle._legacy_C_ops.c_identity(
                x,
                'use_calc_stream',
                True,
                'ring_id',
                model_parallel_group.id,
                'use_model_parallel',
                True,
            )
        if not fuse_matmul_bias:
            return paddle._C_ops.linear(x, weight, bias)
        else:
            return paddle._legacy_C_ops.fused_gemm_epilogue(x, weight, bias)

    @staticmethod
    def backward(ctx, dy):
        x, weight, bias = ctx.saved_tensor()
        if dy.dtype == weight.dtype:
            dx = paddle.matmul(dy, weight, transpose_y=True)
        else:
            dx = paddle.matmul(
                dy, paddle.cast(weight, dtype=dy.dtype), transpose_y=True
            )
        op_type = _get_reduce_op(ReduceOp.SUM, "_c_identity")
        # launch the allreduce of dx asynchronously so that it overlaps with
        # the dw computation below; wait() is called just before returning
        task = ctx.model_parallel_group.process_group.all_reduce(
            dx, op_type, sync_op=False
        )
        if int(os.getenv("CUDA_DEVICE_MAX_CONNECTIONS", "0")) != 1:
            global _raise_cuda_env_unset_warning
            if _raise_cuda_env_unset_warning:
                logger.warning(
                    "You set mp_async_allreduce=True, but you forget to set "
                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS=1, "
                    "which may leads to performance loss. Try to export "
                    "CUDA_DEVICE_MAX_CONNECTIONS=1 for better performance."
                )
            _raise_cuda_env_unset_warning = False
        tmp = paddle.ones([512])

        if ctx.mp_fused_linear_param_grad_add:
            if not is_fused_linear_param_grad_add_supported():
                raise NotImplementedError(
                    "You set mp_fused_linear_param_grad_add=True, however, "
                    "the paddle you are using not support this operation. "
                    "Please unset fused_linear_param_grad_add or use paddle "
                    "compiled with cuda 11.6 or higher."
                )
            if bias is None:
                if hasattr(weight, "main_grad"):
                    (
                        weight.main_grad,
                        _,
                    ) = paddle._C_ops.fused_linear_param_grad_add(
                        x, dy, weight.main_grad, None, True, False
                    )
                    task.wait()
                    return dx, None
                else:
                    if weight.grad is not None:
                        (
                            weight.grad,
                            _,
                        ) = paddle._C_ops.fused_linear_param_grad_add(
                            x, dy, weight.grad, None, False, False
                        )
                        task.wait()
                        return dx, None
                    else:
                        dw, _ = paddle._C_ops.fused_linear_param_grad_add(
                            x, dy, None, None, False, False
                        )
                        task.wait()
                        return dx, dw

            if hasattr(weight, "main_grad") and hasattr(bias, "main_grad"):
                (
                    weight.main_grad,
                    bias.main_grad,
                ) = paddle._C_ops.fused_linear_param_grad_add(
                    x, dy, weight.main_grad, bias.main_grad, True, True
                )
                task.wait()
                return dx, None, None
            else:
                if weight.grad is not None:
                    assert bias.grad is not None
                    (
                        weight.grad,
                        bias.grad,
                    ) = paddle._C_ops.fused_linear_param_grad_add(
                        x, dy, weight.grad, bias.grad, False, True
                    )
                    task.wait()
                    return dx, None, None
                else:
                    dw, dbias = paddle._C_ops.fused_linear_param_grad_add(
                        x, dy, None, None, False, True
                    )
                    task.wait()
                    return dx, dw, dbias
        else:
            dy = dy.reshape([-1, dy.shape[-1]])
            dw = paddle.matmul(
                x.reshape([-1, x.shape[-1]]), dy, transpose_x=True
            )
            if bias is None:
                task.wait()
                return dx, dw
            else:
                dbias = paddle.sum(dy, axis=0)
                task.wait()
                return dx, dw, dbias
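# Note: InnerOverlapLinear overlaps the allreduce of the input gradient (dx)
# with the computation of the weight gradient (dw): backward launches the
# allreduce as an async task and only calls task.wait() right before
# returning. As the warning above indicates, CUDA_DEVICE_MAX_CONNECTIONS=1
# should be exported, otherwise the communication may not actually overlap
# the computation.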
class ColumnParallelLinear(paddle.nn.Layer):
    """Linear layer with mp parallelized (column).
    This class is used for splitting a Linear layer in the mp group; the weight of the Linear layer is split by column.

    Args:
        in_features(int): The number of input units.
        out_features(int): The number of output units.
        weight_attr(ParamAttr|None): The attribute for the learnable weight of this layer. The default value is None,
            which means the default weight parameter property is used. For detailed information, please refer to paddle.ParamAttr.
        has_bias(bool): whether to add bias.
        gather_output(bool): whether to do an allgather over the outputs of all ranks.
        fuse_matmul_bias(bool): whether to fuse matmul and bias.
        mp_group(Group): The tensor parallel group.
        name(str, optional): Normally there is no need for the user to set this parameter.
            For detailed information, please refer to :ref:`api_guide_Name` .

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> from paddle.distributed import fleet

            >>> class SimpleMPNet(paddle.nn.Layer):
            ...     def __init__(self, vocab_size, hidden_size, inner_size, output_size):
            ...         super().__init__()
            ...         self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
            ...             hidden_size,
            ...             inner_size,
            ...             gather_output=False,
            ...             has_bias=True)
            ...         self.linear2 = fleet.meta_parallel.RowParallelLinear(
            ...             inner_size,
            ...             hidden_size,
            ...             input_is_parallel=True,
            ...             has_bias=True)
            ...         self.linear3 = paddle.nn.Linear(hidden_size, output_size)
            ...         self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
            ...                         vocab_size,
            ...                         hidden_size)
            ...     def forward(self, x):
            ...         x = self.embedding(x)
            ...         x = self.linear1(x)
            ...         x = self.linear2(x)
            ...         x = self.linear3(x)
            ...         return x
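        The async-allreduce fast path of this layer is controlled by the
        ``mp_configs`` section of the hybrid configs. A minimal sketch of
        enabling it (the flag values are assumptions for illustration, not
        defaults):

        .. code-block:: python

            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
            >>> from paddle.distributed import fleet
            >>> strategy = fleet.DistributedStrategy()
            >>> strategy.hybrid_configs = {
            ...     "dp_degree": 1,
            ...     "mp_degree": 2,
            ...     "pp_degree": 1,
            ...     "mp_configs": {
            ...         # overlap the dx allreduce with the dw matmul in backward
            ...         "mp_async_allreduce": True,
            ...     },
            ... }
            >>> fleet.init(is_collective=True, strategy=strategy)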
    """

    def __init__(
        self,
        in_features,
        out_features,
        weight_attr=None,
        has_bias=None,
        gather_output=True,
        fuse_matmul_bias=False,
        mp_group=None,
        name=None,
    ):
        super().__init__()

        self.model_parallel_group = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group()
            if mp_group is None
            else mp_group
        )
        self.world_size = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size()
            if mp_group is None
            else mp_group.nranks
        )
        self._name = name
        self.is_mp = self.world_size > 1
        self.gather_output = gather_output

        assert out_features % self.world_size == 0, (
            f"Number of column of the weight for linear ({out_features}) "
            f"must be divisible by model parallel size ({self.world_size})"
        )
        self.output_size_per_partition = out_features // self.world_size

        self._weight_attr = weight_attr
        self._dtype = self._helper.get_default_dtype()

        if self.is_mp and paddle.in_dynamic_mode():
            with get_rng_state_tracker().rng_state():
                self.weight = self.create_parameter(
                    shape=[in_features, self.output_size_per_partition],
                    attr=self._weight_attr,
                    dtype=self._dtype,
                    is_bias=False,
                )
        else:
            self.weight = self.create_parameter(
                shape=[in_features, self.output_size_per_partition],
                attr=self._weight_attr,
                dtype=self._dtype,
                is_bias=False,
            )

        self.weight.is_distributed = True if self.is_mp else False
        if self.weight.is_distributed:
            # the weight is sharded along axis 1 (the output axis)
            self.weight.split_axis = 1

        if has_bias:
            # initialize bias to zero like Megatron
            self.bias = self.create_parameter(
                shape=[self.output_size_per_partition],
                attr=paddle.nn.initializer.Constant(value=0.0),
                dtype=self._dtype,
                is_bias=True,
            )
            self.bias.is_distributed = True if self.is_mp else False
            if self.bias.is_distributed:
                self.bias.split_axis = 0
        else:
            self.bias = None

        self.linear = F.linear
        self.fuse_matmul_bias = fuse_matmul_bias

        mp_configs = fleet.fleet._user_defined_strategy.hybrid_configs[
            "mp_configs"
        ]
        self.mp_async_allreduce = self.is_mp and mp_configs.mp_async_allreduce
        self.mp_skip_c_identity = (
            self.is_mp
            and mp_configs.mp_async_allreduce
            and mp_configs.mp_skip_c_identity
        )
        self.mp_fused_linear_param_grad_add = (
            self.is_mp
            and mp_configs.mp_async_allreduce
            and mp_configs.mp_fused_linear_param_grad_add
        )
        if (
            self.mp_async_allreduce
            or self.mp_skip_c_identity
            or self.mp_fused_linear_param_grad_add
        ):
            assert paddle.in_dynamic_mode(), (
                "mp_async_allreduce, mp_skip_c_identity and "
                "mp_fused_linear_param_grad_add are only available "
                "under dygraph mode"
            )

        if self.fuse_matmul_bias:
            if not is_fused_matmul_bias_supported():
                raise NotImplementedError(
                    "You set fuse_matmul_bias=True in ColumnParallelLinear, "
                    "however, the paddle you are using not support this "
                    "operation. Please set fuse_matmul_bias=False or use "
                    "paddle compiled with cuda 11.6 or higher."
                )
            from paddle.incubate.nn.functional import fused_linear

            self.linear = fused_linear

    def forward(self, x):
        def _overlap_linear():
            # async-allreduce path: the c_identity/allreduce handling is
            # done inside the custom PyLayer
            return InnerOverlapLinear.apply(
                x,
                self.weight,
                self.bias,
                self.fuse_matmul_bias,
                self.mp_async_allreduce,
                self.mp_skip_c_identity,
                self.mp_fused_linear_param_grad_add,
                self.model_parallel_group,
            )

        if self.mp_async_allreduce:
            output_parallel = _overlap_linear()
        else:
            if self.is_mp:
                input_parallel = mp_ops._c_identity(
                    x,
                    group=self.model_parallel_group,
                    skip_c_identity_dynamic=self.mp_skip_c_identity,
                )
            else:
                input_parallel = x
            output_parallel = self.linear(
                input_parallel, self.weight, self.bias, name=self._name
            )

        if self.gather_output and self.is_mp:
            output = mp_ops._c_concat(
                output_parallel, group=self.model_parallel_group
            )
        else:
            output = output_parallel
        return output


class MPScale(PyLayer):
    @staticmethod
    def forward(ctx, x, mp_degree):
        # scale by 1/mp_degree so that the bias is effectively added once
        # after the partial results are summed across the mp group
        out = paddle.scale(x, 1.0 / mp_degree)
        return out

    @staticmethod
    def backward(ctx, dout):
        return dout
class RowParallelLinear(paddle.nn.Layer):
    """Linear layer with mp parallelized (row).
    This class is used for splitting a Linear layer in the mp group; the weight of the Linear layer is split by row.

    Args:
        in_features(int): The number of input units.
        out_features(int): The number of output units.
        weight_attr(ParamAttr|None): The attribute for the learnable weight of this layer. The default value is None,
            which means the default weight parameter property is used. For detailed information, please refer to paddle.ParamAttr.
        has_bias(bool): whether to add bias.
        input_is_parallel(bool): whether the input has already been split across the mp group.
        fuse_matmul_bias(bool): whether to fuse matmul and bias.
        mp_group(Group): The tensor parallel group.
        name(str, optional): Normally there is no need for the user to set this parameter.
            For detailed information, please refer to :ref:`api_guide_Name` .

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> from paddle.distributed import fleet

            >>> class SimpleMPNet(paddle.nn.Layer):
            ...     def __init__(self, vocab_size, hidden_size, inner_size, output_size):
            ...         super().__init__()
            ...         self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
            ...             hidden_size,
            ...             inner_size,
            ...             gather_output=False,
            ...             has_bias=True)
            ...         self.linear2 = fleet.meta_parallel.RowParallelLinear(
            ...             inner_size,
            ...             hidden_size,
            ...             input_is_parallel=True,
            ...             has_bias=True)
            ...         self.linear3 = paddle.nn.Linear(hidden_size, output_size)
            ...         self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
            ...                         vocab_size,
            ...                         hidden_size)
            ...     def forward(self, x):
            ...         x = self.embedding(x)
            ...         x = self.linear1(x)
            ...         x = self.linear2(x)
            ...         x = self.linear3(x)
            ...         return x

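        The row-parallel decomposition relies on the fact that a matmul can be
        computed as a sum of per-shard partial products, which are combined
        with an allreduce. A single-process sketch of that arithmetic (shapes
        chosen only for illustration; the real layer uses collectives):

        .. code-block:: python

            >>> import paddle
            >>> x = paddle.randn([4, 6])
            >>> w = paddle.randn([6, 8])
            >>> x0, x1 = paddle.split(x, 2, axis=1)  # input split by column
            >>> w0, w1 = paddle.split(w, 2, axis=0)  # weight split by row
            >>> partial = paddle.matmul(x0, w0) + paddle.matmul(x1, w1)
            >>> assert paddle.allclose(paddle.matmul(x, w), partial)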
    """

    def __init__(
        self,
        in_features,
        out_features,
        weight_attr=None,
        has_bias=True,
        input_is_parallel=False,
        fuse_matmul_bias=False,
        mp_group=None,
        name=None,
    ):
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.input_is_parallel = input_is_parallel
        self._weight_attr = weight_attr
        self._dtype = self._helper.get_default_dtype()
        self._name = name

        self.model_parallel_group = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group()
            if mp_group is None
            else mp_group
        )
        self.world_size = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size()
            if mp_group is None
            else mp_group.nranks
        )
        self.rank = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank()
            if mp_group is None
            else mp_group.rank
        )
        self.is_mp = self.world_size > 1

        mp_configs = fleet.fleet._user_defined_strategy.hybrid_configs[
            "mp_configs"
        ]
        self.mp_async_allreduce = self.is_mp and mp_configs.mp_async_allreduce
        self.mp_skip_c_identity = (
            self.is_mp
            and mp_configs.mp_async_allreduce
            and mp_configs.mp_skip_c_identity
        )
        self.mp_fused_linear_param_grad_add = (
            self.is_mp
            and mp_configs.mp_async_allreduce
            and mp_configs.mp_fused_linear_param_grad_add
        )
        if (
            self.mp_async_allreduce
            or self.mp_skip_c_identity
            or self.mp_fused_linear_param_grad_add
        ):
            assert paddle.in_dynamic_mode(), (
                "mp_async_allreduce, mp_skip_c_identity and "
                "mp_fused_linear_param_grad_add are only available "
                "under dygraph mode"
            )

        assert in_features % self.world_size == 0, (
            f"Number of row of the weight for linear ({in_features}) "
            f"must be divisible by model parallel size ({self.world_size})"
        )
        self.input_size_per_partition = in_features // self.world_size

        if self.is_mp and paddle.in_dynamic_mode():
            with get_rng_state_tracker().rng_state():
                self.weight = self.create_parameter(
                    shape=[self.input_size_per_partition, self.out_features],
                    attr=self._weight_attr,
                    dtype=self._dtype,
                    is_bias=False,
                )
        else:
            self.weight = self.create_parameter(
                shape=[self.input_size_per_partition, self.out_features],
                attr=self._weight_attr,
                dtype=self._dtype,
                is_bias=False,
            )

        self.weight.is_distributed = True if self.is_mp else False
        if self.weight.is_distributed:
            # the weight is sharded along axis 0 (the input axis)
            self.weight.split_axis = 0

        if has_bias:
            # the bias is not sharded: it is added once after the allreduce
            self.bias = self.create_parameter(
                shape=[self.out_features],
                attr=paddle.nn.initializer.Constant(value=0.0),
                dtype=self._dtype,
                is_bias=True,
            )
        else:
            self.bias = None

        self.linear = F.linear

        if fuse_matmul_bias:
            if not is_fused_matmul_bias_supported():
                raise NotImplementedError(
                    "You set fuse_matmul_bias=True in RowParallelLinear, "
                    "however, the paddle you are using not support this "
                    "operation. Please set fuse_matmul_bias=False or use "
                    "paddle compiled with cuda 11.6 or higher."
                )
            from paddle.incubate.nn.functional import fused_linear

            self.linear = fused_linear
        self.fuse_matmul_bias = fuse_matmul_bias

    def forward(self, x):
        if self.input_is_parallel or (not self.is_mp):
            input_parallel = x
        else:
            # split the last dimension of the input across the mp group
            input_parallel = mp_ops._c_split(
                x, group=self.model_parallel_group
            )

        if self.is_mp:
            if self.fuse_matmul_bias:
                # scale the bias by 1/world_size so that, after the fused
                # linear and the allreduce, the bias is added exactly once
                bias = MPScale.apply(self.bias, self.world_size)
                output_parallel = self.linear(
                    input_parallel, self.weight, bias, name=self._name
                )
                output = mp_ops._mp_allreduce(
                    output_parallel,
                    group=self.model_parallel_group,
                    use_calc_stream=True,
                    use_model_parallel=True,
                    skip_c_identity_dynamic=self.mp_skip_c_identity,
                )
            else:
                output_parallel = self.linear(
                    input_parallel, self.weight, name=self._name
                )
                output_ = mp_ops._mp_allreduce(
                    output_parallel,
                    group=self.model_parallel_group,
                    use_calc_stream=True,
                    use_model_parallel=True,
                    skip_c_identity_dynamic=self.mp_skip_c_identity,
                )
                output = (
                    output_ + self.bias if self.bias is not None else output_
                )
        else:
            output = self.linear(
                input_parallel, self.weight, self.bias, name=self._name
            )
        return output


class ParallelCrossEntropy(paddle.nn.Layer):
    """CrossEntropy with mp parallelized.
    This class is used for splitting the softmax cross entropy computation in the mp group.

    Args:
        mp_group(Group): The tensor parallel group.
        name(str, optional): Normally there is no need for the user to set this parameter.
            For detailed information, please refer to :ref:`api_guide_Name` .
        ignore_index (long int, optional):  Specifies a target value that is ignored and
            does not contribute to the loss. A negative value means that no label value
            needs to be ignored. Default is -100.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No img to demonstrate')
            >>> from paddle.distributed.fleet.layers.mpu import ParallelCrossEntropy
            >>> loss_func = ParallelCrossEntropy()
            >>> loss = loss_func(img, label)

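        A sketch of the intended pairing with a vocab-parallel output layer
        (the shapes and the ``logits``/``labels`` names are assumptions for
        illustration):

        .. code-block:: python

            >>> # doctest: +SKIP('requires a distributed launch')
            >>> # logits: [batch, seq_len, vocab_size // mp_degree] on each rank
            >>> # labels: [batch, seq_len, 1] holding global class ids
            >>> loss_fn = ParallelCrossEntropy()
            >>> loss = loss_fn(logits, labels)
            >>> loss.mean().backward()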
    """

    def __init__(self, mp_group=None, name=None, ignore_index=-100):
        super().__init__()
        self.name = name
        self.model_parallel_group = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group()
            if mp_group is None
            else mp_group
        )
        self.world_size = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size()
            if mp_group is None
            else mp_group.nranks
        )
        self.rank = (
            tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank()
            if mp_group is None
            else mp_group.rank
        )
        self.ignore_index = ignore_index

    def forward(self, input, label):
        loss = mp_ops._c_softmax_with_cross_entropy(
            input,
            label,
            group=self.model_parallel_group,
            ignore_index=self.ignore_index,
        )
        return loss