o
    "jw:                     @   s  d dl Z d dl mZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d	d
 Zdd Zdd ZG dd deZG dd deZG dd deZG dd deZdd Zdd Zdd Zdd Zdd  Zd!d" ZG d#d$ d$eZG d%d& d&eZG d'd( d(eZ dS ))    N)distributed)PyLayer)core)fleet)get_rng_state_tracker$fused_allreduce_gradients_with_group)Layer
functionalc                 C   sr   t  }| }|j}|j}| jd }|| dks!J d|||| }tj| dg|| g||d  gd} | S )Nr   LInput sequence length {} can't be divided exactly by sequence parallelism {}   )ZaxesZstartsZends)	r   get_hybrid_communicate_groupget_model_parallel_groupnranksrankshapeformatpaddleslice)inputhcggroupparallelismr   Zseq_leninterval r   w/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/fleet/utils/sequence_parallel_utils.pyscatter&   s   
r   c                 C   sR   t  }| }|j}| j}|d | |d< tj|| jd}|j	| |
  |S )Nr   r   dtype)r   r   r   r   r   r   emptyr   process_group
all_gatherwaitr   r   r   r   Zoutput_shapeoutputr   r   r   r!   8   s   r!   c                 C   s   t  }| }|j}| j}| jd | dks"J d| jd ||d | |d< tj|| jd}t	j
j|| t	jj|dd |S )Nr   r   r   T)opr   Zsync_op)r   r   r   r   r   r   r   r   r   diststreamreduce_scatterZReduceOpZSUMr#   r   r   r   r(   C   s   
r(   c                   @   $   e Zd Zedd Zedd ZdS )	ScatterOpc                 C      t |S Nr   ctxr   r   r   r   forwardX      zScatterOp.forwardc                 C   r+   r,   r!   r/   gradr   r   r   backward\   r1   zScatterOp.backwardN__name__
__module____qualname__staticmethodr0   r5   r   r   r   r   r*   U   
    
r*   c                   @   r)   )GatherOpc                 C   r+   r,   r2   r.   r   r   r   r0   d   r1   zGatherOp.forwardc                 C   r+   r,   r-   r3   r   r   r   r5   h   r1   zGatherOp.backwardNr6   r   r   r   r   r<   a   r;   r<   c                   @   r)   )AllGatherOpc                 C   r+   r,   r2   r.   r   r   r   r0   r   r1   zAllGatherOp.forwardc                 C   r+   r,   r(   r3   r   r   r   r5   x   r1   zAllGatherOp.backwardNr6   r   r   r   r   r=   o   
    
r=   c                   @   r)   )ReduceScatterOpc                 C   r+   r,   r>   r.   r   r   r   r0      r1   zReduceScatterOp.forwardc                 C   r+   r,   r2   r3   r   r   r   r5      r1   zReduceScatterOp.backwardNr6   r   r   r   r   r@      r?   r@   c                 C   s
   d| _ d S )NT)sequence_parallelZ	parameterr   r   r   #mark_as_sequence_parallel_parameter   s   
rC   c                 C   s   t | ddS )NrA   F)getattrrB   r   r   r   is_sequence_parallel_parameter   s   rE   c                    s8   t  }| dg t9   fdd}|S )Nr   c                    s6   d  d7  < d  krdd< t dd | S )Nr   r         ?)r   scaler   )r4   accumulation_stepsr   parameter_liststepr   r   __impl__   s   z6create_fused_allreduce_gradient_hook.<locals>.__impl__)r   r   r   len)rJ   rI   r   rL   r   rH   r   $create_fused_allreduce_gradient_hook   s   	rN   c                    s8   t  }| jdgtj  fdd}|S )Nr   c                      sV   d  d7  < d   dkr)t drj  d S j  d S d S )Nr   r   	main_grad)hasattrZ	allreducerO   r"   r4   r   rI   paramZpgrK   r   r   rL      s   
z:create_non_fused_allreduce_gradient_hook.<locals>.__impl__)r   r   r   r    r   ZautogradZno_grad)rR   rI   r   rL   r   rQ   r   (create_non_fused_allreduce_gradient_hook   s   
rS   c                 C   s   |dks	t j sd S t  }|jdkrd S g }|  D ]}t|r)|	| q|r=t
||}|D ]}|| q3d S |D ]}t||}|| q?d S )Nr   r   )r   r   Zis_initializedr   r   r   r   
parametersrE   appendrN   Z_register_backward_hookrS   )modelrI   Z fuse_sequence_parallel_allreducemp_groupparamsphookr   r   r   *register_sequence_parallel_allreduce_hooks   s*   


r[   c                   C   s,   t  rt  rt  rttjjjdS dS )NZfused_gemm_epilogueF)	r   Zis_compiled_with_cudaZis_compiled_with_rocmZis_compiled_with_xpurP   r   eageropslegacyr   r   r   r   is_fused_matmul_bias_supported   s   r_   c                       s2   e Zd Z						d fdd	Zdd Z  ZS )	ColumnSequenceParallelLinearNTFc	                    s  t    t }	|d u r|	 n|| _|d u r|	 jn|j| _|| _| jdk| _	|du s2J d|| _
|| j dksHJ d| d| j d|| j | _|| _| j | _| j	rt rt   | j|| jg| j| jdd| _W d    n1 s}w   Y  n| j|| jg| j| jdd| _| j	rd	nd| j_|r| j| jgtjjjd
d| jd	d| _| j	rd	nd| j_nd | _tj| _|rt stdddlm }
 |
| _d S d S )Nr   Fz\If sequence_parallel is True,                                         gather_output is Falser   z+Number of column of the weight for linear (,) must be divisible by model parallel size ()r   attrr   Zis_biasT        valuezYou set fuse_matmul_bias=True in ColumnSequenceParallelLinear, however, the paddle you are using not support this operation. Please set fuse_matmul_bias=False or use paddle compiled with cuda 11.6 or higher, or use xpu version.fused_linear)!super__init__r   r   r   model_parallel_groupr   
world_size_nameis_mpgather_outputZoutput_size_per_partition_weight_attr_helperget_default_dtype_dtyper   in_dynamic_moder   	rng_statecreate_parameterweightis_distributednninitializerConstantbiasFlinearr_   NotImplementedErrorpaddle.incubate.nn.functionalri   )selfin_featuresout_featuresweight_attrhas_biasrp   fuse_matmul_biasrW   namer   ri   	__class__r   r   rk      sr   




z%ColumnSequenceParallelLinear.__init__c                 C   s2   | j r	t|}n|}| j|| j| j| jd}|S N)r   )ro   r=   applyr   rx   r}   rn   )r   xinput_parallelr$   r   r   r   r0   ;  s   z$ColumnSequenceParallelLinear.forward)NNTFNNr7   r8   r9   rk   r0   __classcell__r   r   r   r   r`      s    Tr`   c                   @   r)   )MPScalec                 C   s   t |d| }|S )NrF   )r   rG   )r/   r   Z	mp_degreeoutr   r   r   r0   J  s   zMPScale.forwardc                 C   s   |S r,   r   )r/   Zdoutr   r   r   r5   O  s   zMPScale.backwardNr6   r   r   r   r   r   I  s
    
r   c                       s2   e Zd Z						d fdd	Zdd Z  ZS )	RowSequenceParallelLinearNTFc	                    s  t    || _|| _|du sJ d|| _|| _| j | _|| _	t
 }	|d u r.|	 n|| _|d u r:|	 jn|j| _|d u rG|	 jn|j| _| jdk| _|| j dksdJ d| d| j d|| j | _| jrt rt   | j| j| jg| j| jdd	| _W d    n1 sw   Y  n| j| j| jg| j| jdd	| _| jrdnd| j_|r| j| jgtjjjd
d| jdd	| _| jrt| j nd | _tj | _ d | _!|rt" st#dddl$m%}
 |
| _ | jr|rt&j'| _!d S d S d S d S )NTzjIf sequence_parallel is True,                                            input_is_parallel should be true.r   r   z(Number of row of the weight for linear (ra   rb   Frc   re   rf   zYou set fuse_matmul_bias=True in RowParallelLinear, however, the paddle you are using not support this operation. Please set fuse_matmul_bias=False or use paddle compiled with cuda 11.6 or higher.rh   )(rj   rk   r   r   input_is_parallelrq   rr   rs   rt   rn   r   r   r   rl   r   rm   r   ro   Zinput_size_per_partitionr   ru   r   rv   rw   rx   ry   rz   r{   r|   r}   rC   r~   r   mp_scaler_   r   r   ri   r   r   )r   r   r   r   r   r   r   rW   r   r   ri   r   r   r   rk   U  s   








z"RowSequenceParallelLinear.__init__c                 C   s   |}| j r9| jd ur| | j| j}nd }| j|| j|| jd}t|}|d u r5| jd ur5|| j }|S |}|S | j|| j| j| jd}|S r   )	ro   r   r}   rm   r   rx   rn   r@   r   )r   r   r   r}   Zoutput_parallelZoutput_r$   r   r   r   r0     s$   


z!RowSequenceParallelLinear.forward)NTFFNNr   r   r   r   r   r   T  s    ar   )!r   r   r&   Zpaddle.autogradr   Zpaddle.baser   Zpaddle.distributedr   Z&paddle.distributed.fleet.meta_parallelr   Z3paddle.distributed.fleet.utils.hybrid_parallel_utilr   Z	paddle.nnr	   r
   r~   r   r!   r(   r*   r<   r=   r@   rC   rE   rN   rS   r[   r_   r`   r   r   r   r   r   r   <module>   s0   c