o
    "jS                     @   s   d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZ d dlmZ ddlmZ dd	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZmZmZ g dZG dd dZdS )    N)PassManagernew_pass)	get_flags)append_backwardprogram_guard)unique_name   )
get_logger   )init_auto_parallel_rng   )Partitioner)get_world_process_group)	Resharder)get_pp_stageis_sequential_runuse_new_executor)Zfused_gemm_epilogue_passZ fused_linear_param_grad_add_passZfused_dropout_add_passc                   @   sd   e Zd Zdd Zedd Zedd Zddd	Zdd
dZ	dddZ	dd Z
dd Zdd ZdS )Parallelizerc                 C   sB   || _ || _|| _| jjsJ | jj| _| jj| _tt	j
| _d S N)_mode
_completer_dist_contextZ_is_initializedZpass_context_pass_contextZstrategy	_strategyr	   loggingINFO_logger)selfmodeZ	completerdist_context r    x/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/auto_parallel/static/parallelizer_v2.py__init__(   s   

zParallelizer.__init__c                 C   s
   | j dkS )Ntrainr   r   r    r    r!   is_train1      
zParallelizer.is_trainc                 C   s
   | j dv S )N)evalZpredictr$   r%   r    r    r!   is_test5   r'   zParallelizer.is_testNc                 C   s&   t  }|j}|D ]}| || qd S r   )r   Zranksparallel)r   parameter_listZworld_process_groupZ	all_ranksrankr    r    r!   parallel_all9   s
   zParallelizer.parallel_allc                 C   s  | j j}| j j}| j j}| jr|r| j j}| ||||}t }| |||||\}}}| j	
dt | | j t }t| j |}	|	|||\}
}}t  | j	
dt | | j t }| |
||| | j	
dt | | j t|
||| j |}|  | j	
dt | | j t }| |
||| | j	
dt | | j nt }| ||d d g \}}}| j	
dt | | j t }t| j |}	|	||g \}
}}| j	
dt | | j t }| jjjsdn| jjj}t|
||| j g |}|  | j	
dt | | j t }| |
||| | j	
dt | | j | jrV|
j}|
jdd}
|jdd}||
_|
| j j|< || j j|< d S )	Nz8within parallel apply_pre_optimization time: {}, mode {}z-within parallel partitioner time: {}, mode {}z+within parallel optimizer time: {}, mode {}z)within parallel reshard time: {}, mode {}z9within parallel apply_post_optimization time: {}, mode {}r   T)Zfor_test)r   serial_main_programserial_startup_programserial_optimizerr&   serial_loss_generate_backwardtime_apply_pre_optimizationr   debugformatr   r   	partitionr   _generate_optimizerr   reshard_apply_post_optimizationr   pipelineenableZmicro_batch_sizer)   _pipeline_optcloneZdist_main_programsZdist_startup_programs)r   r,   r+   r.   r/   r0   r1   params_gradsZtime0partitionerZdist_main_progZdist_startup_progZdist_params_gradsZ	resharderZ	micro_bszZpipeline_optr    r    r!   r*   A   s  

zParallelizer.parallelc                 C   sf   t || t||| jjd}W d    n1 sw   Y  | j| | jj| | j| |S )N)r+   Zdistop_context)	r   r   r   Zdist_op_contextr   Zcomplete_backward_annotationZblock_stateZparse_backward_blocksZ_complete_var_chunk_id)r   main_programstartup_programlossr+   r?   r    r    r!   r2      s   	zParallelizer._generate_backwardc              	   C   s   |j }t|}|| j_|| jj_ d|_t||" td |	|}W d    n1 s/w   Y  W d    n1 s>w   Y  | j
| |S )NFZopt_)Z_learning_ratecopydeepcopyr   Z_serial_optimizerZ_sortedr   r   guardZapply_gradientsr   Zcomplete_update_annotation)r   rA   rB   	optimizerr?   Zlearning_rateZoptimizer_opsr    r    r!   r8      s   

z Parallelizer._generate_optimizerc                 C   s  | j d u rd S | j jjrzt| j j }| j|d< ||d< ||d< | jjd | jjd  |d< | j	d
|d |d	  |d	 d
krXtd|}||g|g| j | }n"|d	 dv rv||d< td|}||g|g| j | }ntd| jr| j jjrt| j j }| j|d< ||d< | j|d< ||d< td|}	|	|g|g| j | jd}| jd}| jd}| jd}| jr| j jjrt| j j }| j|d< d |d< ||d< td|}
|
|g|g| j |||fS )Nr   r?   rC   ZinputslabelsZ
input_datazApplying AMP-{}-{} ...ZdtypelevelZo1Zauto_parallel_amp)o2Zo3Zbase_optZauto_parallel_fp16z%AMP level should be one of o1, o2, o3r   Zauto_parallel_quantizationrA   rB   Zno_grad_setZauto_parallel_recompute)r   ampr<   rD   rE   to_dictr   Zserial_feed_varsr   infor6   r   applyr   Zget_loss
ValueErrorr&   Zqatr   get_attrZ	recompute)r   rA   rB   rC   rG   r?   configZauto_parallel_amp_passZauto_parallel_fp16_passZauto_parallel_quantization_passZauto_parallel_recompute_passr    r    r!   r4   	  st   












z$Parallelizer._apply_pre_optimizationc                 C   s  | j d u rd S | j jjr-t| j j }| j|d< ||d< td|}||g|g| j	 | j
r| j jjr| j jjrt| j jjdkrd| j jjv rd }| j jjrZt| j j }i }| j|d< ||d< | j jj|d< ||d< |d urx|d nd	|d
< td|}||g|g| j	 | j jjrt| j j }| j|d< ||d< | j jj|d< td|}	|	|g|g| j	 | j jjrt| j j }| j|d< ||d< ||d< td|}
|
|g|g| j	 | j	d}| j jjrttdddkr| jd tdi }||g|g| j	 | j
r6t| j j }| j|d< ||d< ||d< td|}||g|g| j	 t sPi }| j|d< td|}||g|g| j	 | j
rl| j jjrld| j j_| j jj| j j_d| j j_| j
r| j jjrt| j j }| j|d< ||d< td|}||g|g| j	 | j jjrt  st| j j }| j|d< td|}||g|g| j	 t!dd }g }| j
r| j jjrt| j jjdkrg }| j jjD ]}|t"v r|r|#| q|#t| qt$|}||g|g i |_%||j%d< | j
rY| j jjr[t  r]| j jj&}|r6ttdddkr6| jd i |_'|| j jj(| j jjt| jj)t*| j|| jj+| jd|j'd< d S d S d S d S )Nr   Zglobal_rankZ,auto_parallel_sequence_parallel_optimizationr   Zfuse_gemm_epilogueZ	enable_spr?   rI   Zo0Z	amp_levelZ$auto_parallel_fused_linear_promotionZuse_shardingZ(auto_parallel_data_parallel_optimizationZauto_parallel_shardingZCUDA_DEVICE_MAX_CONNECTIONS0r   zYou set mp_optimization.allreduce_matmul_grad_overlapping=True, but you did not set environment variable CUDA_DEVICE_MAX_CONNECTIONS=1, which may leads to performance loss. Try to export CUDA_DEVICE_MAX_CONNECTIONS=1 for better performance.!allreduce_matmul_grad_overlappingZrank_idZauto_parallel_grad_clipZ.auto_parallel_supplement_explicit_dependenciesT!auto_parallel_gradient_merge_passZauto_parallel_pipelineZFLAGS_enable_pir_in_executorZ	pass_listzYou set pipeline.enable_send_recv_overlap=True, but you did not set environment variable CUDA_DEVICE_MAX_CONNECTIONS=1, which may leads to performance loss. Try to export CUDA_DEVICE_MAX_CONNECTIONS=1 for better performance.)enable_send_recv_overlapschedule_modeZnum_micro_batchesZ	pp_degreeZpp_stageZ
vpp_degreer   Zstandalone_opt),r   Zsp_optimizationr<   rD   rE   rL   r   r   rN   r   r&   Zfused_linear_promotionZfused_passeslenZfused_passes_listrK   Zdp_optimizationZshardingrP   Zmp_optimizationrS   intosgetenvr   warningr   r;   Zgradient_mergeZaccumulate_stepsZk_stepsZavgr   r   NEW_IR_PASSappendr   Z	_pass_optrU   r=   rV   Zprocess_meshesr   Z_num_model_chunks)r   rA   rB   r,   r?   rQ   Zsp_passZ
amp_configZfused_linear_promotion_passZdp_passZauto_parallel_sharding_passZ&allreduce_matmul_grad_overlapping_passZauto_parallel_clip_passZ
APSED_passrT   Zauto_parallel_pipeline_passZ	enable_irZir_pass_listZnew_pass_listpZpass_managerrU   r    r    r!   r:   Q  s.  

















z%Parallelizer._apply_post_optimizationr   )__name__
__module____qualname__r"   propertyr&   r)   r-   r*   r2   r8   r4   r:   r    r    r    r!   r   '   s    	



 "
Hr   )rD   r   rY   r3   Zpaddle.distributed.passesr   r   Zpaddle.frameworkr   Zpaddle.staticr   r   Zpaddle.utilsr   Zutils.log_utilsr	   randomr   r@   r   Zprocess_groupr   r9   r   utilsr   r   r   r\   r   r    r    r    r!   <module>   s    