o
    "je                     @   s   d dl Z d dlZd dlmZmZmZmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ ddlmZ ddlmZmZmZmZmZmZmZmZ dd	lmZ d
dlmZmZ ee jZ G dd deZ!dd Z"dddZ#dd Z$dddZ%edG dd deZ&dS )    N)ProgramStats_append_grad_suffix__find_op_path__get_no_grad_set_name_rename_arg_)OP_ROLE_KEYOpRole)core)unique_name   )OperatorDistAttr)get_loss_opinsert_dependencies_for_two_opsis_backward_opis_recompute_exclude_opis_recompute_op6naive_set_dist_op_attr_for_program_by_mesh_and_mappingset_dist_op_desc_original_idset_var_dist_attr)
get_logger   )PassBaseregister_passc                       sX   e Zd Z fddZedd Zedd Zdd Zd	d
 Zg fddZ	dd Z
  ZS )RecomputeStatec                    s&   t  j||d i | _g | _g | _d S )N)blockops)super__init__seg_op_deps_checkpoints_reserved_vars)selfr   r   	__class__ r/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/passes/auto_parallel_recompute.pyr   /   s   
zRecomputeState.__init__c                 C      | j S N)r   r!   r$   r$   r%   checkpoints5      zRecomputeState.checkpointsc                 C   r&   r'   )r    r(   r$   r$   r%   reserved_vars9   r*   zRecomputeState.reserved_varsc                 C   s   t dd | jD S )Nc                 s   s    | ]}t |V  qd S r'   )r   .0opr$   r$   r%   	<genexpr>>   s    z.RecomputeState.is_recompute.<locals>.<genexpr>)anyr   r(   r$   r$   r%   is_recompute=   s   zRecomputeState.is_recomputec                 C   s\  t | jD ]\}}t|r d S |jD ]'}|| jv r&| j| d |g qi | j|< |g| j| d< g | j| d< q|jD ]'}|| jv rQ| j| d |g q>i | j|< g | j| d< |g| j| d< q>t|sv| j|j t	|svq|
d}d|vr|n|d d }|| jvr|g| j|< q| j| d d |ksJ d| j| |g qd S )	NZvar_as_input_opsZvar_as_output_opsop_namescope_exclude_rcir   z0The recompute segment's ops should be continuous)	enumerater   r   input_arg_namesZvar_op_depsextendoutput_arg_namesr   r   r   attrr   )r!   ir.   nameZseg_namer$   r$   r%   build_states@   s:   







zRecomputeState.build_statesc                 C   s   g }| j  D ]"}t|dkrq||d |d d g | j| j|d  j qt|ddD ]}|t|k sBJ d	|t||
| q0|S )Nr   r   r4   T)reversezMthe no_recompute_segments idx [{}] should be lower the number of segment [{}])r   valueslenappendr   r7   r   r8   sortedformatpop)r!   no_recompute_segmentssegmentsZsegment_idxr:   r$   r$   r%   get_recompute_segmentsf   s    z%RecomputeState.get_recompute_segmentsc                 C   s  dd | j D }d|vrd|vrdS d}|t| j k r| j | }d|jv r&n|jdkr7| j|j |d	7 }q|jd
vrA|d	7 }q|jdkrHdnd}||dur]t||r]|d	7 }q||}t	d}t
d|dg}| j| | jj|dtjjjddd}	dg}
|j}t||	|
|}|ddu rdnt|d}| jj|jdi d|	i|ddd}|d|d t|||
| | j || |j||g |jdd |jdd ||	j| |d7 }|t| j k s| j   dS )z
        If program's foward part has 'dropout' op, this function will insert
        a seed op before it to guarantee that two dropout op have the same outputs.
        c                 S   s   g | ]}|j qS r$   typer,   r$   r$   r%   
<listcomp>}   s    zDRecomputeState.modify_forward_desc_for_recompute.<locals>.<listcomp>dropoutfused_dropout_addNr   Zgradseedr   )rJ   rK   Zseed_tensorZSeedZrc_seed.tmpZint32F)r;   dtyperH   persistablestop_gradientr4   Zfix_seedZOutT)rL   Z	force_cpu)indexrH   ZinputsZoutputsattrsr2   r   )!r   r?   rH   r    r7   r8   inputget_op_dist_attr_for_programr
   generateZgenerate_with_ignorable_keyjoinr@   r   
create_varr	   ZVarDescZVarTypeZ
LOD_TENSORprocess_meshr   r9   int_insert_op_without_syncidx	_set_attrr   insertdescZ	set_inputset_input_dist_attrr;   _sync_with_cpp)r!   dist_contextZop_typesZop_idxcur_opZseed_tensor_namecur_op_dist_attrZop_unique_nameZvar_unique_nameZseed_varref_dims_mappingref_process_meshZseed_var_dist_attrrL   Zseed_opr$   r$   r%   !modify_forward_desc_for_recomputex   s   





	Iz0RecomputeState.modify_forward_desc_for_recompute)__name__
__module____qualname__r   propertyr)   r+   r1   r<   rF   rg   __classcell__r$   r$   r"   r%   r   .   s    

&r   c                 C   s2   t | j D ]}|j| j|kr|  S qdS )Nr4   )ranger_   Zop_sizer.   )r   rc   r\   r$   r$   r%   _find_op_index   s
   rn   c                 C   sh   |du rt  }nt|}t  }|  D ]}d|jv r n|jr'|t|j q|tt	t| |S )zget no grad varNz@GRAD)
setr   Z	list_varsr;   rQ   addr   updatelistmap)programno_grad_setno_grad_set_namevarr$   r$   r%   _get_stop_gradients   s   
rx   c           
      C   s   t | dkrg S g }| D ]N}t|tjjr|j}t|tr!|d }d}| D ]}||r5|	|j
r5q'||vr;d}q'|rZ|j }	|	| t|	|| |	ttj ||	 q|S )zD
    Get the recomputed ops which will insert the backward part
    r   FT)r?   
isinstancepaddleZstaticOperatorr_   tupler8   Zhas_varrw   rP   Z	append_op	copy_fromr   r]   r   r   ZBackwardr@   )
Zdescsr   
main_blockvars_should_be_holdrb   Zresult_descsr_   Z	is_neededr;   Znew_op_descr$   r$   r%   _add_needed_descs_to_block   s.   



r   c                 C   s"   t | |}t|  |gg |}|S r'   )rx   r   global_block)main_programlossru   rv   op_pathr$   r$   r%   _find_op_path  s
   
r   Zauto_parallel_recomputec                       sN   e Zd Z fddZdd Zdd Zddd	Zd
d Zdd Zdd Z	  Z
S )RecomputePassc                    s>   t    | dd  | dd  | dd  | dg  d S )Nr   rb   ru   rD   )r   r   Zset_attrr(   r"   r$   r%   r     s
   
zRecomputePass.__init__c                 C   s(   |  dd u r	dS |  dd u rdS dS )Nrb   Fr   T)get_attrr(   r$   r$   r%   _check_self  s
   zRecomputePass._check_selfc                 C   s   dS NTr$   )r!   Z
other_passr$   r$   r%   _check_conflict  s   zRecomputePass._check_conflictr   c                 C   s   dd }t |}dd t|D }dd t|D }d}d}	d}
t|D ]S\}}|
| |k r6|	d7 }	|| |t |d k rN|jj||d  jjkrN|
d7 }
|
| |k rUq$t|D ]\}}|jj|krv|d7 }|| | || |j qYq$t ||	| ksJ d|	| t |||fS )	zf
        Get ops and op_names of each process mesh excluding ops within the first "sr" chunks
        c                 S   s$   t | st| r| dd d S d S )Nr2    )r   r   r]   )r.   r$   r$   r%   reset_recomupte_op%  s   z<RecomputePass.get_ops_per_device.<locals>.reset_recomupte_opc                 S      g | ]}g qS r$   r$   r-   _r$   r$   r%   rI   *      z4RecomputePass.get_ops_per_device.<locals>.<listcomp>c                 S   r   r$   r$   r   r$   r$   r%   rI   +  r   r   r   z~The sum of pushed_ops_count and reset_ops_count must be the same as lenght of ops, but the sum is {} while lenght of ops is {})r?   rm   r5   	dist_attrrY   r@   rH   rB   )r!   r   all_ops_process_meshssrr   Zall_process_meshes_countZops_of_stagesZop_names_of_stagesZpushed_ops_countZreset_ops_countZchunk_idop_idr.   idrY   r$   r$   r%   get_ops_per_device   s>   z RecomputePass.get_ops_per_devicec           H      C   s  |  d}|  d}|  d}|  d| _|  dd| _|  dg | _| }t|||}g }	|D ]}
|
jj|	vr@|	|
jj q1| 	||	| j\}}t
|}dd	 |D }| jD ]j}|d
 }|dkre|n|}|d }|d }|d }t
|}t
|}|| | }t
|}t|D ]9\}}d}t
|}t|| d D ]&}||||  |kr||k r|d7 }|| tt|| || |  qqqYtd|  t|D ]&\}}|D ]}t|| | r|| | d}|| | d|d  qqt||} |  sd S | | j |   | |}!|!g krd S t|!D ]I\}\}"}#td|d  dt
|! d td| j|" j| j|" j| j|" j td| j|#d  j| j|#d  j| j|#d  j qg }$|!D ]}%|$| |%d |%d  qkt |$t | j! }&tdt
|&|& |$| j" |$| #  tt |$t | j!B }$i }'i }(|j$% })t|!d d d D ]\}}%||%d |%d  }*d| }+|*D ]p}
g },|,|
j |,|
j | j&|
}-|-d usJ |,D ]O}.|'|.j(s|.|$v rq|.|'vr;|-j}/|.|
jv r|-)|.}0n|-*|.}0|.|+ |'|.< |'|.}1|j+|'|. |1j,|1j-|1j|1j(|1j.d}2t/| j|2|0|/ qqt0|*|)||$| j}3|'D ]}4t1|3|4|'|4  qJ||%d d  }5d|3g|(|5j23 < q|j}6t4|}7t5||7}8| jj6}9|8dksJ tt
|6d |8dD ]}|6| }:g },|,|:j |,|:j |'D ]};|;|,vrq| 7|:|' t1|:j2g|;|'|;  q|:j23 }<|<|9j8v ru|9j8|< }=|=|(v ru|(|= d ru|:j9}>|>d dkr |6|>d  jdkr |>d8 }>|>d dkr |6|>d  jdks|(|= d }3d }?t:tt|3D ]1\}@}A|j;|>dd}?|?j2}B|B<|A |B=|B>  | j?|A3 }C|Cd us9J | @|?|C|' qd|(|= d< |?ru|j|?j9d  }D|?}E| j&|Dj}F| j&|Ej}G|F|GkrutA||>|D|E| jddd d! q|B  d S )"Nr   ru   rD   rb   r   r   Zrefined_ops_patternsc                 S   r   r$   r$   r   r$   r$   r%   rI   `  r   z4RecomputePass._apply_single_impl.<locals>.<listcomp>nummain_opspre_opssuf_opsr   z,The excluded ops in recompute segments are:
r2   r3   zrecompute segment[/]z!segment start op: [{}]: [{}] [{}]zsegment end op: [{}]: [{}] [{}]zhfound [{}] vars which cross recompute segment: [{}],better checkpoints might be set to reduce those varsr4   z.subprog_%d)r;   shaperO   rH   rP   rQ   TsumZnoprG   FZrecompute_segment_dep)r1   syncr2   )Cr   _dist_contextZ_srZ_refined_ops_patternsr   r   r   rY   r@   r   r?   r5   rm   r7   rr   loggerinfor   r9   r]   r   r1   rg   r<   rF   debugrB   r   rH   r6   r8   Zget_out_of_subgraph_varsro   r)   r+   Zget_input_nodesrt   Z_create_blockrU   rw   rP   Zget_input_dims_mappingZget_output_dims_mappingrX   r   rO   rQ   r   r   r   r_   original_idr   rn   dist_op_contextreset_op_dist_attrZgrad_op_id_to_op_idr\   reversedr[   r}   Zset_original_idr   Z$get_op_dist_attr_for_program_with_idset_op_dist_attrr   ra   )Hr!   r   Zstartup_programcontextr   ru   rD   r~   r   r   r.   Zops_devicesZop_names_devicesZall_ops_lenZall_exclude_ops_idsZrefined_ops_patternr   r   r   r   Zmain_start_idZmain_ops_lenZpattern_opsZpattern_ops_lenr   Zop_names_deviceZpattern_countZops_len_devicer:   Zexclude_ops_idsr   Zrc_mark_strZrc_staterE   Zidx1Zidx2r   segmentZ
cross_varsvar_name_dictZckpt_ops_dictZbuffer_blockZfwd_opsZ
var_suffixZinput_and_output_namesrd   r;   rf   re   Zref_varZrc_varZsegment_descskeyZckpt_opr   Zloss_opZloss_op_idxr   Zgrad_opvarnamer   Z	fwd_op_idr\   Zrc_opr   Zop_descZrc_descZfwd_op_dist_attrZprior_opZposterior_opZ
prior_meshZposterior_meshr$   r$   r%   _apply_single_implH  s  








 









	%



""


z RecomputePass._apply_single_implc                 C   s   | j |}|d usJ |jD ]}|| v r$||}||| | q|jD ]}|| v r=||}||| | q(d S r'   )	r   rU   r6   keysZget_input_dist_attrr`   r8   Zget_output_dist_attrset_output_dist_attr)r!   r.   r   Zop_dist_attrrT   in_dist_attroutputout_dist_attrr$   r$   r%   r   P  s"   



z RecomputePass.reset_op_dist_attrc           	      C   s   t  }d|_|j|_|j|_|j|_|j D ]!}|| v r-|j| }||| | q|j| }||| q|j D ]!}|| v rT|j| }|	|| | q>|j| }|	|| q>| j
|| d S r   )r   r1   Zimpl_idxZ	impl_typerY   Zinputs_dist_attrsr   r`   Zoutputs_dist_attrsr   r   Zset_op_dist_attr_for_program)	r!   r.   Zold_dist_attrr   Znew_dist_attrrT   r   r   r   r$   r$   r%   r   `  s,   



zRecomputePass.set_op_dist_attr)r   )rh   ri   rj   r   r   r   r   r   r   r   rl   r$   r$   r"   r%   r     s    
(  
r   r'   )'loggingrz   Zpaddle.base.backwardr   r   r   r   r   Z/paddle.distributed.fleet.meta_optimizers.commonr   r   Zpaddle.frameworkr	   Zpaddle.utilsr
   Z#auto_parallel.static.dist_attributer   Zauto_parallel.static.utilsr   r   r   r   r   r   r   r   Zutils.log_utilsr   Z	pass_baser   r   INFOr   r   rn   rx   r   r   r   r$   r$   r$   r%   <module>   s&   (

 !

