o
    "juw                    @   s>  d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ ddlmZmZ d	d
lmZmZ d	dlmZ d	dlmZmZmZ d	dlmZ d	dlmZm Z m!Z!m"Z"m#Z#m$Z$ ee%ej&ddZ'g dZ(dZ)dd Z*dd Z+dd Z,dd Z-dd Z.dd Z/dd  Z0d!d" Z1G d#d$ d$Z2dS )%    N)contains_spmd_ruleget_phi_spmd_ruleget_spmd_rule)Operator)
get_logger)OpRole)core   )ProcessMeshcompute_compatible_process_mesh   )OperatorDistAttrTensorDistAttr)_node_id)_gradient_sync_by_partial_ops*find_compatible_distributed_operator_impls(find_distributed_operator_impl_containerget_world_process_group)__no_shape_var_type___g_gradient_clip_opsis_gradient_clip_opis_loss_grad_op
is_loss_opis_naive_data_parallelz&%(asctime)s-%(levelname)s: %(message)s)fmt)create_py_readercreate_double_buffer_readerwhilereadZ Auto_Parallel_Completion_Skippedc                 C   s   |  ddt  d S )Nop_namescope/)Z	_set_attr_skip_propagation_prefix)op r$   s/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/auto_parallel/static/completion.py$mark_as_sharding_propagation_skip_op=   s   r&   c                 C   sP   t | tjjjr| }nt | tr| j}ntd|  d|do't	|
dv S )Nz*static mode operator is expected but got []r    )
isinstancepaddlebaseZ	libpaddleZOpDescr   descRuntimeErrorhas_attrr"   attr)r#   op_descr$   r$   r%   is_sharding_propagation_skip_opA   s   
r0   c                 C   s:   | sdS dd }d}| D ]}|||\}}|s dS q|S )z?Compute the compatible dim mapping given a list of dim mapping.Nc                 S   s4   | dkrd|fS |dkrd| fS | |krd| fS dS )NT)FNr$   )Zdm1Zdm2r$   r$   r%   &_compute_compatible_dim_mapping_of_twoR   s   zNcompute_compatible_dim_mapping.<locals>._compute_compatible_dim_mapping_of_twor1   r$   )Zdim_mapping_listr2   compatible_resultmapping
compatibler$   r$   r%   compute_compatible_dim_mappingM   s   	r6   c                 C   sx   | sdS t | d }| D ]}|du r dS t ||kr dS qg }t|  D ]}tt|}|du r4 dS || q%|S )zoCompute the compatible dims mapping given a list of dims mapping.
    Each of dims mapping is also a list.
    Nr   )lenzipr6   listappend)dims_mapping_listlengthdims_mappingr3   Zdim_mappingsZcompatible_dim_mappingr$   r$   r%   compute_compatible_dims_mappinge   s$   r>   c                 C   s^   t  }t  }| d u r|d u rd S | d urt | j}|d ur"t |j}||}tt|}|S N)setZprocess_idsunionr
   r9   )Zpm1Zpm2Zprocess_set1Zprocess_set2Zmerged_process_setmerged_process_meshr$   r$   r%   merge_process_mesh_two|   s   


rC   c                 C   sn   | d u rdS t t| D ]}| | dk s| | t|jkr  dS qt t|jD ]}| |dkr4 dS q(dS )NFr1   r   T)ranger7   shapecount)r=   process_meshir$   r$   r%   _validate_dims_mapping   s   rI   c                 C   sv   t dd}t|tr| }|dkrdnd}t|}g d}t d}|dkr-|d | jj}|o:t	|o:||v S )NZFLAGS_infer_spmd_enableTtrueF)Z	matmul_v2Zelementwise_divZgeluZ!fused_softmax_mask_upper_triangleZelementwise_addelementwise_mulZassignscaleZdropoutZ
reduce_sumZ
layer_normZlookup_table_v2Zreshape2Z
transpose2splitZ
unsqueeze2ZsiluZPARALLEL_CROSS_ENTROPYZsoftmax_with_cross_entropy)
osgetenvr(   strlowerboolr:   	serial_optyper   )dist_openableZ__adapted_ops__Zparallel_ceop_typer$   r$   r%   _can_apply_infer_spmd_rule   s   


rX   c                 C   sh   t | }td| jj|j || }|p|}|| |}td| jj| jj	| jj
 |o3| S )Nz*Update Op [{}] using DistOpContainer [{}].z'Op [{}] use dist op impl [{}] idx [{}].)r   _loggerdebugformatrS   rT   update_dims_mappingZmapping_to_dist_operator_impl	dist_attr	impl_typeimpl_idx)rU   original_op_dist_attrchangedZdist_op_containerupdatedZrevertedr$   r$   r%   ,_update_op_dims_mapping_and_distoperatorimpl   s&   


rc   c                   @   s   e Zd Zdd Zd+ddZd+ddZdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zd,ddZdd Zd,ddZd,dd Zd,d!d"Zd#d$ Zd,d%d&Zd,d'd(Zd)d* ZdS )-	Completerc                 C   s   |d usJ || _ d| _d S )NF)_dist_context_has_prepared)selfdist_contextr$   r$   r%   __init__   s   
zCompleter.__init__Tc                 C   s  d}|  r| d u rdS | }| tv rdS | j|}|d us&J |dr-dS |j}|rg }|jD ]9}|	 d urp|	  dksW|	  dksW|	  dkrXq7| j
|}	|	j|jkrp|	| }
||
 q7|| t|}t||jsdS |d ur||kr||_d}|S g }|jD ]?}|	 d ur|	  dks|	  dks|	  dkst|	 rq| j
|}	|	j|jkr|	| }
||
 q|| t|}t||jsdS |d ur||kr||_d}|S )NFr=   r   r   r   T)is_varvarrT   r   re   get_tensor_dist_attr_for_graphis_annotatedr=   inputsr#   Zget_op_dist_attr_for_graphrG   get_output_dims_mappingnamer:   r>   rI   outputsr0   get_input_dims_mapping)rg   tensor_nodefwdra   tensor_desctensor_dist_attrtensor_dims_mappingr;   Zpred_op_nodeop_dist_attrop_dims_mappingcompatible_dims_mappingZsucc_op_noder$   r$   r%    _update_tensor_node_dims_mapping   s   




'




z*Completer._update_tensor_node_dims_mappingc                 C   sD  d}|  }| r|  d u rdS | tv st|  r dS | j|}|j}t	|}|r4|j
}n|j}|D ]{}	|	 rE|	 d u rFq9|	  tjjjkrRq9|	 }
|r`||
 }n||
 }|rjq9| j|	}|j|jkr|j}|r||
 }n||
 }t||g}t||jsq9|d ur||kr|r||
 | n||
 | d}q9t|rt d!|j"j t#|||S t d!|j"j t$||d}|d urd}t	|}|}|D ]'}|%|}|rd}|&|r|' r|j|_(|j)|_*d} n||_|}q|r||_d}|S ||_d}|S )NFTz5Op [{}] update dims mapping using New InferSPMD Rule.z7Op [{}] update dims mapping using Original DistOp Rule.rt   )+r#   is_oprT   __skip_dims_mapping_op__r0   re   get_dist_op_for_graphr]   copydeepcopyrn   rq   rj   rk   r   VarDescVarTypeREADERZis_annotated_input_dims_mappingrp   Z is_annotated_output_dims_mappingrl   rG   r=   rr   ro   r>   rI   set_input_dims_mappingset_output_dims_mappingrX   rY   rZ   r[   rS   rc   r   r\   is_auto_compatiblevalidate_dist_attrr^   idxr_   )rg   op_nodert   ra   r/   rU   rx   r`   Z	node_listrs   ru   Z	annotatedrv   rw   ry   rz   op_dist_implsnot_compatiblebackup_op_dist_attrZbackup_changedop_dist_impldim_changedr$   r$   r%   _update_op_node_dims_mapping6  s   





z&Completer._update_op_node_dims_mappingc           	      C   s   d}| j D ]F\}}| j|}| j|}|j|jkrq|j}|j}t||g}t||js1 dS |d ur>||kr>||_d}|d urK||krK||_d}q|S )NFT)_node_pairs_between_graphsre   get_dist_attr_for_graphrG   r=   r>   rI   )	rg   ra   parent_node
child_nodeparent_node_dist_attrchild_node_dist_attrZparent_node_dims_mappingZchild_node_dims_mappingrz   r$   r$   r%   #_update_dims_mapping_between_graphs  s>   z-Completer._update_dims_mapping_between_graphsc           	      C   s   | j j}g d}|D ]M}| d ur|  |v rq
| j |}|jD ]2}| rV| d urV|  tj	j
jkr<q$| }| j |}|j|jkrV|| }||_q$q
d S )N)r   r   r   )re   _serial_ordered_op_nodesr#   rT   r   rq   rj   rk   r   r   r   r   rl   rG   ro   rp   r=   )	rg   Zop_nodesZrelated_reader_opsr   rx   rs   ru   rv   ry   r$   r$   r%    _update_dims_mapping_for_special  s4   
z*Completer._update_dims_mapping_for_specialc           	      C   s   d}|sWd}dD ]E}|r| j jnt| j j}|D ],}| r/| d ur/| j||d}|r/d}| rD| d urD| j||d}|rDd}q| 	 }|rMd}q|rSd}nd}|r| 
  d S )NF)TFr|   T)re   serial_ordered_nodesreversedrj   rk   r{   r}   r#   r   r   r   )	rg   Zreach_fix_pointra   Zis_fwd	all_nodesnodeZtensor_changedZ
op_changedZgraph_changedr$   r$   r%   _update_dims_mapping  s>   

zCompleter._update_dims_mappingc           
      C   s2  | j |}|ds(|j}| j |}|j}t||g}|d ur(||kr(||_|jdkr/d S |jD ]4}| rf| d urf| j 	|}	|	drJq2t
|jdkrRq2t|	j|jg}|d urf|	j|krf||	_q2|jD ],}| r| d ur| j 	|}	|	drqjt|	j|jg}|d ur|	j|kr||	_qjd S )NrG   r   r   )re   r   rm   rG   r   rW   rn   rj   rk   rl   r7   rq   )
rg   r   nearest_op_noderx   rG   Znearest_op_dis_attrZnearest_process_meshcompatible_process_meshrs   rv   r$   r$   r%   _update_process_mesh_by_nearest  s`   









z)Completer._update_process_mesh_by_nearestc                 C   sl  dd }dd }dd }dd }| j  D ]m\}}| d	}| jj|}t| }	| j	|}
|
j
}|j}|	D ]"}| rH| d usR| r^| d ur^| j|}t||j}q<||_|| g }| d
d }d }|jD ]}| r| d ur|  |kr|}||  nqv||| d }t|	D ]}| r| d ur|  |krt|jdkr|} nq||| | dd }d }|jD ]}| r| d ur|  |kr|}q|| |D ]}| j|}||_|| q|j}| D ]1\}}|| jj||}| j|}|j|_|jD ]}|  |kr9| j|}|j|_q$q|j}| D ]>\}}|| jj||}|d u r]|| jj||}| j|}|j|_|jD ]}|  |kr| j|}|j|_qjqDq| j D ]*}d }|D ]}| j|}t||j}q|D ]}| j|}||_|| qqd S )Nc                 S   sF   t | d | D ]}| r | d ur |  |kr |  S qd S r?   )r   rj   rk   rp   nodesr   var_namer   r$   r$   r%    _find_nearest_tensor_node_beforeX     zUCompleter._update_process_mesh_for_specials.<locals>._find_nearest_tensor_node_beforec                 S   sF   | |d d  D ]}|  r | d ur |  |kr |  S qd S )Nr   )rj   rk   rp   r   r$   r$   r%   _find_nearest_tensor_node_aftera  r   zTCompleter._update_process_mesh_for_specials.<locals>._find_nearest_tensor_node_afterc           	      S   s  g }t  }g }||  t|dkr|d }|dd  }t||v r#q|j|j }|D ]}| rV| d urV|  t	j
jjkrVt|  dkrV|| || | r| d urd}|  dksz|  dksz|  dkr|d}|jD ]"}| r| d ur|  tv st|  dkrd} nq|jD ]"}| r| d ur|  tv st|  dkrd} nq|r|| || q+|t| t|dks|S )Nr   r   Tr   r   r   F)r@   r:   r7   r   rn   rq   rj   rk   rT   r   r   r   r   rE   r}   r#   r   add)	Zsource_nodeZrelated_nodesvisitedZfrontiercurZ	neighborsr   flagrs   r$   r$   r%   _find_nodes_related_to_condj  sf   








3zPCompleter._update_process_mesh_for_specials.<locals>._find_nodes_related_to_condc                 S   s   t | trt| jD ]	\}}d| j|< q
t | trW| j D ]}g }| |}|D ]}|d q)| 	|| q| j
 D ]}g }| |}|D ]}|d qH| || q=d S d S )Nr1   )r(   r   	enumerater=   r   inputs_dist_attrskeysrr   r:   r   outputs_dist_attrsro   r   )r]   rH   _arg_namenew_dims_mappingr=   r$   r$   r%   _make_dims_mapping_replicate  s(   



zQCompleter._update_process_mesh_for_specials.<locals>._make_dims_mapping_replicate	sub_block	Conditionr   Z
StepScopes)_while_op_nodesvaluesr#   Z_block_attr_idre   Zserial_graphZget_sub_graphr9   r   r   r]   rG   rj   rk   r}   r   rC   inputrn   rp   r:   extendr   r7   rq   outputr   itemsr   r   _array_nodes)rg   r   r   r   r   Zwhile_op_nodeZwhile_op_node_idxZsub_graph_idZ	sub_graphZsub_graph_nodesZwhile_dist_opZwhile_op_dist_attrrB   r   r]   Zcond_tensor_related_nodesZcond_tensor_nameZcond_tensor_nodeZstepscopes_tensor_nameZstepscopes_tensor_nodeZoutput_noderv   Zwhile_op_inputs_dist_attrstensor_nameZnearest_tensor_nodeZnearest_tensor_dist_attrZnode_dist_attrZwhile_op_outputs_dist_attrsZarray_node_listZ
array_noder$   r$   r%   !_update_process_mesh_for_specialsW  s  		;










!z+Completer._update_process_mesh_for_specialsc                 C   st   | j D ]4\}}| j|}| j|}|j|_t|j|jg}|d ur+|j|kr+||_|d ur7|j|kr7||_qd S r?   )r   re   r   rG   r   )rg   r   r   r   r   r   r$   r$   r%   #_update_process_mesh_between_graphsV  s0   
z-Completer._update_process_mesh_between_graphsc                 C   s  | j j}| j j}|D ]^}| j |}|dsq
d }|D ]#}|  dkr'q|jD ]}t|t|kr8|} nq*|d ur? nq|d u rEq
| j 	|}|d urh|dsht
|j|jg}	|	d urh|j|	krh|	|_q
d}
t|D ]\}}| j 	|}|jd ur|
dkr|}
| || qo|
d t|krd S t||
d d  D ])\}}|
| d }||d  }| j 	|}| j 	|}|jd usJ | || q||
 }|d |
 D ]}| || q|   |   d S )NrG   r   r1   r   )re   r   Z_serial_ordered_tensor_nodesrl   rm   r#   rT   rn   r   r   r   rG   r   r   r7   r   r   )rg   Zordered_op_nodesZordered_tensor_nodesrs   rv   Zfirst_op_noder   Zinput_tensor_noderx   r   Z%idx_of_first_op_node_has_process_meshr   Zoriginal_idxr   Znearest_op_dist_attrr$   r$   r%   _update_process_meshs  s|   





zCompleter._update_process_meshc           	      C   s  | j rd S i | _i | _g | _| jj}t|D ]\}}| r| 	 dkr/||f| jt
|< | 	 dkra| dd }| j|d d u rNg | j|< | j| | | j| |jd  | 	 dkr| dd }| j|d d u rg | j|< | j| | | j| |jd  | r| d ur|j dkr| jj|j d  |  d }|d urt|dd	 d
}|D ]\}}| j||f qqd| _ d S )Nr   Zread_from_arrayXr   Zwrite_to_arrayOutr   c                 S   s   | d S )Nr   r$   )xr$   r$   r%   <lambda>  s    z$Completer._prepare.<locals>.<lambda>)keyT)rf   r   r   r   re   r   r   r}   r#   rT   r   r   getr:   rn   r   rq   rj   rk   r   Zgraph_idZ_tensor_nodes_with_same_namerp   sorted)	rg   r   r   r   Zarray_var_nameZparent_nodesZsorted_parent_nodesr   r   r$   r$   r%   _prepare  sN   


zCompleter._prepareNc                 C   s   |du r	| j j}n|| j _t| j s+| j jdd |   |   |   | j   nt	
d | j jdd |   | | | j   | j   |S )a  Complete annotation for the partial annotated serial_main_program.
        Arguments:
            serial_main_program: partial annotated serial_main_program.
        Returns:
            serial_main_program: completed annotated serial_main_program.
        NT)Z
with_graphz+Default distributed attributed will be set.F)re   serial_main_program_serial_main_programr   
initializer   r   r   $copy_dist_attr_from_graph_to_programrY   info_update_dist_attr_for_dp$_complete_high_order_grad_annotationamend_dist_attr_for_programvalidate_dist_attr_for_programrg   r   r$   r$   r%   complete_forward_annotation  s    





z%Completer.complete_forward_annotationc                 C   s  t  j}t|}| jj}| D ]}||j_q| jj}| D ]}|j	}|j}||_t
|}	|jdkr5q|jD ] }
||
}|jsX| j|}|j}|jj|_||
|jj q8t|dd}|d urd}t
|}|D ]}|| ||r| r|j|_|j|_d} n||_ql|r|	|_n|	|_|jD ]=}
|j}||
}|jdv r||
}t|dkrdgdd tt|d	 D  }||
| | j|}||
|j_qqd S )
Nr   Tr|   F)fill_constantr   c                 S      g | ]}d qS r1   r$   .0r   r$   r$   r%   
<listcomp>O      z6Completer._update_dist_attr_for_dp.<locals>.<listcomp>r   ) r   ranksr
   re   Z_dist_tensors_for_programr   r]   rG   Z_dist_ops_for_programrS   r   r   rT   input_arg_namesget_serial_inputis_parameterget_dist_tensor_for_programr   r=   r   r\   r   r   r^   r   r_   output_arg_namesZget_serial_outputro   r7   rD   r   )rg   r   rG   Zdist_tensorsdist_tensorZdist_opsrU   rS   rx   r`   r   Zserial_tensorr   r   r   r   Zold_dims_mappingr   r$   r$   r%   r     s   










z"Completer._update_dist_attr_for_dpc                 C   s  |d u r	| j j}n|| j _| j   |   t }| j j}|D ]}| r| 	 dv r.q| j 
|}|j}|jD ]G}| r| d urt|jdkrNq:| }| }	||	}
|	|v r`q:| j |}|j|_|
jrr||	ndd | D |_||	 q:|jD ],}| r| d ur|  }	|	|v rq| j |}|j|_||	|_||	 qq|   |   |   |   | j   | j    | j !  d S )N)r   r   c                 S   r   r   r$   )r   rH   r$   r$   r%   r         z>Completer._complete_tensor_dist_attr_by_op.<locals>.<listcomp>)"re   r   r   r   r   r@   r   r}   r#   rT   r   r]   rn   rj   rk   r7   rp   r   rl   rG   r   rr   rE   r=   r   rq   ro   r   r   r   r   r   r   r   )rg   r   Zhas_set_dist_attrr   r   rU   rx   rs   ru   r   tensorrv   r$   r$   r%    _complete_tensor_dist_attr_by_op\  sp   








z*Completer._complete_tensor_dist_attr_by_opc           "      C   s&  |du r	| j j}n|| j _dd }dd }t| j}| j}| j j}|j}d}t	dt
|D ]}	||	 }
t|
dttjjjkrGq1t|
dttjjjkrkt||	d  dttjjjkrk|d7 }t|
dtttjjjttjjjB kr|
jd	ksJ  dS ||	 }|j |jv rG|||j|j  }|dusJ | j |}|j}t }||_|jD ]M}||jvr||jvr||| v r|| | }||}n|| }| j |j}n||jv r||}n||}|dusJ d
| d| || q|jD ]1}||| v sJ || | }||}|| }t! }||_||_| j "|| |#|| q| j $|| q1|jdkrt%t&||jsXJ |jd }||| v slJ d| d|| | }|| }| j |}|j}|j}t! }||_||_|| }| j "|| t }||_|jD ]	}| || q|#|| nY|jdkr|jd }|| }| j |}|j}|j} t! }||_| |_|jd }!||! }| j "|| t }| |_| || |#|!| n|jdv rq1t'dt(|j d| j $|| q1dS )z
        NOTE:
            [HighOrderGrad] Complete the annotation of vars and ops only for high order gradient.
            This function is temporary to support high order gradient, and will be removed in the future.
        Nc                 S      d| v rdS dS N@GRADTFr$   rp   r$   r$   r%   _is_grad_var_name     zICompleter._complete_high_order_grad_annotation.<locals>._is_grad_var_namec                 S   $   | D ]}|j  |kr|  S qd S r?   r+   original_idopsidr#   r$   r$   r%   _get_op_by_id  
   zECompleter._complete_high_order_grad_annotation.<locals>._get_op_by_idr   op_roler   r   [] 's dims mapping is NONEsumsum op's output '' has no corresponding varfill_any_like)rE   r   got unexpect op [r'   ))re   r   r   r9   global_blockr   varsdist_op_contextgrad_var_to_varrD   r7   intr.   r   Zop_proto_and_checker_makerr   ZForwardZBackwardZLossrT   r+   r   grad_op_id_to_op_idget_op_dist_attr_for_programrG   r   r   r   ro    get_tensor_dist_attr_for_programr=   rr   r   r    set_tensor_dist_attr_for_programr   set_op_dist_attr_for_programallmap
ValueErrorrP   )"rg   r   r   r   r   r   r   r   Zappended_grad_timesr   r#   grad_op
forward_opfwd_op_dist_attrZfwd_op_process_meshgrad_op_dist_attr
input_namefwd_nameref_dims_mapping	input_varoutput_name
output_varrv   ref_fwd_var_nameref_fwd_varref_fwd_dist_attrref_fwd_dims_mappingref_fwd_process_meshr   ref_var_nameref_varref_dist_attrref_process_meshoutput_var_namer$   r$   r%   r     sD  













 z.Completer._complete_high_order_grad_annotationc           -         s  |du r	j j}n|j _dd   fdd}dd }fdd	}d}d
}t| jD ]\}}t|r7|}t|rF|jdksBJ |} nq-|dkrO|dusSJ dt	| j}	| j
}
j j}|jt|j t|t|	D ]}|	| }||kr|jdksJ t|jdksJ dt|jt|jdksJ dt|j|
|jd  }|
|jd  }|jd |jksJ j |}t }|j|_|j|_j || j |}t }|j|_|j|_||j}||j| j || qr|j |j v re||	d| |j |j  }|dusJ |!dr^|!dr^||||
 |"dj#}|"dj#}|j$| }|j$| }|jD ]}||j|j |j  }||||j
 qFqr||||
 qr|jdkrt%t& |jsvJ |jd }|v sJ d| d| }|
| }j |}|j}|j} t }||_| |_|
| }!j |!| t }|jD ]	}"|'|"| q||| | |_d}#t|d |d d
D ]%}$|	|$ }%t	t(|%jt(|j@ }&t|&dkrj |%}'|'j}# nq|#dusJ |#|_j || qr|jdkr[|jd }(|
|( })j |)}*|*j}|*j}+|jd },|
|, }!t }||_|+|_j |!| t }|+|_|'|(| ||,| j || qrt)dt*|j ddS )zSComplete the annotation of vars and ops in the backward phase for parallel program.Nc                 S   r   r   r$   r   r$   r$   r%   r     r   zACompleter.complete_backward_annotation.<locals>._is_grad_var_namec                    s*    | sJ d|  d| d |  d S )Nr   z] is not a grad varnme.r   )find)Zgrad_var_name)r   r$   r%   &_get_forward_varname_from_grad_varname  s   
zVCompleter.complete_backward_annotation.<locals>._get_forward_varname_from_grad_varnamec                 S   r   r?   r   r   r$   r$   r%   r     r   z=Completer.complete_backward_annotation.<locals>._get_op_by_idc                    s<  j | }t }|j}|jdkrP| jdkrP| dd }||}||jdd  }t	 }	||	_
||	_j ||	 |jD ]}
||
| q?||j| n|jD ]H}
|
| jvry|
| jvry|
 v rm |
 }||}n||
 }j |j
}n|
| jv r||
}n||
}|d usJ d|
 d||
| qS|jD ]V}|dkr|| }t	 }d	d
 tt|jD }||_
||_j || ||| q| v sJ  | }||}|| }t	 }||_
||_j || ||| q||_|j|_|j|_|j|_dd }|jtv r|||| j || d S )NconcatrM   r   r   r   r   r   z@EMPTY@c                 S   r   r   r$   r   r$   r$   r%   r     r   zeCompleter.complete_backward_annotation.<locals>._complete_grad_op_with_forward_op.<locals>.<listcomp>c                 S   s  g }d }g }|j dkrTt|ddkrT|dd }||d t| | j}t| |dd  j}|dkrG|dkrFtt|d }n||krStt|| }n|j dkr|dd }||d |dd }ttt| | jt| | j }nU|j d	kr|dd }||d
 ||d t|	d}	tt|	}n+|j dkr|dd }||d ttt| | jd }n	t
dt| t|dkr||}
|D ]}|
| dkr|
| }|D ]}||}||g qqd S d S )NZmatmul_v2_gradzY@GRADr   zOut@GRADr	   r   Zelementwise_add_gradYZlayer_norm_gradz	Bias@GRADz
Scale@GRADbegin_norm_axisZlookup_table_v2_gradzW@GRADz$Backward Partial is not adapted for r1   )rT   r7   r   r   r   rE   r9   rD   r   r.   NotImplementedErrorrP   rr   Zget_output_dist_attrZ_set_partial_dims)r   r	  r  Zparam_gradsZactivation_gradZbroadcast_axis_indiesZact_ndimZ
param_ndimZ	param_varr!  Zactivation_grad_dims_mappingZaxisZpartial_dimZp_grad_nameZp_grad_dist_attrr$   r$   r%    infer_backward_op_partial_status  s~   




z{Completer.complete_backward_annotation.<locals>._complete_grad_op_with_forward_op.<locals>.infer_backward_op_partial_status)re   r  r   rG   rT   r   rr   r+   r   r   r=   r  r   r   r   rp   r   ro   r  rD   r7   rE   r^   r_   chunk_idr   r  )r
  r	  r   r  r  r  Zsplit_input_var_namer  r  Zoutput_var_dist_attrr  r  r  r  rv   r#  )r   rg   r$   r%   !_complete_grad_op_with_forward_op  s   







JzQCompleter.complete_backward_annotation.<locals>._complete_grad_op_with_forward_opr1   r   r   z,No backward procedure found in this program.z:first backward op should has only ONE output, but got [{}]r   r   r   r   r   r   r   r   r'   )+re   r   r   r   r   r   r   r   rT   r9   r   r   r   r7   rD   r   r[   r   rp   r  r   r=   rG   r  r  r   r$  ro   r   r  r+   r   r  r-   r.   r   blocksr  r  r   r@   r  rP   )-rg   r   r  r   r%  Zloss_opZfirst_backward_op_idxr   r#   r   r   r   r	  Zloss_grad_varZloss_varZloss_var_distr_attrrv   Zloss_op_dist_attrr  r  r
  Zgrad_sub_block_idZforward_sub_block_idZgrad_sub_blockZforward_sub_blockZsub_grad_opZsub_forward_opr  r  r  r  r  r  r  r   r$  Zpre_idxZpre_grad_opZinter_arg_nameZpre_op_dist_attrr  r  r  r  r  r$   )r   r   rg   r%   complete_backward_annotation~  sv  
 A








 z&Completer.complete_backward_annotationc                 C   sB  ddl m} | j}t| j}| j}d}tt|D ]}|| }t	|
dt	tjkrt|rQ|jtv rt }	t||	_|jD ]}
||
 }| j|}|	|
|j qG|jD ]%}|| }t }t||_dd |jD |_| j|| |	||j q^n||dd  }| j|}|dusJ |j}|j}|j}|jd	kr||d
  jdkr|||d
  dd  }| j|}|dusJ |j}||dd  }| j|}|st }||_||_|j|jkr||_nt|jd
kr|jd d
ksJ dd |jD |_| j|| t }	||	_|jD ]}
||
 }| j|}|	|
|j q|jD ]}|| }| j|}|	||j q5| j ||	 d|j!v rd|| j!v rt|dd
ksmJ dt|dd
ks{J d||dd  }||dd  }| j|}|dusJ | j|j}|dusJ | j|j}|dusJ t }	||	_|	|j"| |	|j"| |	|j"| ||dd  }|	|j"dd |jD  |	|j"dd |jD  |sd}t }t||_dd |jD |_| j|| |j#! D ]}|dv rqt|j#|dkr,qt|j#|d
ks9J ||j#|d  }t }d|v sUd|v sUd|v rtdg|_|	|j"dd |jD  |	|j"dd |jD  n||_|	|j"| |	|j"| d|vr||_| j|| q| j ||	 qqdS )zQComplete the annotation of vars and ops in the update phase for parallel program.r   r   Fr   c                 S   r   r   r$   r   r$   r$   r%   r   S  r   z8Completer.complete_update_annotation.<locals>.<listcomp>r   Ncastr   rK   r   c                 S   r   r   r$   r   r$   r$   r%   r     r   GradParamzOnly support one-to-one now.LearningRatec                 S   r   r   r$   r   r$   r$   r%   r     r   c                 S   r   r   r$   r   r$   r$   r%   r     r   Tc                 S   r   r   r$   r   r$   r$   r%   r     r   )r*  r)  r+  ZBeta1TensorZBeta2TensorZEpsilonTensorZBeta1PowZBeta2PowZ
SkipUpdater1   c                 S   r   r   r$   r   r$   r$   r%   r     r   c                 S   r   r   r$   r   r$   r$   r%   r     r   )$5paddle.distributed.auto_parallel.static.process_groupr   r   r9   r   r   r   rD   r7   r   r.   r   ZOptimizer   rT   r   r   r
   rG   r   re   r  r   r=   r   r   rE   r  r   r   r$  r   r  Zinput_namesrp   r+   )rg   r   r   world_ranksr   r   Zlearning_rate_completedr   r#   rx   Zin_nameZin_varZin_dist_attrZout_nameZout_varZout_dist_attrr  r  Zref_chunk_idr  r  paramZgrad_varZparam_dist_attrZlearning_varZvar_dist_attrr  r  Zinput_var_attrr$   r$   r%   complete_update_annotation(  sr  














 z$Completer.complete_update_annotationc           	      C   s   |du r	| j j}n|| j _i }|jD ];}|jD ]5}|j|j D ],}||}d|v r+q||vrK| j |}| j 	|}|j
|j
krK|j|_|j||j< qqqtt| | j _dS )zz
        NOTE(zhaoyingli): Temporary methods.
        This func is for completing the chunk_id attr for every var
        NZlod_tensor_blocking_queue)re   r   r   r&  r   r   r   Z_find_var_recursiver  r  rG   r$  rp   r7   r@   r   Z_num_model_chunks)	rg   r   Zvar_to_chunk_idblockr#   rp   rk   rx   rv   r$   r$   r%   _complete_var_chunk_id  s8   





z Completer._complete_var_chunk_idc                 C   sL   |du r	| j j}n|| j _d| j _| j   |   | j   | j   dS )a  
        fill default data parallel annotation for program with primitive operators.

        Arguments:
            serial_main_program: partial annotated serial_main_program.
        Returns:
            serial_main_program: completed annotated serial_main_program.
        NT)re   r   r   Z_is_initializedZ_init_dist_attr_for_program_init_global_mesh_for_programr   r   r   r$   r$   r%   complete_prim_annotation5  s   	


z"Completer.complete_prim_annotationc                 C   s   ddl m} | j}| jjjD ]b}|j D ]}| j|}|d us$J t	||j
_q|jD ]B}| j|}|d us<J t	||j
_t|dd}|d urpt|j
}	|D ]}
|
|}|
|rl|
j|j
_|
j|j
_ n|	|_
qTq.qd S )Nr   r   Tr|   )r,  r   r   re   r   r&  r   r   r   r
   r]   rG   r   Zget_dist_op_for_programr   r   r   r\   r   rT   r^   r   r_   )rg   r   r-  r0  r   r   r#   rU   r   r   r   r   r$   r$   r%   r2  J  s8   




z'Completer._init_global_mesh_for_program)Tr?   )__name__
__module____qualname__ri   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r'  r/  r1  r3  r2  r$   r$   r$   r%   rd      s8    

^{##$<  O
*!
O
N 
U   - 
h
&rd   )3r   loggingrN   r)   Zpaddle.base.corer   r   r   Zpaddle.base.frameworkr   Zpaddle.base.log_helperr   Z/paddle.distributed.fleet.meta_optimizers.commonr   Zpaddle.frameworkr   rG   r
   r   Zdist_attributer   r   rh   r   Zoperators.commonr   r   r   Zprocess_groupr   utilsr   r   r   r   r   r   r4  INFOrY   r~   r"   r&   r0   r6   r>   rC   rI   rX   rc   rd   r$   r$   r$   r%   <module>   s:    	"