o
    "j,F                    @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlZd dl	Z	d dl
mZ d dlmZ d dlmZmZ d dlmZ ddlmZ d	d
lmZmZmZ ejjZej Zejjjejjj ejjj!ejjj"ejjj#gZ$dgZ%g dZ&dddZ'dd Z(dd Z)dd Z*dd Z+dd Z,dd Z-dd Z.dd Z/d d! Z0d"d# Z1d$d% Z2d&d' Z3d(d) Z4dd*d+Z5dd,d-Z6d.d/ Z7d0d1 Z8d2d3 Z9d4d5 Z:d6d7 Z;d8d9 Z<dd:d;Z=d<d= Z>d>d? Z?d@dA Z@dBdC ZA		D	ddEdFZBdGdH ZC	ddIdJZDdKdL ZEdMdN ZFdOdP ZGdQdR ZHdSdT ZIddUdVZJdWdX ZKdYdZ ZLd[d\ ZMd]d^ ZNd_d` ZOdadb ZPdcdd ZQdedf ZRdgdh ZSdidj ZTdkdl ZUdmdn ZVdodp ZWdqdr ZXdsdt ZYdudv ZZdwdx Z[dydz Z\d{d| Z]d}d~ Z^dd Z_dd Z`dd Zadd ZbG dd dZcdd Zddd Zedd Zfdd Zgdd Zhdd Zidd Zjdd Zkdd Zldd Zmdd Zndd Zodd Zpdd Zqdd Zrdd Zsdd Ztdd Zudd Zvdd Zwdd Zxdd Zydd Zzdd Z{	D	D	dddZ|		D	D		DdddZ}dd Z~dd ZeeZdd ZddĄ ZddƄ Zdededefdd˄Zddd΄ZdS )    N)reduce)wrap_decorator)core)is_belong_to_optimizeris_parameter)Variable   ProcessMesh   )DistTensorSpecOperatorDistAttrTensorDistAttrZ	expand_v2)sumsqrtZfill_constantZelementwise_maxelementwise_divstack
reduce_sumauto_parallelc                 C   sX   t |}d|_|js%||  t  }t d}|| || |S ||  |S )NFz>%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s)	logging	getLogger	propagatehandlerssetLevelStreamHandler	FormattersetFormatter
addHandler)Z	log_levelnameloggerZlog_handlerZ
log_format r    n/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/auto_parallel/static/utils.py
get_logger8   s   




r"   c                 C   s"   |t |  kr|t | k rdS dS )NTF)len)listindexr    r    r!   is_valid_list_indexH   s   r&   c                 C   s   | dkrdS dS NTFr    mappingr    r    r!   is_dim_shardO      r+   c                 C   s   | dkrdS dS r'   r    r)   r    r    r!   is_dim_replicateV   r,   r-   c                 C   s   | d u rdS t dd | D sdS tt| D ]}| | dk s(| | t|jkr+ dS qtt|jD ]}| |dkr? dS q3dS )NFc                 s   s    | ]}t |tV  qd S N)
isinstanceint).0dr    r    r!   	<genexpr>`   s    z&verify_dims_mapping.<locals>.<genexpr>r(   r   T)allranger#   shapecount)dims_mappingprocess_meshir    r    r!   verify_dims_mapping]   s   r;   c                 C   sZ   g }| D ]&}|d u r| d q|j|j| dkr!| d q| |j| q|S )Nr(   r   )appendr6   	dim_namesr%   )
shard_specr9   r8   shardr    r    r!   convert_to_dims_mappingk   s   r@   c                 C   s6   g }| D ]}|dkr| d  q| |j|  q|S )Nr(   )r<   r=   )r8   r9   r>   dim_mappingr    r    r!   convert_to_shard_specw   s   rB   c                 C   s   t | t |kr
dS | D ]}|d urt|ts dS |d ur&||jvr& dS qt| |}t||s3dS tt |D ]}|| dkrW|| dkrW|| |j||   dkrW dS q9dS )NFr(   r   T)r#   r/   strr=   r@   r;   r5   r6   )r>   tensor_shaper9   r?   r8   r:   r    r    r!   verify_shard_spec   s$   

rE   c                 C   sD   | sd S | d }| D ]}|dkr|}q
|dkrq
||krq
 d S |S )Nr   r(   r    )dim_mappingsZcompatible_mappingr*   r    r    r!   compute_compatible_dim_mapping   s   rG   c                 C   s|   | sd S t | d }| D ]}|d usJ dt ||ks J dqg }t|  D ]}tt|}|d u r6 d S || q'|S )Nr   z8Dims mapping must not be None for compatible computationzKThe length of dims_mapping in list must be same for compatible computation.)r#   ziprG   r$   r<   )dims_mapping_listlengthr8   Zcompatible_resultrF   compatible_dim_mappingr    r    r!   compute_compatible_dims_mapping   s&   
rL   c                 C   s>   d }| s|S | D ]}|d ur|d u s||kr|}q d S q|S r.   r    )Zprocess_mesh_listZcompatible_process_meshr9   r    r    r!   compute_compatible_process_mesh   s   rM   c                 C   s   t | t |ks
J d}g }tt | D ]}t| | || s!J || | ||   qt|}|d u r7dS tt | D ]}|| | ||  krS|| | || < d}q=|S )NFT)r#   r5   r&   r<   rG   )rI   Z
index_listchangedrF   r:   rK   r    r    r!   )compute_compatible_and_update_dim_mapping   s   rO   c                 C   s   | t   S )zE
    Append auto parallel suffix for distributed attribute name.
    )r   kAutoParallelSuffixr   r    r    r!   append_distributed_attr_suffix   s   rR   c                 C   s   |  t S )zF
    Remove auto parallel suffix from distributed attribute name.
    )stripr   rP   rQ   r    r    r!   remove_distributed_attr_suffix   s   rT   c           
      C   s   ddl m} |d u r| }| sJ d| jD ]<}|j D ]}||}||}|d ur7| s7  dS q|j	D ]}|
|}||}	|	d urS| sS  dS q;qdS )Nr   get_default_distributed_contextz8Distributed attributes must be initialized before check.FT)dist_contextrV   Zis_initialized_for_programblocksvarsvaluesZget_dist_tensor_for_graph get_tensor_dist_attr_for_programZis_validopsZget_dist_op_for_graphget_op_dist_attr_for_program)
programrW   rV   blockZtensordist_tensortensor_dist_attropdist_opop_dist_attrr    r    r!   "check_distributed_attr_for_program   s0   




re   c                 C   sj   t  }|  ddlm}m} |du r| }t| dd n| }|| t| dd || |  dS )z
    This function reuses the original program output ability with a distributed context.
    Using lock can avoid multiple threads change the default distributed context simultaneously.
    r   )rV   set_default_distributed_contextNT)flush)	threadingLockacquirerW   rV   rf   printrelease)r^   rW   lockrV   rf   Zoriginal_default_contextr    r    r!   print_program_with_dist_attr  s   rn   c           	         s   |v sJ d| d  |}t|  fddt| D }t| D ]}||| |< q+fdd|D }fdd|D }t|S )a  
    Given a rank and the processes mesh the rank belongs to,
    compute the communication peers of the rank based on the give axis in the mesh.

    Example: 16 processes managed in a 4-Dimensional mesh with shape of [2, 2, 2, 2].
    the rank communication peers of rank 0 (included) are following:
    in axis 0: [0, 1]
    in axis 1: [0, 2]
    in axis 2: [0, 4]
    in axis 3: [0, 8]
    zrank [z] is NOT in processes group c                    s   g | ]} d d  qS r.   r    r1   r:   )
coordinater    r!   
<listcomp>.      z#_get_comm_group.<locals>.<listcomp>c                    s   g | ]}t  |qS r    )_coordinate2linear_idx)r1   rp   )r6   r    r!   rq   4  s    c                    s   g | ]} | qS r    r    )r1   idx)	processesr    r!   rq   8      )r%   _linear_idx2coordinater5   sorted)	ru   r6   axisrankrank_relatvieZcoordinates_in_groupr:   Zranks_in_group_relativeZranks_in_groupr    )rp   ru   r6   r!   _get_comm_group  s   



r|   c                 C   s   |  |}t||}|| S )a  
    Given a rank and the processes mesh the rank belongs to,
    compute the index of the rank in given axis.

    Example: 27 processes managed in a 3-Dimensinal mesh with shape of [3, 3, 3].
    the index of rank 22 are:
    in axis 0: 1
    in axis 1: 1
    in axis 2: 2
    )r%   rw   )ru   r6   ry   rz   r{   rp   r    r    r!   _get_idx_in_axis=  s   

r}   c                 C   s   t | t |ksJ d| |tt | D ]#}|| dks(J d| d| || | | k s9J d|| |q| d }|d }tt | d ddD ]}||||  7 }|| | 9 }qL|S )a  
    convert a coordinate in multidimensional mesh space into a scala idx in linear space.

    it use Row-major order for dimension conversion.
    so it has:  [most_significant_dim, ..., least_significant_dim]
    assume:

        the size of i-th dimension to be:  S[i]
        the index of j-th dimension is: I[j]

    linear_idx of a n dimensional coordinate is:

        I[n-1] * (S[n-2] * S[n-3] * S[n-4] *     ....    S[0]) +
        I[n-2] * (         S[n-3] * S[n-4] *     ....    S[0]) +
        I[n-3] * (                  S[n-4] *     ....    S[0]) +
        ...
        I[1]   * (                                       S[0]) +
        I[0]

    zUcoordinate should have the same size as mesh shape, but got shape: {}, coordinate: {}r   zindex in dimension [z"] is least than zero. coordinate: z@index beyond extent in dimension [{}]. shape: {}, coordinate: {}r(   r   )r#   formatr5   )
mesh_shaperp   r:   base
linear_idxr    r    r!   rs   P  s,   rs   c                 C   s   |dksJ d| d|t | k sJ d| |d}dgt|  }ttt| D ]}|| }t|| |  ||< || | 9 }q,|S )a	  
    mapping a linear scala into multidimensional mesh space, return it coordinate in that space.

    it is the inverse function of _coordinate2linear_idx.
    assume:

        the size of i-th dimension to be:  S[i]
        the index of j-th dimension is: I[j]

    the coordinate given linear_idx is:

        I[0] = linear_idx                                  % S[0]
        I[0] = (linear_idx / S[0])                         % S[1]
        I[0] = (linear_idx / (S[0] * S[1]))                % S[2]
        ....

    r   zlinear index [z] is least than zerozIlinear index beyond the extent of mesh shape. shape: {}, linear index: {}r   r(   )npprodr~   r#   reversedr5   r0   )r   r   r   rp   r:   offsetr    r    r!   rw     s   rw   c                 C   sb   d }| j D ]}||jv r|j|jkrt|j|j|} nq|d ur,|jt|j| S |jd S Nr   )process_meshesprocess_idsr6   rw   r%   rs   )rW   Ztarget_meshrz   rp   meshr    r    r!   _get_corresponding_rank  s   


r   c                 C   s   | j }|j}|jj }t|t|ksJ d| d| dg }tt|D ]#}|| dks3|| dkr;|||  q%||| |||    q%|S )Nzvariable shape [z] and dim_mapping [z] is NOT match !r(   )r6   r8   r9   r#   r5   r<   )var	dist_attrZ	var_shaper*   r   Z	new_shapert   r    r    r!   _get_unshard_dist_shape  s   r   c                 C   sx   ddl m} |d u r| }|  D ](}|jr9||}t||}|j| |j}dgt	| }||_|
|| qd S )Nr   rU   r(   )rW   rV   	list_varsZis_datar[   r   desc	set_shaper8   r#    set_tensor_dist_attr_for_program)Zdist_main_progZdist_startup_progrW   rV   r   ra   Zinverse_shaperA   r    r    r!   make_data_unshard  s    
r   c                 C   s   dddd}| s
|S t | tstdtt|  d|  D ]'\}}|dvr1tdt| dt |tsBtdtt| d|||< q|S )z(Update default addition_info with inputsr   )epochbatchZ
batch_sizez7The type of 'addition_info' should be 'dict', but got ''.z[The key of 'addition_info' should be one of the ['epoch', 'batch', 'batch_size'], but got 'z7The value of 'addition_info' should be 'int', but got ')r/   dict	TypeErrorrC   typeitems
ValueErrorr0   )addition_infoZadd_infoitemvaluer    r    r!   _update_addition_info  s0   




r   c                 C   sv   | s| S t | tr/| D ]!}t |tstdtt| dtj|s,td| dq| S tdtt|  d)z!Validity check of input file pathz0The type of file path should be 'str', but got 'r   zThe file path 'z' does not exist.z1The type of file path should be 'list', but got ')	r/   r$   rC   r   r   ospathexistsr   )	file_pathfiler    r    r!   _check_valid_path  s&   



r   c                 C   s   | st dt| tstdtt|  d|  D ](\}}t|ts0tdtt| dt|tjj	sCtdtt| dq| S )Nz'param_dict' cannot be None.z4The type of 'param_dict' should be 'dict', but got 'r   z:The type of key of 'param_dict' should be 'str', but got 'zBThe type of value of 'param_dict' should be 'LoDTensor', but got ')
r   r/   r   r   rC   r   r   paddler   	LoDTensor)
param_dictr   r   r    r    r!   _check_param_dict  s.   




r   c                 C   s   | s| S t | tstdtt|  d|  D ]>\}}t |ts.tdtt| dt |ts?tdtt| dg d}t| |krWtdt|  dq| S )	Nz3The type of 'dist_attr' should be 'dict', but got 'r   z@The type of param name of 'dist_attr' should be 'str', but got 'z=The type of distributed attribute should be 'dict', but got ''process_shapeprocess_groupr8   ziThe key of distributed attribute should be '['process_shape', 'process_group', 'dims_mapping']', but got .)	r/   r   r   rC   r   r   r$   keysr   )r   r   r   attrr    r    r!   _check_dist_attr.  s<   






r   Fc                 C   sj   ddl m} t| tjjsJ t|tsJ |du r| }t|}|s1t| || t	| || dS t
d)a1  
    Save model parameter state, optimizer state, distributed attribute and
    additional information of each rank.

    Args:
        program(Program): The program to be saved.
        checkpoint_path(str): The path of the checkpoint file to be saved.
        dist_attr_path(str): The path of distributed attribute file to be saved.
        addition_info(dict, optional): Additional information, key should be selected in ['epoch', 'batch', 'batch_size'].
            Default values are 0, when 'addition_info' is None. Default: None.
        is_integrated(bool, optional): Whether to integrate param before save. Default: False.
        dist_context(DistributedContext ,optional): collect related distributed information for program

    Returns:
        None

    Examples:
        .. code-block:: python

            >>> import os
            >>> from paddle.distributed.auto_parallel.static.utils import save_distributed_checkpoint

            >>> step = 16000
            >>> global_batch_size = 32
            >>> path = os.path.join("./output", "step_%d" % step)
            >>> os.makedirs(path, exist_ok=True)
            >>> program = paddle.static.Program()

            >>> add_info = {'batch': step, "batch_size": global_batch_size}
            >>> save_distributed_checkpoint(program, path, path, add_info)

    r   rU   Nz/Integrating parameter has not been implemented.)rW   rV   r/   r   staticProgramboolr   _save_distributed_state_dict_save_distributed_attributeNotImplementedError)r^   checkpoint_pathdist_attr_pathr   Zis_integratedrW   rV   r    r    r!   save_distributed_checkpointL  s   (r   c                 C   sJ   t | sJ dt |sJ dt| }t|}|d }|d }|||fS )a  
    Load parameter, optimizer, distributed attribute and addition_info.

    Args:
        checkpoint_path(list[str]): model parameter file path, must be in order of rank id.
        dist_attr_path(list[str]): distributed attribute file path, must be in order of rank id.

    Returns:
        param_dict(dict): parameters' value of all ranks.
        dist_attr(dict): parameters' distributed attribute.
        addition_info(dict): additional information user saved in last training.

    Notes:
        The return, 'addition_info', is belonging to the first file of checkpoint_path by default.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('Depends on external files.')
            >>> from paddle.distributed.auto_parallel.static.utils import load_distributed_checkpoint

            >>> ckpt_path = [
            ...     './model_state_rank0.pdmodel',
            ...     './model_state_rank1.pdmodel',
            ... ]
            >>> dist_attr_path = [
            ...     './dist_attr_rank0.pdattr',
            ...     './dist_attr_rank1.pdattr',
            ... ]
            >>> param_dict, dist_attr, add_info = load_distributed_checkpoint(ckpt_path, dist_attr_path)
    !'checkpoint_path' cannot be None. 'dist_attr_path' cannot be None.modelr   )r   _load_distributed_state_dict_load_distributed_attribute)r   r   state_dict_infor   r   r   r    r    r!   load_distributed_checkpoint  s    
r   c                 C   s   ddl m} t|tjjsJ t| sJ dt|sJ d|du r&| }t| }t|}t	||}|d }|d }	t
|||}
t|
| |	S )a  
    Load parameter, optimizer, distributed attribute and addition_info into model.

    Args:
        checkpoint_path(list[str]): model parameter file path, must be in order of rank id.
        dist_attr_path(list[str]): distributed attribute file path, must be in order of rank id.
        program(Program): the program to be updated with checkpoint_path.
        dist_context(DistributedContext ,optional): collect related distributed information for program

    Returns:
        addition_info(dict): user saved in last train.

    Notes:
        The return, 'addition_info', is belonging to the first file of checkpoint_path by default.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('Depends on external files.')
            >>> from paddle.distributed.auto_parallel.static.utils import load_checkpoint_into_program

            >>> exe.run(startup_program)
            >>> ckpt_path = [
            ...     './model_state_rank0.pdmodel',
            ...     './model_state_rank1.pdmodel',
            ... ]
            >>> dist_attr_path = [
            ...     './dist_attr_rank0.pdattr',
            ...     './dist_attr_rank1.pdattr',
            ... ]
            >>> load_checkpoint_into_program(ckpt_path, dist_attr_path, main_program)
    r   rU   r   r   Nr   r   )rW   rV   r/   r   r   r   r   r   r   get_dist_attrmerge_and_slice_parameterload_parameter_into_program)r   r   r^   rW   rV   all_state_dict_infoZall_pre_dist_attrZall_cur_dist_attrZall_param_dictr   Zsliced_param_dictr    r    r!   load_checkpoint_into_program  s(   #

r   c                 C   s:   t | tsJ |rt |tjjsJ | sdS ||  dS )z
    Load parameters into program.

    Args:
        param_dict(dict): parameters' name and value.
        program(Program): the program to be updated
    N)r/   r   r   r   r   Zset_state_dict)r   r^   r    r    r!   r     s
   r   c                 C   sX   t j }tj|d| d}t| |t j d}t || t	
d| d dS )z,Save distributed attribute of all parametersZdist_attr_rankz.pdattr)r   
world_sizez(Already saved distributed attribute to 'r   N)r   distributedget_rankr   r   joinr   get_world_sizesaver   info)r^   r   rW   rank_idZdist_attr_nameZdist_attr_dictr    r    r!   r     s   
r   c                 C   s^   i }| D ](}t |}|d }|t| ksJ d|d  D ]\}}||vr+|||< qq|S )z:Load parameters' distributed attribute from dist_attr_pathr   zMThe number of 'dist_attr_path' must be equal to the last training world size.r   )r   loadr#   r   )r   Ztotal_dist_attrZdist_attr_filer   pre_world_sizer   r   r    r    r!   r   	  s   
r   c                 C   sX   t j }tj|d| d}|  t j |d}t || t	
d| d dS )zSave parameters' state_dictZmodel_state_rankz.pdmodel)r   r   r   zAlready saved model to 'r   N)r   r   r   r   r   r   
state_dictr   r   r   r   )r^   r   r   rz   Zckpt_file_namer   r    r    r!   r     s   
r   c           
      C   s   i }t | D ]C\}}tj|dd}|d }|t| ksJ d|dkr'|d }|d  D ]\}}||v r@|| t| q-t|g||< q-q||d}	|	S )	z0Load parameters' state_dict from checkpoint_pathT)Zreturn_numpyr   zNThe number of 'checkpoint_path' must be equal to the last training world size.r   r   r   )r   r   )	enumerater   r   r#   r   r<   r   array)
r   Zall_state_dictrt   Z	ckpt_filer   r   r   r   r   r   r    r    r!   r   (  s(   r   c                 C   s~   ddl m} t| tjjsJ |du r| }i }|  D ] }t|s&t|r<|	|}|j
}|j}|j|j|d||j< q|S )zs
    Get distributed attribute of current rank.

    Args:
        program(Program): main program for training
    r   rU   Nr   )rW   rV   r/   r   r   r   r   r   r   r[   r9   r8   r6   r   r   )r^   rW   rV   r   r   ra   r9   r8   r    r    r!   r   @  s$   r   c                 C   s  t |sJ dt| tsJ dtt| |  D ]'\}}t|ts1tdtt| dt|tr?t	dd |D sCtdq|du rJi S g }g }t
d	 | D ]n}||vrc|| qW|| }|| }	||	krtj }
|	d
 |
}| | | }|| |< qW| | }|d }|	d }tt|dksd|vrt||}|| |< n|d }|| |< tt|dksd|vrt||	}|| |< qW|D ]}||vr|| | | q|rtdt| |rtdt| | S )a  
    Merge parameters with previous dist_attr and slice parameters with current dist_attr

    Arags:
        dist_param_dict(dict): parameters' value of all ranks.
        pre_dist_attr(dict): parameters' dist_attr of last training process.
        cur_dist_attr(dict): parameters' dist_attr of current training process.

    Returns:
        dist_param_dict(dict): parameters' value of current rank.
    z'pre_dist_attr' cannot be None.z;The type of 'dist_param_dict' should be 'dict', but got {}.zXThe key of 'dist_param_dict' is parameter's name, and its type should be 'str', but got r   c                 s   s    | ]	}t |tjV  qd S r.   )r/   r   ndarray)r1   vr    r    r!   r3   t  s    
z,merge_and_slice_parameter.<locals>.<genexpr>zoThe value of 'dist_param_dict' is parameter's value of all ranks, and its type should be 'list(numpy.ndarray)'.Nz$Start to merge and slice parameters.r   r8   r   r(   r   z7Parameters '{}' are not found in last training process.z:Parameters '{}' are not found in current training process.)r   r/   r   r~   rC   r   r   r   r$   r4   r   r   r   r<   r   r   r   r%   r#   set_merge_parameter_with_dist_attr_slice_parameter_with_dist_attrpopwarningswarn)Zdist_param_dictZpre_dist_attrZcur_dist_attrr   r   Zparam_not_in_preZparam_not_in_curvar_nameZpre_attrZcur_attrr   r%   paramZ	pre_paramZpre_dims_mappingZcur_dims_mappingcomplete_paramsliced_paramr    r    r!   r   \  s   








r   c                 C   s   ddl m} |d }|d }|d }|| d j||}g }g }|D ]"}	||	||||}
||	}|
|vrD||
 t|| | |
| q"t|dksQ|rQJ d|d d }|S )z*Merge parameter with distributed attributer   	Resharderr8   r   r   r   zFail to merge parameter)	reshardr   Zcompute_complete_shaper6   compute_partition_indexr%   r<   _merge_parameterr#   )Z
param_listr   r   r8   r   r   complete_shapepartition_param_listZmerged_partitonprocesspartition_indexr%   r   r    r    r!   r     s8   


r   c           
      C   s|   t | tjjrt| n| } |d }|d }|d }t| j|||}t| |t	|}tj
 }t|| j|||}|| }	|	S )z*Slice parameter with distributed attributer8   r   r   )r/   r   r   r   r   r   _get_split_indicesr6   _slice_parameterr#   r   r   _get_sliced_param_index)
r   r   r8   r   r   partition_index_listsliced_param_listr   sliced_param_indexr   r    r    r!   r     s"   


r   c                 C   s   ddl m} t| dkr1d}t| d d D ]\}}|d dks(|d || kr,d} nq|r1dS | s<| ||f dS d}|t| k r|| | d |\}	}
}|	dkr|
dkrgtj| | d |f|	d}ntj|| | d f|	d}| | t	| ||| dS |d7 }|t| k sDdS dS )	a  
    Merge partitial parameters to a complete one.

    Returns:
        None

    Examples:
        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.distributed.auto_parallel.static.utils import _merge_parameter

            >>> partition_param_list = [(np.array([[[1.11, 1.12]]]), [[0, 1],[0, 1],[0, 2]])]
            >>> param = np.array([[[1.13, 1.14]]])
            >>> partition_index = [[0, 1],[0, 1],[2, 4]]
            >>> complete_shape = [2, 2, 4]

            >>> _merge_parameter(partition_param_list, param, partition_index, complete_shape)
            >>> print(partition_param_list)
            [(array([[[1.11, 1.12, 1.13, 1.14]]]), [[0, 1],[0, 1],[0, 4]])]

    r   r   Tr   FNr(   ry   )
r   r   r#   r   r<   Zcompute_concat_infor   Zconcatenater   r   )r   r   r   r   r   Zis_complete_datart   r   r:   Zconcat_axisZfirst_orderZnew_partition	new_paramr    r    r!   r     sN   
r   c                 C   sV   g }t | j| }tj| || |d}|dkr|S |D ]}|t|||d  q|S )a  
    Slice a complete parameter.

    Returns:
        sliced_param_list(list): sliced parameters with 'partition_index_list'

    Examples:
        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.distributed.auto_parallel.static.utils import _slice_parameter

            >>> complete_param = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
            >>> rank = 2
            >>> complete_shape = [1, 1, 6]
            >>> dims_mapping = [-1, -1, 0]
            >>> process_shape = [3]
            >>> process_group = [0, 1, 2]

            >>> sliced_param_list = _slice_parameter(complete_param, [[], [], [2, 4]], 3)
            >>> print(sliced_param_list)
            [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]

    r   r   )r#   r6   r   splitextendr   )r   r   rJ   r   ry   r   r   r    r    r!   r   8  s   
r   c                 C   s   ddl m} || ||||}d}t|D ]2\}}	|| dkr"|	}
n|	|||   }
|
dkr5|| d }n
|| d d |
 }||	|
  | }q|S )a  
    Get sliced_param's index of current rank in all sliced parameters list.

    Returns:
        sliced_param_index(int): the index of sliced param in sliced_param_list

    Examples:
        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.distributed.auto_parallel.static.utils import _get_sliced_param_index

            >>> complete_param = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
            >>> rank = 2
            >>> complete_shape = [1, 1, 6]
            >>> dims_mapping = [-1, -1, 0]
            >>> process_shape = [3]
            >>> process_group = [0, 1, 2]

            >>> slice_param = _slice_parameter(complete_param, [[], [], [2, 4]], 3)
            >>> print(slice_param)
            [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]

            >>> index = _get_sliced_param_index(rank, complete_shape, dims_mapping,
            ...                                 process_shape, process_group)
            >>> print(index)
            2
    r   r   r   r(   )r   r   r   r   )rz   r   r8   r   r   r   r   r   r:   r6   Zslice_shaper%   r    r    r!   r   _  s   
r   c           	      C   s   ddl m} g }|D ]"}||| |||}|r*tt|D ]}|| ||  qq
|}q
ttdd || }dd |D }|S )a  
    Get split indices of every dimension.

    Returns:
        split_indices_list(list): the split indices of every dimension of the parameter

    Examples:
        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.distributed.auto_parallel.static.utils import _get_split_indices

            >>> complete_param = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]])
            >>> complete_shape = [1, 1, 6]
            >>> dims_mapping = [-1, -1, 0]
            >>> process_shape = [3]
            >>> process_group = [0, 1, 2]

            >>> index = _get_split_indices(complete_shape, dims_mapping, process_shape, process_group)
            >>> print(index)
            [[], [], [2, 4]]
    r   r   c                 S   s   t t| |h dh S r   )r$   r   xyr    r    r!   <lambda>  rr   z$_get_split_indices.<locals>.<lambda>c                 S   s   g | ]}t |qS r    )rx   r1   r   r    r    r!   rq     rv   z&_get_split_indices.<locals>.<listcomp>)r   r   r   r5   r#   r   r$   map)	r   r8   r   r   r   Zsplit_indices_listr   r   dimr    r    r!   r     s(   
r   c                 C   s4   t | d}t| jv o|t tjkp|t tjkS )Nop_role)r0   r   OP_ROLE_KEY
attr_namesOpRoleForwardLossrb   r   r    r    r!   is_forward_op  s   
r   c                 C   $   t | jv ot|  t  ttj@ S r.   )r   r   r0   	all_attrsr   Backwardrb   r    r    r!   is_backward_op  
   
r  c                 C   r  r.   )r   r   r0   r  r   Optimizer  r    r    r!   is_optimize_op  r  r  c                 C   s&   t | jv ot|  t  ttjj@ S r.   )r   r   r0   r  r   r  ZLRSchedr  r    r    r!   is_lr_sched_op  s
   

r	  c                 C   s.   t | jv ot|  t  ttjttjB kS r.   )r   r   r0   r  r   r   r   r  r    r    r!   
is_loss_op  s
   
r
  c                 C   s:   t | jvrdS t|  t  }|ttj@ o|ttj@ S )NF)r   r   r0   r  r   r  r   r   r    r    r!   is_loss_grad_op  s   
r  c                 C   s   | j do| j ddS )Nop_namescopez/gradient_clip)r   has_attrr   
startswithr  r    r    r!   is_gradient_clip_op  s
   r  c                 C   s   | j dod| j dv S )Nr  z/auto_parallel/reshard)r   r  r   r  r    r    r!   is_reshard_op  s
   r  c                 C   s   | j dS )N_p)r   endswithr  r    r    r!   
is_prim_op  s   r  c                 C   s
   |  dS )Nring_id)r  r  r    r    r!   
is_comm_op  s   
r  c                 C   sX   g }| j D ]}t|rt|j dksJ d|| qt|dks(J d|d S )Nr   z#loss op should only output loss varz"num of loss op is not equal to oner   )r\   r
  r#   r   output_arg_namesr<   )r_   Zloss_opsrb   r    r    r!   get_loss_op  s   

r  c                 K   s   t  }||_t|ttjfrt||_nt|tjr||_n
t	d
|t|d|v r:|d r:|d |d d|v rG|d rG|d |_| || |S )Nz<{} must be a instance of ProcessMesh or list, but receive {}mark_annotatedr8   r9   chunk_id)r   r8   r/   r$   r   r   r
   r9   r   r   r~   r   r  r  r   )rW   r   r8   r9   kwargsra   r    r    r!   set_var_dist_attr  s$   


r  c                 K   s   |d usJ |d usJ t  }| j D ]}||| q| j D ]}||| q"||_d|v r;|d r;|d |_|| | d S )Nr  )	r   r   input_arg_namesset_input_dims_mappingr  set_output_dims_mappingr9   r  set_op_dist_attr_for_program)new_opr9   Zref_mappingctxr  new_op_dist_attrinput_varnameoutput_varnamer    r    r!   6naive_set_dist_op_attr_for_program_by_mesh_and_mapping  s   
r%  c           	      K   s   |d usJ t  }| j D ]}| j|}||j}||| q| j D ]}| j|}||j}|	|| q(||_
d|v rI|d |_d|v rR|d |_|| | d S )Nis_recomputer  )r   r   r  r_   r   r[   r8   r  r  r  r9   r&  r  r  )	r   r9   r!  r  r"  r#  r   r*   r$  r    r    r!   *naive_set_dist_op_attr_for_program_by_mesh2  s    

r'  c                 C   s  d}| j }| jj}| dks| dkrdS | }g }d|v r&|d}g }| D ]?}| |}|jr7q,|	|}	t
|	dkr^t|	dd  D ]\}
}|dks]J d| |
|qJt
|	dkrk||	d  q,| D ]}}| |}|jr{qp||}	||vrt
|	dkrt|	dd  D ]\}
}|dksJ d| |
|qt
|	dkr||	d  qp|	d dksJ d	| |t
|	d
krt|	d
d  D ]\}
}|dksJ d| |
|q||	d  qpt|}|d usJ d| D ]%}| |}|jr
q|	|}	t
|	dkr#||	d kr#||	d< d}q| D ];}| |}|jr5q(||}	||vrUt
|	dkrS||	d krS||	d< d}q(||	d krb||	d< d}q(|S )NFr6   sliceZXShaper   r(   z_{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part.r   zh{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension 0 is sharded by {} part.r   zi{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension {} is sharded by {} part.#There is no compatible dim mapping.T)r   	serial_opr   r   output_namesoutputr  Zget_serial_inputr   get_input_dims_mappingr#   r   r~   r<   r  Zget_serial_outputget_output_dims_mappingrG   )rc   rN   rd   op_descr+  Zxshape_arg_namesZbatch_dim_mappingsarg_nameserial_tensorr8   rt   r*   rK   r    r    r!   +update_op_dims_mapping_by_default_dist_implJ  s   
















r2  c                 C   s  d}| j }| jj}| }i }i }d}|D ]}||}	|t|	k r&t|	}|	||< t|	||< qg }
|D ]7}|| |k redd t|D }t|| D ]}|||  | }|| | ||< qL|
| q5|
||  q5| }|D ]}|	|}	t|	|ksJ |
|	 qst
|
}|d usJ d|D ]G}|| |k rdd t|| D }t|| D ]}|||  | }|| ||< q||| kr||| d}q||| kr||| d}q|D ]}|	|}	||	kr||| d}q|S )NFr(   c                 S      g | ]}d qS r(   r    r1   _r    r    r!   rq     s    zHupdate_op_dims_mapping_by_elementwise_like_dist_impl.<locals>.<listcomp>r)  c                 S   r3  r4  r    r5  r    r    r!   rq         T)r   r*  r   r  r-  r#   r5   r<   r  r.  rL   r  r  )rc   rN   rd   r/  r  Zinput_dims_mapping_dictZinput_dims_mapping_lensZmax_dims_mapping_lenr0  r8   rI   Znew_dims_mappingr:   Znew_idxr  Zcompatible_dims_mappingr    r    r!   4update_op_dims_mapping_by_elementwise_like_dist_impl  s|   






r8  c                 C   s   ddl m} | j}t|}g }|du rtj nt|	d}t
|D ]}t|}	| |	_|||	\}
}
}}}
|| q$|S )z2Get all distributed main programs by dist_context.r   )DistributedOperatorContextNZGPU)rW   r9  clustercopydeepcopyr   r   r   r#   Zget_all_devicesr5   Z_dist_op_contextZ_get_dist_programr<   )Zserial_program_inforW   Zparallelizerr9  r:  Zcopied_parallelizerZall_dist_main_programranksr   Zused_dist_contextr6  Zdist_startup_programZdist_main_programr    r    r!    get_all_distributed_main_program  s(   



r>  c                   @   sT   e Zd Z	dddZedd Zedd Zedd	 Zed
d Zedd Z	dS )SerialProgramInfoNc                 C   s"   || _ || _|| _|| _|| _d S r.   )_train_program_startup_program_loss
_optimizer_cluster)selftrain_programZsatrtup_programloss	optimizerr:  r    r    r!   __init__  s
   
zSerialProgramInfo.__init__c                 C      | j S r.   )r@  rE  r    r    r!   rF       zSerialProgramInfo.train_programc                 C   rJ  r.   )rA  rK  r    r    r!   startup_program  rL  z!SerialProgramInfo.startup_programc                 C   rJ  r.   )rB  rK  r    r    r!   rG    rL  zSerialProgramInfo.lossc                 C   rJ  r.   )rC  rK  r    r    r!   rH    rL  zSerialProgramInfo.optimizerc                 C   rJ  r.   )rD  rK  r    r    r!   r:    rL  zSerialProgramInfo.clusterr.   )
__name__
__module____qualname__rI  propertyrF  rM  rG  rH  r:  r    r    r    r!   r?    s    
	



r?  c                 C   s  dd }dd l m} | }|  d}ddddd	d
dd}g }g d}| D ]}i }	| j}
| jD ]}d}|j|v rE||	|j	 < q4|j
rRt|
|j
d  jnd}t|dttjkrd|jv r|jd d }|| v rv|| }|j|d|d}|r||||
}n:|j||d}|rd||||
 }n(t|dttjkr|j| v r||j n|j}||}|r||||
}||	|j	 < q4||	 q&|S )Nc                 S   st  d}zt | d }W n   | Y S | d }d}d}|d}d}|D ]}	d|	v r+dnd}||	v r|	d |	|d  }
|	d}|	d	}|dkrR|dkrR||ksVJ d
|	|d | d}dd |D }d}|tdd |d7 }|jdkr|
dkrdnd}
|jD ] }| |
kr||D ]}|| }|tdd |j7 }q nqq#|dkr|dksJ d|| | }|S )Nr   Zop_timeconfig
z
(Variable)z(list<Variable>r   []zGet shape failed.,c                 S   s   g | ]}t | qS r    )r0   rS   r   r    r    r!   rq   ?  rr   zFget_standalone_cost_data.<locals>._compute_runtime.<locals>.<listcomp>c                 S      | | S r.   r    r   r    r    r!   r   A      zDget_standalone_cost_data.<locals>._compute_runtime.<locals>.<lambda>c_embeddingweightwZidsc                 S   rW  r.   r    r   r    r    r!   r   K  rX  zGet input size failed.)	floatr   findr   r   input_nameslowerinputr6   )op_costrb   rY   runtimeZ	op_configZtotal_static_input_sizeZtotal_actual_input_sizeZparsed_infovariabler   Zarg_name_lowerZshape_left_boundaryZshape_right_boundaryr6   Zdtype_factorr0  r   r   Zactual_runtimer    r    r!   _compute_runtime$  sb   








z2get_standalone_cost_data.<locals>._compute_runtimer   r   Z	embeddingmatmulZ	transposeZreshapeZ	unsqueezer   divide)rY  Z	matmul_v2Z
transpose2Zreshape2Z
unsqueeze2r   r   )Zcreate_py_readerZcreate_double_buffer_readerreadZassignZfloat32r   Z_gradF)forwarddtype)rj  )Zpaddle.cost_model
cost_modelZ	CostModelZstatic_cost_dataglobal_blockrY   r\   r   r   idr  rC   rj  r0   r   r   r  r   Zget_static_op_timer   r<   )Zdistributed_programsrd  cmrk  ZDEFAULT_MULTIPLEZOP_NAME_MAPPINGZstandalone_cost_dataZnot_enum_opsZdistributed_programZ	cost_datarY   rb   rb  rj  Zforward_op_namera  Zop_namer    r    r!   get_standalone_cost_data#  sl   3





ro  c                 C   sH   |  }| }||jv r| | d S ||jv r | | d S td)Nz6Cannot find the original id in the distributed context)rm  Zoriginal_id_dist_ops_for_programZset_original_idAssertionError)Zdist_op_descr/  rW   Zop_idZop_original_idr    r    r!   set_dist_op_desc_original_id  s   



rr  c                 C   s(   | d u r| S t | ttfrt| S | gS r.   )r/   r$   tuple)r   r    r    r!   to_list  s
   rt  c                 C   s^   t j||d dtj   }t|d}|t|  W d    d S 1 s(w   Y  d S )NZ_programz.%dr[  )	r   r   r   r   r   r   openwriterC   )r^   r   r   filenamefr    r    r!   debug_program  s   "ry  c                 C   s.   ddl m} | D ]}|j| kr|  S q	d S )Nr   )get_all_process_groups)r   rz  rm  )r  rz  gr    r    r!   ring_id_to_process_group  s   

r|  c                 C   sB   ddg}| j D ]}|jD ]}|D ]}||jv r   dS qqqdS )NZ
_grad_gradZtriple_gradTF)rX   r\   r   )r^   Zhigher_order_op_suffixr_   rb   suffixr    r    r!   find_higher_order_backward_op  s   



r~  c                 C   s.   t | tsJ d| jvsJ tdd | jdS )zU
    input:
        - var: variable
    return:
        number of elemnet in var
    r(   c                 S   rW  r.   r    r   r    r    r!   r     rX  zget_var_numel.<locals>.<lambda>r   )r/   r   r6   r   r   r    r    r!   get_var_numel  s   r  c                 C   sR   t | tjjr|  S t | tjjrt | jtr| jS |  S tdt	|  d)Nzg'optimizer' must be object of class `paddle.optimizer.Optimizer` or `paddle.static.Optimizer`, but got r   )
r/   r   rH  Z	Optimizerget_lrr   Z_learning_rater\  r   r   rH  r    r    r!   r    s   r  c                 C   s  dd l }ddlm} g }d}| }|jd\}}t|| }	d }
d}| |j|j}
|
||	f |
	d i }| D ]}||j
vrCq;t|j
dkr|j
|}|dkrVd	nd
}|r|j
d }|j| d\}}t|| }| |j|j}|||f |t|d ||d}t|}||krtd||td|j
 |  n?|j
d }	 ||vr|
 \}}t|| }|||< || n|| t|d ||   td|j
 nq|  q;|
  d S )Nr      )_get_global_envi  :i   
   r   TFr   zutf-8z>Please check comm pair, the recv rank should be {} but got {}.z+It is able to instantiate {} as sender now.z+It is able to instantiate {} as recver now.)socketZ
collectiver  Zcurrent_endpointr   r0   AF_INETSOCK_STREAMbindlistenr=  r#   r%   Ztrainer_endpointsconnectsendrC   encoderecvdecoder   r~   rk   closeacceptr<   Zinstantiate)Zall_process_groupscur_rankr  r  Zhas_recv_by_socketZ	magic_numZgenvZcur_rank_ipZcur_rank_portZcur_rank_recv_portZserver_socketZ	buff_sizeZclient_socketsr   r%   Zis_sendZ	recv_rankZrecv_rank_ipZrecv_rank_portZconnect_portZclient_socketrz   Z	send_rankZ	recv_addrr    r    r!   initialize_pg_in_full_mode  s   





r  c                 C   s&   |  dod| dv od| dvS )Nr  z/auto_parallel/rc
exclude_rcr  r   r  r    r    r!   is_recompute_op7  s
   
r  c                 C   s   |  dod| dv S )Nr  r  r  r  r    r    r!   is_recompute_exclude_op?  s   r  c                 C   s  ddl m} |s
d S |j}|jsd S g }t| tjjr?t| dr;| j	j
dv r;t| jdr;| jj}t|dkr:|  n|j}n|j}|sFd S | }|||j}|  ||}	g }
d}d}|d t|	k r|dkr|	|d  }||jvr{|d7 }q_|j| d	 }|rt|d
kr|
d
t|d g n1||	| g|	|d  g\}}}|r|||}|
||d g ntd| d|d  d |d7 }|d t|	k sgt|
D ]\}}t|d
 |d D ]}|j| ddt|  qqd S )Nr  )RecomputeStategpt)ZGPTForPretrainingZGPTForPretrainingAutocheckpointsr   r(   r   Zvar_as_output_opsr   zCould not recompute op range [z] - [z] r  z/auto_parallel/rc_)Zpasses.auto_parallel_recomputer  	recomputeenabler/   r   nnZLayerhasattr	__class__rN  r  r  r#   r   rl  r\   Zbuild_statsZsort_checkpointsZvar_op_depsmaxr<   Zis_subgraphZ_update_segment_startr   debugr   r5   	_set_attrrC   )r   ZlossesZstrategyr^   r  r  Zckptsr_   Zrc_stater  segmentsZ	start_idxZpre_segment_end_idxZ	ckpt_nameZop_idx_listflagZmin_idxZmax_idxr:   segmentjr    r    r!   set_recompute_segmentsE  sx   




r  c           	      C   sx   | |}|j}|j}| |jvrt||| }n| }|d }|dkr:|j| dkr:t|j|j||}t|||fS dS )Nr   r(   r   )r   r   )	r[   r9   r8   r   r   r6   r|   r#   r%   )	r  r   rW   ra   r9   r8   r   Zbatch_size_axisZgroup_ranksr    r    r!   get_input_split_info  s    

r  c                 C   s6   | d urd | _ d | _| jrt| jtjjrd| j_| S )NT)Z_parameter_listZ_param_groupsZ
_grad_clipr/   r   r  ZClipGradByGlobalNormZ_async_add_nr  r    r    r!   validate_opt  s   
r  c                 C   sn   ddl m}m} ddlm} | j}||dg}t|dkr dnd gdd tt| jd D  }|| ||S )Nr   )r
   shard_tensorr   )get_world_process_groupZdpc                 S   r3  r.   r    r5  r    r    r!   rq     r7  z%set_data_parallel.<locals>.<listcomp>)	Z	interfacer
   r  r   r  r=  r#   r5   r6   )r   r
   r  r  Zworld_ranksr9   r>   r    r    r!   set_data_parallel  s   r  c                 C   s>   | j sdS dd | j jD }t|tt@ s| j rdS dS )NFc                 S   s   g | ]}|j qS r    r   )r1   rb   r    r    r!   rq     s    z*is_naive_data_parallel.<locals>.<listcomp>T)Zdata_parallelZ_original_serial_main_programrl  r\   r   __not_naive_data_parallel_op__)rW   Zops_typer    r    r!   is_naive_data_parallel  s   
r  c                 C   sJ   |j }|d urt|j|jdd tt|jD | _ |j| _|j| _d S )Nc                 S      g | ]}d t | qS r2   rC   ro   r    r    r!   rq     rr   z1_copy_tensor_dist_attr_to_cpp.<locals>.<listcomp>)	r9   r   r
   r6   r   r5   r#   r8   	annotated)cpp_dist_attrpy_dist_attrpy_process_meshr    r    r!   _copy_tensor_dist_attr_to_cpp  s   r  c                 C   s@   ddl m} | j }|d ur||j|jd|_ | j|_| j|_d S Nr   r	   )r6   r   )r9   r
   r6   r   r8   r  )r  r  r
   cpp_process_meshr    r    r!   _copy_tensor_dist_attr_from_cpp  s   r  c                 C   s   |j }|d urt|j|jdd tt|jD | _ |j| _|j| _|j	| _	|j
| _
|j D ]\}}| |}t|| q0|j D ]\}}| |}t|| qDd S )Nc                 S   r  r  r  ro   r    r    r!   rq     rr   z-_copy_op_dist_attr_to_cpp.<locals>.<listcomp>)r9   r   r
   r6   r   r5   r#   	impl_typeimpl_idxr&  r  inputs_dist_attrsr   get_input_dist_attrr  outputs_dist_attrsget_output_dist_attr)r  r  r  r   py_tensor_dist_attrcpp_tensor_dist_attrr    r    r!   _copy_op_dist_attr_to_cpp  s$   

r  c                 C   s   ddl m} | j }|d ur||j|jd|_ | j|_| j|_| j|_| j|_| j	 D ]\}}|
|}t|| q+| j	 D ]\}}||}t|| q?d S r  )r9   r
   r6   r   r  r  r&  r  r  r   r  r  r  r  )r  r  r
   r  r   r  r  r    r    r!   _copy_op_dist_attr_from_cpp  s,   

r  c                 C   D   | j  D ]
}t|jj|j q| j D ]
}t|jj|j qd S r.   )_dist_tensors_for_programrZ   r  r1  r   rp  r  r*  rW   r`   rc   r    r    r!   _copy_dist_attr_to_cpp     

r  c                 C   r  r.   )r  rZ   r  r1  r   rp  r  r*  r  r    r    r!   _copy_dist_attr_from_cpp"  r  r  c                 C   t   | j D ]4}| r| d ur| |}| j}t|| | r7| d ur7| |}| j}t	|| qd S r.   )
serial_ordered_nodesis_varr   get_tensor_dist_attr_for_graphr   r  is_oprb   get_op_dist_attr_for_graphr  rW   noder  r  r    r    r!    _copy_dist_attr_to_cpp_for_graph.     






r  c                 C   r  r.   )
r  r  r   r  r   r  r  rb   r  r  r  r    r    r!   "_copy_dist_attr_from_cpp_for_graph:  r  r  c                    s   t  rdS t|jdksJ dt| dt|jdks'J dt| d||j}||j}	||	ksCJ dt|t|	dd }
|
 fd	d
|jD }|
 fdd
|jD }t ||||t	j
||||ddS )z@
    dependency: prior_op should be run before posterior_op
    Nr   z9first op of dependency should at least have one output. [rU  z9second op of dependency should at least have one input. [zAtwo ops of dependency should have same mesh but got [{}] and [{}]c                 S   sH   dd | D } t | dksJ dd | D }|jdd d |d d S )	Nc                 S   s   g | ]}|j s|qS r    )r   r1   r   r    r    r!   rq   k      zTinsert_dependencies_for_two_ops.<locals>._select_best_depend_var.<locals>.<listcomp>r   c                 S   s   g | ]}|t |fqS r    )r  r  r    r    r!   rq   m  rr   c                 S   s   | d S )Nr   r    )r   r    r    r!   r   n  rX  zRinsert_dependencies_for_two_ops.<locals>._select_best_depend_var.<locals>.<lambda>)keyr(   )r#   sort)rY   Zvars_with_numelsr    r    r!   _select_best_depend_vari  s
   z@insert_dependencies_for_two_ops.<locals>._select_best_depend_varc                       g | ]}  |qS r    r  r1   r   r_   r    r!   rq   s  r  z3insert_dependencies_for_two_ops.<locals>.<listcomp>c                    r  r    r  r  r  r    r!   rq   v  r  F)r9   r&  syncr  use_nop)is_sequential_runr#   r  rC   r  r]   r9   r~   insert_dependencies_for_varsr   r  )r_   rt   Zprior_opZposterior_oprW   r&  r  r  Zprior_op_meshZposterior_meshr  Z	first_varZ
second_varr    r  r!   insert_dependencies_for_two_opsF  sV   
	r  c                 C   s  t  rdS t|tr|g}t|tr|g}|D ]
}| |js!J q|D ]
}| |js.J q$|du r;||d j}|dusAJ d}
|
rS| j|dd|id|id}n| j|d||d	d|id}|t	| |sn|d
gkrt
 }d|_d|_||_||_|j D ]}| |}||j}||| q|j D ]}| |}||j}||| q||| |	dur|dd|	  |r|   |S )zc
    dependency: op that generates prior_vars should be run before op that generates post_vars
    Nr   TZnopXZOut)r   ZinputsZoutputsZdepend)r  ZDepr(   defaultr  /)r  r/   r   Zhas_varr   r[   r9   Z_insert_op_without_syncr  r   r   r  r  r&  r   r  r   r8   r  r  r  r  Z_sync_with_cpp)r_   rt   Z
prior_varsZ	post_varsrW   Zoproler9   r&  r  r  r  Z	prior_varZpost_varZ	depend_opZdepend_op_dist_attrr#  r   r*   r$  r    r    r!   r    s   

		

r  c                 C   s   d| j v rdS dS )NZc_TFr  r  r    r    r!   is_dep_skip_op  s   
r  c                    s    fdd}|S )Nc                     sV   t j r | i |S t jj   | i |W  d    S 1 s$w   Y  d S r.   )r   Z	frameworkZin_dynamic_moder   Zdygraphguard)argsr  funcr    r!   __impl__  s
   
$z!_dygraph_guard_.<locals>.__impl__r    )r  r  r    r  r!   _dygraph_guard_  s   r  c                  C   s   t jdd } | dv S )NZ!FLAGS_new_executor_micro_batching)Nr   1TTruetrue)r   environget)Znew_executor_micro_batchingr    r    r!   use_new_executor  s   r  c                   C   s   t tdd S )NZ!FLAGS_new_executor_sequential_run)r   r   Z	get_flagsr    r    r    r!   r  	  s
   r  c                 C   s0   d }t | jD ]\}}||jv r|} |S q|S r.   )r   r   r   )rW   rz   Zpp_idxrt   r9   r    r    r!   get_pp_stage	  s   
r  r^  r+  r   c                 C   s   g }g }i }| j }|D ]}| j|}	|j|}
|
j}t||	}|| q|D ]}| j|}	|j|}
|
j}t||	}|| q)|D ]
}|j	
|||< qG|||fS )a  
    Get data used in inferring distributed attributes, including:
      1. DistTensorSpec for each input and output tensor of this dist_op.
      2. Operator attributes of this dist_op, e.g. transpose_x in matmul op.

    Args:
      dist_op: the DistributedOperator
      input_names: list, name of the dist_op's input tensors
      output_names: list, name of the dist_op's output tensors
      attr_names: list, attribute name of the dist_op's corresponding serial op

    Returns:
      input_specs: list, DistTensorSpec for each input tensor of the dist_op
      output_specs: list, DistTensorSpec for each output tensor of the dist_op
      attrs: dict, attribute map of the dist op

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('Depends on other ops.')
            >>> from paddle.distributed.auto_parallel.static.utils import wrap_data_for_completion

            >>> op_desc = dist_op.serial_op.desc
            >>> input_name_list = []
            >>> output_name_list = []

            >>> input_name_list.append(op_desc.input('X')[0]) # 'X' is the arg name for op
            >>> input_name_list.append(op_desc.input('Y')[0])
            >>> output_name_list.append(op_desc.output('Out')[0])

            >>> attr_name_list = ['trans_x', 'trans_y']
            >>> input_specs, output_specs, attrs = wrap_data_for_completion(
            ...        dist_op,
            ...        input_name_list,
            ...        output_name_list,
            ...        attr_name_list)

    )r*  r   r  r_   _var_recursiver6   r   r<   r  r   r   )rc   r^  r+  r   Zinput_specsZoutput_specsattrsr*  r   ra   r   rD   Z	dist_spec	attr_namer    r    r!   wrap_data_for_completion	  s&   *


r  Tc                 C   s8   | j j|j}|r| j|}n| j|}t||S r.   )r*  r_   r  r6   r   r  r  r   )rc   r   Zis_inputrD   ra   r    r    r!   get_dist_tensor_spec[	  s
   
r  )r   r.   )NFN)FFN)NFFNF)T)r;  r   r   rh   r   	functoolsr   numpyr   r   Zpaddle.base.wrapped_decoratorr   Zpaddle.frameworkr   Zpaddle.framework.io_utilsr   r   Zpaddle.staticr   r9   r
   Zdist_attributer   r   r   Zop_proto_and_checker_makerr   ZkOpRoleAttrNamer   ZVarDescZVarTypeZREADERZSTEP_SCOPESZLOD_TENSOR_ARRAYZFEED_MINIBATCHZ
FETCH_LISTZ__no_shape_var_type__r  Z_g_gradient_clip_opsr"   r&   r+   r-   r;   r@   rB   rE   rG   rL   rM   rO   rR   rT   re   rn   r|   r}   rs   rw   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r	  r
  r  r  r  r  r  r  r  r%  r'  r2  r8  r>  r?  ro  rr  rt  ry  r|  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  Zdygraph_guardr  r  r  r$   r  r  r    r    r    r!   <module>   s  




#7&
"
:-
9
\#E'20XEy	JK
I
Y	
F