o
    "j                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZ d dlmZ d dlm  m  mZ d dlmZ edZde_G dd dZG d	d
 d
ZG dd dZG dd dZG dd dZG dd dZdJddZ dd Z!dd Z"dd Z#dd Z$dd Z%d d! Z&dKd"d#Z'G d$d% d%Z(da)d&d' Z*	dLd(d)Z+d*d+ Z,d,d- Z-d.d/ Z.d0d1 Z/d2d3 Z0d4d5 Z1d6d7 Z2dMd8d9Z3d:d; Z4d<d= Z5d>d? Z6d@dA Z7G dBdC dCZ8dDdE Z9dFdG Z:dHdI Z;dS )N    N)closing)	strtobool)	frameworkrootFc                   @   s   e Zd ZdZdZdZdZdS )DistributeModez\
    There are various mode for fleetrun, each of them is designed for different model.
    r         N)__name__
__module____qualname____doc__Z
COLLECTIVEZPSPS_HETER r   r   f/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/fleet/launch_utils.pyr   &   s
    r   c                   @   s(   e Zd ZdZdZdZdZdZdZdZdS )
DeviceModez
    Training devices type
    r   r   r      N)	r	   r
   r   r   UNKNOWNCPUGPUZKUNLUNXPUr   r   r   r   r   0   s    r   c                   @   sd   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd ZdS )Clusterc                 C   s   d | _ g | _d | _d | _d S N)
job_serverpodshdfsjob_stage_flag)selfr   r   r   r   __init__>      
zCluster.__init__c                 C   s"   d | jdd | jD | j| jS )Nz/job_server:{} pods:{} job_stage_flag:{} hdfs:{}c                 S      g | ]}t |qS r   str).0podr   r   r   
<listcomp>G       z#Cluster.__str__.<locals>.<listcomp>)formatr   r   r   r   r   r   r   r   __str__D   s   zCluster.__str__c                 C   sR   t | jt |jkrdS t| j|jD ]\}}||kr dS q| j|jkr'dS dS NFT)lenr   zipr   )r   clusterabr   r   r   __eq__L   s   zCluster.__eq__c                 C   s   |  | S r   )r0   r   r-   r   r   r   __ne__Y      zCluster.__ne__c                 C   s   t  |j| _d S r   )copyr   r1   r   r   r   update_pods\   s   zCluster.update_podsc                 C   s   t |  S r   )r+   trainers_endpointsr(   r   r   r   trainers_nranks_   r3   zCluster.trainers_nranksc                 C   s
   t | jS r   )r+   r   r(   r   r   r   pods_nranksb      
zCluster.pods_nranksc                 C   s,   g }| j D ]}|jD ]}||j q
q|S r   )r   trainersappendendpoint)r   rr$   tr   r   r   r6   e   s   

zCluster.trainers_endpointsc                 C   s:   g }| j D ]}|jD ]}dd |jD }|| q
q|S )Nc                 S   r    r   r!   r#   accr   r   r   r%   p   r&   z,Cluster.world_device_ids.<locals>.<listcomp>)r   r:   acceleratorsr;   )r   r=   r$   r>   Zstr_acceleratorsr   r   r   world_device_idsl   s   

zCluster.world_device_idsc                 C   sR   g }| j D ]!}|j d|j }|jd ur|jd us!J | d|| q|S )N:z not a valid endpoint)r   addrportr;   )r   r=   r$   epr   r   r   pods_endpointst   s   
zCluster.pods_endpointsc                 C   s*   | j D ]}t|t|jkr|  S qd S r   )r   r"   id)r   Zpod_idr$   r   r   r   get_pod_by_id~   s
   
zCluster.get_pod_by_idN)r	   r
   r   r   r)   r0   r2   r5   r7   r8   r6   rB   rG   rI   r   r   r   r   r   =   s    
r   c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
	JobServerc                 C   s
   d | _ d S r   r<   r(   r   r   r   r      r9   zJobServer.__init__c                 C   s   | j  S r   rK   r(   r   r   r   r)      s   zJobServer.__str__c                 C   s   | j |jkS r   )Zendpintr<   r   jr   r   r   r0      r3   zJobServer.__eq__c                 C   
   | |k S r   r   rL   r   r   r   r2      r9   zJobServer.__ne__N)r	   r
   r   r   r)   r0   r2   r   r   r   r   rJ      s
    rJ   c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )Trainerc                 C   s   g | _ d | _d | _d | _d S r   )rA   r<   rankstager(   r   r   r   r      r   zTrainer.__init__c                 C   s   d | j| j| jS )Nz"accelerator:{} endpoint:{} rank:{})r'   rA   r<   rP   r(   r   r   r   r)      s   zTrainer.__str__c                 C   s^   t | jt |jkrdS | j|jks| j|jkrdS t| j|jD ]\}}||kr, dS q!dS r*   )r+   rA   r<   rP   r,   )r   r>   r.   r/   r   r   r   r0      s   zTrainer.__eq__c                 C   rN   r   r   )r   r>   r   r   r   r2      r9   zTrainer.__ne__c                 C      | j S r   rP   r(   r   r   r   rP         zTrainer.rankN)r	   r
   r   r   r)   r0   r2   rP   r   r   r   r   rO      s    rO   c                   @   D   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dS )Podc                 C   sF   d | _ d | _d | _d | _g | _g | _g | _g | _g | _g | _	d | _
d S r   )rP   rH   rD   rE   r:   serversworkerscoordinatorsheter_workersrA   device_moder(   r   r   r   r      s   
zPod.__init__c                 C   sb   d | j| j| j| j| jdd | jD dd | jD dd | jD dd | j	D dd | j
D 
S )Nzrank:{} id:{} addr:{} port:{} visible_accelerator:{} trainers:{} servers:{}             workers:{} heter_workers:{} coordinators:{}c                 S   r    r   r!   )r#   r>   r   r   r   r%      r&   zPod.__str__.<locals>.<listcomp>c                 S   r    r   r!   )r#   sr   r   r   r%      r&   c                 S   r    r   r!   )r#   wr   r   r   r%      r&   c                 S   r    r   r!   )r#   hr   r   r   r%      r&   c                 S   r    r   r!   )r#   cr   r   r   r%      r&   )r'   rP   rH   rD   rE   rA   r:   rW   rX   rZ   rY   r(   r   r   r   r)      s   zPod.__str__c                 C   s  | j |j ks| j|jks| j|jks| j|jkr%td|  d|  dS t| jt|jkr>td| j d|j  dS tt| jD ] }| j| |j| kretd| j|  d|j|    dS qEt| j	t|j	krtd| j	 d|j	  dS tt| j	D ] }| j	| |j	| krtd| j	|  d|j	|    dS qt| j
t|j
krtd| j
 d|j
  dS tt| j
D ] }| j
| |j
| krtd| j
|  d|j
|    dS qdS )	Nzpod z != Fz	trainers ztrainer zservers zworkers T)rP   rH   rD   rE   loggerdebugr+   r:   rangerW   rX   )r   r$   ir   r   r   r0      s>   """z
Pod.__eq__c                 C   rN   r   r   )r   r$   r   r   r   r2      r9   z
Pod.__ne__c                 C   s   d S r   r   )r   Zres_podsr   r   r   parse_response   s   zPod.parse_responsec                 C   rR   r   rS   r(   r   r   r   rP      rT   zPod.rankc                 C   sF   d}| j D ]	}|| d7 }q|dksJ d|  d|d d }|S )N ,z	this pod z can't see any acceleratorsr   )rA   )r   r=   gr   r   r   get_visible_accelerators  s   
zPod.get_visible_acceleratorsN)
r	   r
   r   r   r)   r0   r2   rd   rP   rh   r   r   r   r   rV      s    'rV      c                 C   s>   t |}||  t  }t d}|| || |S )Nz>%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s)logging	getLoggersetLevelStreamHandler	FormattersetFormatter
addHandler)Z	log_levelnamer`   Zlog_handlerZ
log_formatr   r   r   
get_logger  s   



rr   c                 C   sn  t |tu s
J dtd d}d}t| D ]\}}t }	||	_||	_||	_|| }
t|
t|ks5J dt	t|D ]h}t
 }|tjkrpt|| ttfr_|j||  |	j||  n0|j||  |	j||  n|tjkrt|| ttfr|j||  n|j||  d|
|  |_||_|d7 }|	j| q;|j|	 q| |}||j| fS )Ntrainer_endpoints must be listr   r   zMcurrent trainer_endpoints size should be greater equal than acclerators size.%sr   )typelistr   	enumeraterV   rP   rD   r[   r+   rb   rO   r   r   
isinstancetuplerA   extendr;   r   r<   r:   r   index)node_ipsnode_iptrainer_endpointsr[   devices_per_procr-   Ztrainer_rank	node_rankipr$   cur_node_endpointsrc   trainerpod_rankr   r   r   get_cluster  sB   



r   c                 C   s0  t jdkr4| D ]'}|j d u r.t t |jjtj |j	r$|j	
  td|jj  qtd | D ] }|j d u rV|j  |j	rL|j	
  td|jj  q6td tddD ]*}d}| D ]}|j d u r{t |jjtj d	}qg|std
  d S td qatd td d S )Nntzterminate process group gid:r   zterminate process id:r   r   2   FTzterminate all the procszcan't kill all process and exit)osrq   procpollkillpggetpgidpidsignalSIGTERMlog_fncloser`   infotimesleep	terminatera   rb   killSIGKILLfatalsysexit)procspstepaliver   r   r   terminate_local_procsD  s<   







r   c                  C   s*   zt  } t | }| |fW S    Y d S r   )socketgethostnamegethostbyname)Z	host_namehost_ipr   r   r   get_host_name_ipj  s   

r   c                 K   s6   |t krtn|}|jd|  f|||d d| dS )ad  Add argparse's argument.

    Examples:
        .. code-block:: python

            >>> import argparse
            >>> from paddle.distributed.fleet.launch_utils import add_arguments
            >>> parser = argparse.ArgumentParser()
            >>> add_arguments("name", str, "Jonh", "User name.", parser)
            >>> args = parser.parse_args()

    z--z Default: %(default)s.)defaultrv   helpN)boolr   add_argument)argnamerv   r   r   Z	argparserkwargsr   r   r   add_argumentss  s   
r   c                 C   sZ   dd }t  }d}	 | }||vr|| t|| kr|S |d7 }|dkr,td d S q
)Nc               
   S   sj   t ttjtj!} | tjtjtddd | 	d | 
 d W  d    S 1 s.w   Y  d S )Niir   r   )re   r   )r   r   AF_INETSOCK_STREAM
setsockopt
SOL_SOCKET	SO_LINGERstructpackbindgetsockname)r\   r   r   r   __free_port  s   

$z$find_free_ports.<locals>.__free_portr   Tr   i  z?can't find avilable port and use the specified static port now!)setaddr+   print)numr   Zport_setr   rE   r   r   r   find_free_ports  s    

r   c                 C   sX   t jdd u rt| }|d urt|}|S tt jd}t|| || |  d}|S )NFLAGS_START_PORTr   )r   environgetr   rw   intrb   )r   offsetports
start_portr   r   r   	get_ports  s   r   c                 C   sN  d}d}d}|   D ]\}}t|t|}q
dd|d| | }dd| d| d	 }|| | }	d
ddg|	  d }
d
ddg|	  d }d}||
d 7 }|ra|||d |d 7 }n||dd7 }||d 7 }|   D ]'\}}t|trt||krd|dd   }n|}|||d| t|7 }qs||
7 }d| d}|S )Nr   (   -   z    z|{{:>{}s}}{}{{:^{}s}}|
 z|{:>zs}{}{:^zs}|
z    +re   =+-
r   r   zfleetrun Distributed EnvsValuez... i)itemsmaxr+   r'   joinry   r"   )envsheaderspacingZmax_kZmax_vkvZh_formatZl_formatlengthborderlineZdrawsZstr_v_strr   r   r   pretty_print_envs  s4   
r   c                   @   s   e Zd Zdd ZdS )TrainerProcc                 C   s(   d | _ d | _d | _d | _d | _d | _d S r   )r   r   
log_offsetrP   
local_rankcmdr(   r   r   r   r     s   
zTrainerProc.__init__N)r	   r
   r   r   r   r   r   r   r     s    r   c                  G   sJ   t | dksJ dt |  dt | dkr#t| d tsJ | d atS )Nr   z
len(args) z should <= 1r   )r+   ry   r   _run_with_coverage)argsr   r   r   run_with_coverage  s
    r   c              
   C   s  |d u rt  tj  }nt  |}|dd  |dd  |  }dd |D }g }	t|jD ]\}
}d|j d|j d| 	  d
|  t|
d
dd |jD d
|d	}|d
d d urj|d
 |d
< |dd d urx|d |d< |dd d ur|d |d< t|jdkr|jtjkrdd
dd |jD  |d< t|jdkrdd
dd |jD  |d< tj rt|jdkrdd
dd |jD  |d< || g }t stjdddkrg d}tjdg| |g | }td| d|  |
dkr!tdt|jt|d td| d| d d }tjd kr+d ntj }|d urtj!|d!d" tj"#d#| rKt$| d$ t%d#| d%}|&d& |&d'
|   W d    n	1 smw   Y  |dd ur|d(' d)krt%d*||
f d+}n	t%d,||
f d+}t(j)|||||d-}nt(j)|||d.}t* }||_+|j|_|
|_,||_-|r|. nd |_/||_0|	1| q0|	S )/N
http_proxyhttps_proxyc                 S   s   g | ]}d  |qS rC   )r   )r#   Zeler   r   r   r%     s    z(start_local_trainers.<locals>.<listcomp>z%dru   rf   c                 S   r    r   r!   r?   r   r   r   r%     r&   )PADDLE_TRAINER_IDZPADDLE_CURRENT_ENDPOINTPADDLE_TRAINERS_NUMPADDLE_TRAINER_ENDPOINTSZPADDLE_RANK_IN_NODEZPADDLE_LOCAL_DEVICE_IDSZPADDLE_WORLD_DEVICE_IDSZPADDLE_CLUSTER_TOPO_PATHPADDLE_RANK_MAPPING_PATHZPADDLE_ENABLE_AUTO_MAPPINGr   c                 S   r    r   r!   r#   rg   r   r   r   r%   "  r&   FLAGS_selected_gpusc                 S   r    r   r!   r   r   r   r   r%   '  r&   ZFLAGS_selected_acceleratorsc                 S   r    r   r!   r   r   r   r   r%   ,  r&   FLAGS_selected_xpusZWITH_COVERAGEZOFFON)z-mZcoveragerunz--branchz-p-uzstart trainer procz  env:zYLocal start {} processes. First process distributed environment info (Only For Debug): {}zDistributed Envsr   z7details about PADDLE_TRAINER_ENDPOINTS can be found in z7/endpoints.log, and detail running logs maybe found in z/workerlog.0r   Texist_okz%s/endpoints.logz/endpoints.logr]   zPADDLE_TRAINER_ENDPOINTS: 
r   ZPADDLE_NEED_RANK_MAPPINGtruez%s/prelaunchlog.%dr.   %s/workerlog.%d)envstdoutstderr
preexec_fn)r   r   )2r4   r   r   poprB   rx   r:   rP   r<   r7   r   r6   r"   rA   r   r+   r[   r   r   r   coreis_compiled_with_xpuupdater   r   
executabler`   ra   r   r'   r   rq   setsidmakedirspathexistsremoveopenwritelower
subprocessPopenr   r   r   r   tellr   r   r;   )r-   r$   training_scripttraining_script_argslog_dirr   current_envZidsresr   idxr>   proc_envZcoverage_argsr   fnZpre_fnfr   tpr   r   r   start_local_trainers  s   









r  c              
   C   s   | j rIt| j jd5}|| jd |D ]}ztj| W q ty1   tjd| j j  Y qw |	 | _W d    d S 1 sBw   Y  d S d S )Nr=   r   zSUnicodeEncodeError occurs at this line. Please refer to the original log file "%s"
)
r   r  rq   seekr   r   r   r  UnicodeEncodeErrorr  )r  Zfinr   r   r   r   pull_worker_logq  s    "r  c              	   C   s   z@d}g }d}| D ]&}|j r|jdkrt| |j }|d u r#d}q	|dkr/d}||j q	|r>t|  t	d W |S W |S  t
yS   td t|  Y d S  tyg   td|| t|      td|| t|  Y d S )NFr   Tr   zKeyboardInterrupt, exitzdABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.)r   r   r  r   r   r;   rP   r   r   r   KeyboardInterruptr`   warning
SystemExiterrorr'   )r   Znranksr  Z
error_rankr   r   retr   r   r   watch_local_trainers  sP   

r  c                       | d u rt j }dd td|D }|S td}|d u s"|dkr.dd | dD }|S |d | dD ]}| v sIJ d| d	| d
q8 fdd| dD }td|  d| d   |S )Nc                 S   r    r   r!   r#   xr   r   r   r%     r&   zget_gpus.<locals>.<listcomp>r   CUDA_VISIBLE_DEVICESre   c                 S      g | ]}|  qS r   stripr  r   r   r   r%     r&   rf   zCan't find your gpus z in CUDA_VISIBLE_DEVICES[].c                       g | ]	}  | qS r   r|   r!  r  cuda_visible_devices_listr   r   r%         z1Change selected_gpus into reletive values. --ips: will change into relative_ips:z( according to your CUDA_VISIBLE_DEVICES:)	r   r   get_cuda_device_countrb   r   getenvsplitr`   r   )gpusgpus_numZres_gpuscuda_visible_devicesr  r   r%  r   get_gpus  8   




r/  c                    r  )Nc                 S   r    r   r!   r  r   r   r   r%     r&   zget_xpus.<locals>.<listcomp>r   XPU_VISIBLE_DEVICESre   c                 S   r  r   r   r  r   r   r   r%     r&   rf   zCan't find your xpus z in XPU_VISIBLE_DEVICES[r"  c                    r#  r   r$  r  Zxpu_visible_devices_listr   r   r%     r'  z1Change selected_xpus into reletive values. --ips:r(  z' according to your XPU_VISIBLE_DEVICES:)	r   r   get_xpu_device_countrb   r   r*  r+  r`   r   )xpusZxpus_numZres_xpusZxpu_visible_devicesr  r   r2  r   get_xpus  r0  r5  c                 C   s   | dkr*t j rt j dkrtd tjS t j r*t j dkr*td tj	S | dkr<t j dkr<td tjS | dkrNt j dkrNtd tj	S | d	krYtd
 tj
S td)Nheterr   z+launch train in heter mode with GPU device.z+launch train in heter mode with XPU device.ncclzlaunch train in GPU mode!bkclzlaunch train in XPU modegloozlaunch train in CPU modezDon't supported devices)r   r   is_compiled_with_cudar)  r   r   r   r   r3  r   r   RuntimeErrorbackendr   r   r   get_device_mode  s*   r>  c                    s  t | j}g }|tjkrQt| j | jd urKt t| j dks,J d	t | jtt t| j  fddt
dt D }||fS  }||fS |tjkrt| j| jd urtt| j dksvJ d	t| jttt| j fddt
dtD }||fS }||fS |tjkrt| dr| jd u rt | _| jd u rdg}||fS tt
d| j}||fS td| d	)
Nr   z4gpus' number:{} mod args.nproc_per_node:{} must == 0c                    s   g | ]
} ||  qS r   r   r#   rc   )r,  nr   r   r%         z(get_device_proc_info.<locals>.<listcomp>z4xpus' number:{} mod args.nproc_per_node:{} must == 0c                    s   g | ]
}||   qS r   r   r?  )r@  r4  r   r   r%   '  rA  Zpaddle_cpuonlyzCan't support device_mode:z, support only cpu|gpu|xpu now.)r>  r=  r   r   r/  r,  Znproc_per_noder+   r   r'   rb   r   r5  r4  r   hasattrmultiprocessing	cpu_countrw   AssertionError)r   r[   r   r   )r,  r@  r4  r   get_device_proc_info
  sV   




 



 



rF  c                 C   s*   t jd| jg| j }t|}|  d S )Nr   )r   r   r  r  r  r  wait)r   r   r   r   r   r   direct_start:  s   
rH  c                 C   sn   | dusJ g }|  dD ]"}| dd }| dd }t|| }|d|t|f qd|}|S )zM
    origin_endpoint: ip:port
    user_define_endpoint: ip:(port+offset)
    Nrf   rC   r   r   )r+  r   r;   r   r"   )Zorigin_endpointsr   Z!paddle_user_define_endpoints_listZip_portr   rE   Znew_portZpaddle_user_define_endpointsr   r   r   get_custom_endpointsE  s   
rI  c                 C   s   t |tu s
J d|tjksJ dtd d}t| D ]D\}}t }||_||_||_	|| }	|| }
t
|
dks<J tt
|
D ]}t }d|	|  |_|
| |_|j| qB|j| q| |}||j| fS )Nrs   ,Only support get mapped cluster for gpu now.rt   r   ru   )rv   rw   r   r   r   rx   rV   rP   rD   r[   r+   rb   rO   r<   r:   r;   r   r|   )r}   r~   r   r[   
node_ranksr-   r   r   r$   r   ranks_per_noderc   r   r   r   r   r   'get_mapped_cluster_without_rank_mapping}  s*   


rM  c                    s  |t jks	J dtj }d }t| jd}t|}W d    n1 s&w   Y  g }g }t	|d D ]\}}|
|d  |
|g q5t|dkrR|d }	n| jrY| j}	nt \}
}	|	|v smJ d|	 d| d	||	}t|t|ks~J d
td| d|	 d| d||   g }g }|D ]W | }tjdd urttdd}tt||t||  }n&tjdd urttjd}tt||t||  }ntt|| }|
 fdd|D  qt||	|||S )NrJ  r=   ZmachinesrD   r   r   Can't find your local ip {} in node_ips: {}+ranks length should be equal to ips length.parsed from args: node_ips:	 node_ip: node_rank: node_ranks:PADDLE_PORTre   r   c                       g | ]}d  |f qS z%s:%dr   r#   rE   r   r   r   r%         zEget_mapped_cluster_from_args_without_rank_mapping.<locals>.<listcomp>)r   r   r   r   r)  r  Zcluster_topo_pathjsonloadrx   r;   r+   hostr   r|   r`   ra   r   r   r   r   r*  rw   rb   r   rM  )r   r[   r-  Zcluster_topo	json_filer}   rK  r  Zcur_cluster_topor~   _r   
free_portsr   r   r   rZ  r   1get_mapped_cluster_from_args_without_rank_mapping  sj   






rb  c                 C   s  t |tu s
J d|tjksJ ddd }td d}t| D ]^\}}	t }
||
_|	|
_||
_	|| }|| }|| }t
t|D ]5}t }|d t||  }t|dks[J d|j||d	  d
||  |_|| |_|
j| qB|j|
 q | |}||j| fS )Nrs   rJ  c                 S   sN   t d}|d u s|dkr| S |d}|t| }td| || |S )Nr  re   rf   z<Change gpu id from {} to {} based on CUDA_VISIBLE_DEVICES {})r   r*  r+  r|   r"   r`   r   r'   )Zgpu_idr.  r&  Zrelative_idr   r   r   get_relative_gpu_id  s   

zAget_mapped_cluster_with_rank_mapping.<locals>.get_relative_gpu_idrt   ranksr   z.Only support one process to one device mappingr   ru   )rv   rw   r   r   r   rx   rV   rP   rD   r[   rb   r+   rO   r"   rA   r;   r<   r:   r   r|   )r}   r~   r   r[   rK  node_rank_mappingsrc  r-   r   r   r$   r   rL  Zcur_node_rank_mappingrc   r   Zlocal_device_idsr   r   r   r   $get_mapped_cluster_with_rank_mapping  s>   




rf  c                    sD  |t jks	J dtj }| jptd}d }t|d}t	
|}W d    n1 s-w   Y  dtjd< g }g }g }|D ]$}	||	d  dd t|	d  D }
|
  ||
 ||	 q?t|d	kro|d
 }n| jrv| j}nt \}}||v sJ d| d| d||}t|| |ksJ dt|t|ksJ dtd| d| d| d||   g }g }|D ]X | }tjdd urttdd}tt||t||  }n'tjdd urttjd}tt||t||  }ntt|| }| fdd|D  qt||||||S )NrJ  r   r=   re   rD   c                 S   r    r   r   r?  r   r   r   r%   (  s    zBget_mapped_cluster_from_args_with_rank_mapping.<locals>.<listcomp>rd  r   r   rN  rO  rP  zGnumber of ranks mapped to one node should not exceed the avaiable ones.rQ  rR  rS  rT  rU  rV  r   c                    rW  rX  r   rY  rZ  r   r   r%   Y  r[  )r   r   r   r   r)  rank_mapping_pathr   r*  r  r\  r]  r   r;   rw   keyssortr+   r^  r   r|   r`   ra   r   r   rb   r   rf  )r   r[   r-  rh  Zrank_mappingr_  r}   rK  re  Zcur_rank_mappingZcur_node_rank_listr~   r`  r   ra  r   r   r   rZ  r   .get_mapped_cluster_from_args_with_rank_mapping  s   








rk  c                   @   rU   )ParameterServerLauncherc                 C   s   || _ || _d| _d| _d| _d| _d| _d| _g | _g | _	d| _
g | _g | _d| _g | _g | _d| _g | _g | _d| _d| _g | _i | _g | _i | _d| _| | d S )NFr   re   T)r   distribute_modewith_coordinator
server_num
worker_numheter_worker_numcoordinator_numserver_endpointsserver_endpoints_ipsserver_endpoints_portworker_endpointsworker_endpoints_ipsworker_endpoints_portheter_worker_endpointsheter_worker_endpoints_ipsheter_worker_endpoints_portcoordinator_endpointscoordinator_endpoints_ipscoordinator_endpoints_portis_localcurrent_node_ipstage_trainer_numstage_heter_map
stage_liststage_device_map	stage_numget_role_endpoints)r   r   rm  r   r   r   r   f  s6   z ParameterServerLauncher.__init__c              
   C   s*	  |j r;|j | _ |jr)t|jd| j ks$J dt|jd| j |j| _n(t| j d}ddd |D | _n|jdksDJ d|j| _t| jd| _ |jr|j| _|j	rzt|j	d| jksuJ dt|j	d| j|j	| _
nqt| j| j }dd	d |D | _
n^|j	dksJ d
dd |j	dD }t|| _dd |j	dD }d|v rd}t|| j  || j  | j d}g }t| jD ]}|d|| t|| f qd|| _
n|j	| _
|jr/d| _|j| _|jrt|jd| jksJ dt|jd| j|j| _nt| jd}ddd |D | _td | jtjkrZ|jdks@J dd| jd< |jd}	tt|	D ]}|	| | j|d < qQ| j
| jd< |jr|jd| _dd | jD | _|jrGt|jdt| jksJ dt|jdt| j|jd}
d| _tt| jD ]}| jdkr|  jd7  _|
| d}t|| j| ksJ d| ddd |D }dd |D }d|v rtt|| j| j  | j }g }tt|D ]}|d|| t|| f qd|}nd|}|| j|d < | j|d gt|d  |  j| j| 7  _|  j|7  _qntt| jD ]P}| j| }t|| j | j | j }ddd |D }|| j|d < | j|d gt|d  |  j|7  _| jdkr|  jd7  _|  j|7  _qNn|jdksJ dg | _|jd}
d| _tt|
D ]}|
| d}| jt| d d |D }d!d |D }d|v rtt|| j| j  | j }g }tt|D ]}|d|| t|| f qd|}nd|}|| j|d < | j|d gt|d  |  j| jd" 7  _| jdkrC|  jd7  _|  j|7  _q| jg| j | _t| j| _ |j!rc|j!g}ntd| j | j | j }| jdd dd }|d t|d  | _!d#d | jdD | _"d$d | j
dD | _#| jrd%d | jdD | _$d&d | jdD | _%d'd | jdD | _&d(d | j
dD | _'g | _(| j"D ]}|| j(vr| j(| q| j#D ]}|| j(vr| j(| q| jtjkr.d)d | jdD | _)d*d | jdD | _*| j)D ]}|| j(vr,| j(| qtt+| j(dkrBd| _,| j(d | _-n3d+| _,t./d,d }|d u rWt0 \}| _-n|| _-| jtjksu| j-| j(v suJ d-| j- d.| j( d/| j-| j(v r| j(1| j-| _2t34d0| j(| j-| j2 d S d S )1Nrf   zThe server_num and servers doesn't match. Expect servers endpoints num epual to server_num, but received servers enpoint num: {} and server_num {}r   c                 S      g | ]}d t | qS z
127.0.0.1:r!   r  r   r   r   r%     r[  z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>re   z?The setting of Parameter-Server must has server_num or servers.zThe worker_num and workers doesn't match. Expect workers endpoints num epual to worker_num, but received workers enpoint num: {} and worker_num {}c                 S   r  r  r!   r  r   r   r   r%     r[  z?The setting of Parameter-Server must has worker_num or workers.c                 S      g | ]}|  d d qS rC   r   r!  r+  r  r   r   r   r%         c                 S      g | ]}t | d qS r   r+   r!  r+  r  r   r   r   r%     r  r   i  rC   TzThe coordinator_num and coordinators doesn't match. Expect coordinators endpoints num epual to coordinator_num, but received coordinator enpoint num: {} and coordinator_num {}c                 S   r  r  r!   r  r   r   r   r%     r[  z2>>> use default coordinator addr(only one process)zBThe setting of Parameter-Server heter mode must has heter_devices.cpu;r   c                 S   r    r   rg  )r#   Ztrainer_numr   r   r   r%     s    zThe stage_num and heter_workers doesn't match. Expect heter_workers endpoints stage num epual to heter_worker_num stage, but received heter_workers enpoint stage num: {} and heter_worker_num stage {}zThe heter trainer num in stage z= is not equal in args.heter_worker_num and args.heter_workersc                 S   r  r  r  r  r   r   r   r%         c                 S   r  r   r  r  r   r   r   r%     r  c                 S   r  r  r!   r  r   r   r   r%   <  r[  zVThe setting of Parameter-Server heter mode must has heter_worker_num or heter_workers.c                 S   r  r  r  r  r   r   r   r%   T  r  c                 S   r  r   r  r  r   r   r   r%   W  r  r   c                 S   r  r  r  r  r   r   r   r%     r  c                 S   r  r  r  r  r   r   r   r%     r  c                 S   r  r  r  r  r   r   r   r%     r  c                 S   r  rC   r   r  r  r   r   r   r%     r  c                 S   r  r  r  r  r   r   r   r%     r  c                 S   r  r  r  r  r   r   r   r%     r  c                 S   r  r  r  r  r   r   r   r%     r  c                 S   r  r  r  r  r   r   r   r%     r  FPOD_IPrN  z)} in args.servers and args.workers ips: {rP  z=parsed from args: node_ips:{} current_node_ip:{} node_rank:{})5ro  rW   r+   r+  r'   rs  r   r   rp  rX   rv  rb   r;   r"   rr  rn  rY   r|  r   rm  r   r   Zheter_devicesr  r  rq  Zstage_heter_trainer_numrZ   ry  r  r{   r  r  	http_portrt  rw  r}  r~  ru  rx  r}   rz  r{  r   r  r  r   r*  r   r|   r   r`   ra   )r   r   r   rw  Zworker_endpoints_lenr   rx  rv  rc   Zheter_devices_listZheter_worker_endpoints_listry  rz  Zheter_worker_endpoints_lenr{  Znew_heter_worker_endpointsrM   Zip_port_listZheter_trainer_numr  Zhttp_ipr   Zpod_ipr`  r   r   r   r    s>  












2
















z*ParameterServerLauncher.get_role_endpointsc                 C   s  | j | jvrd S td d}d}d}d}d}t| jD ]\}}t }||_||_tt| j	D ]$}	|| j	|	 krRt
 }
| d| j|	  |
_||
_|d7 }|j|
 q.tt| jD ]'}|| j| krt
 }| d| j|  |_||_d|_|d7 }|j| qZtt| jD ]'}|| j| krt
 }| d| j|  |_||_d|_|d7 }|j| qtt| jD ])}|| j| krt
 }d|| j| |_||_| j| |_|d7 }|j| q|j| q|j| j }t | _g g g g d| _ g g g g d| _!g g g g d| _"| #| j$| | %| j$| | j&r%| '| j$| | j(t)j*kr3| +| j$| t,-d| j$j.| j$j.| j$j.| j$j. t| j d dkrt| j d D ]"\}	}| j d |	 j/0  t| j"d dkrx| j"d |	 1  qWt,-d	 t| j d
 dkrt| j d
 D ]\}	}| j"d
 |	 1  | j d
 |	 j/2  qt,-d t| j d dkrt| j d D ]\}	}| j"d |	 1  | j d |	 j/2  qt,-d t| j d dkrt| j d D ]\}	}| j"d |	 1  | j d |	 j/2  qt,-d nBt| j d dkr.t| j d D ]\}	}| j d |	 j/0  qt| j d
 dkrOt| j d
 D ]\}	}| j d
 |	 j/0  q?t3j45| jr_t67| j d S d S )Nrt   r   rC   r   z{}:{})workercoordinatorserverheter_workerzPlease check servers, workers, coordinator and heter_worker logs in {}/workerlog.*, {}/serverlog.* , {}/coordinatorlog.*, and {}/heterlog.*r  zDall workers exit, going to finish parameter server and heter_worker.r  zall heter_worker are killedr  zall parameter server are killedr  zall coordinators are killed)8r  r}   r   rx   rV   rP   rD   rb   r+   rt  rO   ru  r<   rW   r;   rw  rx  rQ   rX   r}  r~  rY   rz  r'   r{  r  rZ   r   r   tempfilemkdtempgloo_rendezvous_dirr   cmdslog_fnsstart_pod_serverr   start_pod_workerrn  start_pod_coordinatorrm  r   r   start_pod_heter_workerr`   r   r	  r   rG  r   r   r   r   r   shutilrmtree)r   r-   Zserver_rankZworker_rankZheter_worker_rankZcoordinator_rankr   r   r$   rc   r  rM   r  mr  r   r  r   r   r   r   start_ps  s   





z ParameterServerLauncher.start_psc                 C   s  t j }t|}|dd  |dd  t|jD ]\}}| jtjkrP| j	| j
| j| j|jdd dt| j|jdd tt ddd	| j| jd
}n(| j	| j
| j|jdd dt| j|jdd tt ddd	| j| jd}|| tjd|jg|j }| jd | |dkrtdt|jt|d |j d urt j!|j dd t"d|j |f d}	| j#d |	 t$j%|||	|	d}
nt$j%||d}
t& }|
|_'|j(|_(||_)|	|_*|	r|	+ nd |_,||_-| j.d | qd S )Nr   r   rC   r   ZPSERVERr   PADDLE_WITH_GLOO03)PADDLE_PSERVERS_IP_PORT_LISTr   PADDLE_COORDINATOR_ENDPOINTS%PADDLE_ALL_HETER_TRAINER_IP_PORT_LISTrV  TRAINING_ROLEr   r  r  PADDLE_GLOO_RENDEZVOUSPADDLE_GLOO_FS_PATHPADDLE_GLOO_HTTP_ENDPOINT)r  r   r  rV  r  r   r  r  r  r  r  r   r  z`Local server start {} processes. First process distributed environment info (Only For Debug): {}r   Tr   z%s/serverlog.%dr]   r   r   r   r   )/r   r   r4   r   rx   rW   rm  r   r   rs  rv  r|  ry  r<   r+  r"   rp  r*  r  r  r   r   r   r  r  r  r;   r`   r   r'   r+   r   r	  r   r  r  r  r  r   r   rP   r   r   r  r   r   r   )r   r   r$   default_envr
  r  Z
cur_serverr  r   r  r   r  r   r   r   r  T  s   




z(ParameterServerLauncher.start_pod_serverc              	   C   s.  t j }t|}|dd  |dd  d}g }tj r)t|j}t	|}ntj
 r=tj }dd td|D }t|jD ]Q\}}|dkrMdnt|||  }	| jtjkri d| jd| jd	t| jd
| jdt| jdddt| jddd| jd d| jd| jd ddd|jdd d|jdd dt|jdtt dddd| j dd|	|	| j!d}
nOi d| jd| jd	t| jddd
| jd|jdd d|jdd dt|jdtt ddddd | j d!dd"dd#|	d$|	d%| j!}
|"|
 t#j$d&|j%g|j& }| j'd' (| |dkr>t)*d(+t	|jt,|
d) |j-d urht j.|j-d*d+ t/d,|j-|f d-}| j0d' (| t1j2||||d.}nt1j2||d/}t3 }||_4|j|_||_5||_6|r|7 nd |_8||_9| j:d' (| qBd S )0Nr   r   r   c                 S   r    r   r!   r  r   r   r   r%     r&   z<ParameterServerLauncher.start_pod_worker.<locals>.<listcomp>r  r  r   r   r  PADDLE_STAGE_TRAINERS_NUMSTAGE_ID1	STAGE_NUM*PADDLE_PREVIOUS_HETER_TRAINER_IP_PORT_LISTre   &PADDLE_NEXT_HETER_TRAINER_IP_PORT_LISTr   r  HETER_DEVICE_TYPEr   r  ZTRAINERr  rC   rV  r   r  r  r  )r  r   r   r  r1  r  r  r   r   r  r1  r  r   r  z`Local worker start {} processes. First process distributed environment info (Only For Debug): {}r   Tr   r   r]   r  r  );r   r   r4   r   r   r   r:  r/  r,  r+   r   r3  rb   rx   rX   r"   rm  r   r   rs  rv  rp  r|  r  r  r  ry  r  r<   r+  rP   r*  r  r  r   r   r   r  r  r  r;   r`   r   r'   r   r	  r   r  r  r  r  r   r   r   r   r  r   r   r   )r   r   r$   r  r
  heter_device_numdevice_listr  Z
cur_worker	device_idr  r   r  r   r  r   r   r   r    s  









	




	



z(ParameterServerLauncher.start_pod_workerc              	   C   s  t d tj }t|}|dd  |dd  t|jD ]\}}d}i d| jd| jdt	| j
d| jd	t	| jd
dd|jdd d|jdd dt	|jdt	tddddd| jddddd|d|d| j}|| tjd|jg|j }	| jd |	 |dkrtdt|jt|d |jd urtj |jdd  t!d!|j|f d"}
| j"d |
 t#j$|	||
|
d#}nt#j$|	|d$}t% }||_&|j|_||_'|
|_(|
r|
) nd |_*|	|_+| j,d | qd S )%Nz">>> entering start_pod_coordinatorr   r   r  r  r   r   r  ZPADDLE_COORDINATOR_NUMr  ZCOORDINATORr  rC   r   rV  r   r   r  r  r  r  r   r   r  r1  r  r   r  zeLocal coordinator start {} processes. First process distributed environment info (Only For Debug): {}r   Tr   z%s/coordinator.%dr]   r  r  )-r   r   r   r4   r   rx   rY   rs  rv  r"   rp  r|  rr  r<   r+  rP   r*  r  r  r   r   r   r  r  r  r;   r`   r   r'   r+   r   r	  r   r  r  r  r  r   r   r   r   r  r   r   r   )r   r   r$   r  r
  r  Zcur_coordinatorr  r  r   r  r   r  r   r   r   r    s   




	



z-ParameterServerLauncher.start_pod_coordinatorc              	   C   s  t j }t|}|dd  |dd  d}g }tj r)t|j}t	|}ntj
 r=tj }dd td|D }t|jD ]	\}}|dkrMdnt|||  }	|j}
i d| jd| jd	|
| jd
 krp| j|
d
  ndd| j|
d
  d| jd| j|
 dt|
dt| jd|jdd
 dddt| jdt| jd|jdd dtt ddddd| jddd|	|	| jd}|| tj d|j!g|j" }| j#d $| |dkrt%&d 't	|jt(|d! |j)d ur t j*|j)d"d# t+d$|j)|f d%}| j,d $| t-j.||||d&}nt-j.||d'}t/ }||_0|j1|_1||_2||_3|r>|4 nd |_5||_6| j7d $| qBd S )(Nr   r   r   c                 S   r    r   r!   r  r   r   r   r%   \  r&   zBParameterServerLauncher.start_pod_heter_worker.<locals>.<listcomp>r  r  r   r  r   re   r  r  r  r  r  rV  rC   r  ZHETER_TRAINERr   r  r  r  r  r  r  r   )r   r  r1  r  r   r  zfLocal heter_worker start {} processes. First process distributed environment info (Only For Debug): {}r   Tr   z%s/heterlog.%dr]   r  r  )8r   r   r4   r   r   r   r:  r/  r,  r+   r   r3  rb   rx   rZ   r"   rQ   rs  rv  r  r  ry  r  r<   r+  rp  r  r*  r  r  r   r   r   r  r  r  r;   r`   r   r'   r   r	  r   r  r  r  r  r   r   rP   r   r   r  r   r   r   )r   r   r$   r  r
  r  r  r  Zcur_heter_workerr  Zstage_idr  r   r  r   r  r   r   r   r  O  s   












z.ParameterServerLauncher.start_pod_heter_workerN)
r	   r
   r   r   r  r  r  r  r  r  r   r   r   r   rl  e  s    $  F KlDrl  c                 C   sP   | dvr
t d|  | dkrtj st d| dkr$tj s&t dd S d S )N)r7  r9  r8  autor6  Zxcclzpaddle.distributed initialize error, backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', 'heter', 'xccl' but got %sr7  zlpaddle.distributed initialize error, your paddle is not compiled with cuda but you assign 'nccl' as backend.r8  zkpaddle.distributed initialize error, your paddle is not compiled with xpu but you assign 'bkcl' as backend.)
ValueErrorr   r   r:  r   r<  r   r   r   check_backend  s   r  c                 C   s2   | dkrd S t jdrtdt jrtdd S )Nr9  darwinzDYou are going to using gloo on macos, but currently is not supportedzFYou are going to using gloo on windows, but currently is not supported)utilsZOS_NAME
startswithr  Z
IS_WINDOWSr<  r   r   r   block_windows_and_macos  s   r  c                   C   s    t j rdS t j rdS dS )Nr7  r8  r9  )r   r   r:  r   r   r   r   r   get_backend_by_compile_flag  s
   

r  )ri   r   r   )NN)r   )<r4   r\  rj   rC  r   r  r   r   r   r  r   r  r   
contextlibr   Zdistutils.utilr   Z*paddle.utils.cpp_extension.extension_utilsr  Zcpp_extensionZextension_utilsZpaddler   rk   r`   	propagater   r   r   rJ   rO   rV   rr   r   r   r   r   r   r   r   r   r   r   r  r  r  r/  r5  r>  rF  rH  rI  rM  rb  rf  rk  rl  r  r  r  r   r   r   r   <module>   sx   

I 
X*&	
)


~-0
8?:R      L