o
    "jA                     @   s0  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZ ddlmZ eddZd	d
 Zdd ZG dd dZG dd dZG dd dZG dd dZG dd dZdd Zdd Zdd Zdd Zdd  Zd-d!d"ZG d#d$ d$Z	d-d%d&Zd'd( Zd)d* Z d+d, Z!dS ).    N)closing)	strtobool)get_backend_by_compile_flag   )
get_loggerINFOrootc                    s   dd | j dD }| j}||}td| d| d|  d }| js@t|dkr@| jd u r@t	t|}|d ur?t
|}nd}| jd urJ| j}t
t||t| }g }|D ] | fd	d|D  qYt||||S )
Nc                 S      g | ]}|  qS  strip.0xr
   r
   f/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/utils/launch_utils.py
<listcomp>!       z)get_cluster_from_args.<locals>.<listcomp>,zparsed from args:node_ips:z	 node_ip:z node_rank:   i  c                    s   g | ]}d  |f qS )z%s:%dr
   )r   portipr
   r   r   =   s    )Zcluster_node_ipssplitnode_ipindexloggerdebugZuse_paddlecloudlenstarted_portfind_free_portslistrangeappendget_cluster)argsselected_gpusnode_ipsr   	node_rankZ
free_portsr   trainer_endpointsr
   r   r   get_cluster_from_args    s2   


r)   c                    s   | d u rddl m} | }dd td|D }|S td}|d u s'|dkr3dd | dD }|S |d | dD ]}| v sKJ d	||q= fd
d| dD }t	d|  d| d   |S )Nr   corec                 S      g | ]}t |qS r
   strr   r
   r
   r   r   F   r   zget_gpus.<locals>.<listcomp>ZCUDA_VISIBLE_DEVICES c                 S   r	   r
   r   r   r
   r
   r   r   J   r   r   z=Can't find your selected_gpus {} in CUDA_VISIBLE_DEVICES[{}].c                    s   g | ]	}  | qS r
   )r   r   r   Zcuda_visible_devices_listr
   r   r   W   s    z1Change selected_gpus into reletive values. --ips:z will change into relative_ips:z( according to your CUDA_VISIBLE_DEVICES:)
paddle.frameworkr+   Zget_cuda_device_countr!   osgetenvr   formatr   info)r%   r+   Zgpus_numgpusZcuda_visible_devicesr   r
   r0   r   get_gpusA   s8   



r7   c                   @   4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )Hdfsc                 C   s   d | _ d | _d | _d S Nhdfs_ugi	hdfs_name	hdfs_pathselfr
   r
   r   __init__e      
zHdfs.__init__c                 C   s   | j d uo| jd uo| jd uS r:   r;   r?   r
   r
   r   is_validj   s
   
zHdfs.is_validc                 C   s   d | j| j| jS )Nz$hdfs_ugi:{} hdfs_name:{} hdfs_path{})r4   r<   r=   r>   r?   r
   r
   r   __str__q   s   zHdfs.__str__c                 C   s$   | j |j ko| j|jko| j|jkS r:   r;   r@   nr
   r
   r   __eq__v   s
   

zHdfs.__eq__c                 C   
   | |k S r:   r
   rE   r
   r
   r   __ne__}      
zHdfs.__ne__N)__name__
__module____qualname__rA   rC   rD   rG   rI   r
   r
   r
   r   r9   d   s    r9   c                   @   s\   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd ZdS )Clusterc                 C   s   d | _ g | _d | _d | _d S r:   )
job_serverpodshdfsjob_stage_flag)r@   rQ   r
   r
   r   rA      s   
zCluster.__init__c                 C   s"   d | jdd | jD | j| jS )Nz/job_server:{} pods:{} job_stage_flag:{} hdfs:{}c                 S   r,   r
   r-   )r   podr
   r
   r   r      r   z#Cluster.__str__.<locals>.<listcomp>)r4   rO   rP   rR   rQ   r?   r
   r
   r   rD      s   zCluster.__str__c                 C   sR   t | jt |jkrdS t| j|jD ]\}}||kr dS q| j|jkr'dS dS NFT)r   rP   ziprR   )r@   clusterabr
   r
   r   rG      s   zCluster.__eq__c                 C   s   |  | S r:   )rG   r@   rV   r
   r
   r   rI         zCluster.__ne__c                 C   s   t  |j| _d S r:   )copyrP   rY   r
   r
   r   update_pods   s   zCluster.update_podsc                 C   s   t |  S r:   )r   trainers_endpointsr?   r
   r
   r   trainers_nranks   rZ   zCluster.trainers_nranksc                 C   s
   t | jS r:   )r   rP   r?   r
   r
   r   pods_nranks   rJ   zCluster.pods_nranksc                 C   s,   g }| j D ]}|jD ]}||j q
q|S r:   )rP   trainersr"   endpoint)r@   rrS   tr
   r
   r   r]      s   

zCluster.trainers_endpointsc                 C   sR   g }| j D ]!}|j d|j }|jd ur|jd us!J | d|| q|S )N:z not a valid endpoint)rP   addrr   r"   )r@   rb   rS   epr
   r
   r   pods_endpoints   s   
zCluster.pods_endpointsc                 C   s*   | j D ]}t|t|jkr|  S qd S r:   )rP   r.   id)r@   Zpod_idrS   r
   r
   r   get_pod_by_id   s
   
zCluster.get_pod_by_idN)rK   rL   rM   rA   rD   rG   rI   r\   r^   r_   r]   rg   ri   r
   r
   r
   r   rN      s    rN   c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
	JobServerc                 C   s
   d | _ d S r:   ra   r?   r
   r
   r   rA      rJ   zJobServer.__init__c                 C   s   | j  S r:   rk   r?   r
   r
   r   rD      s   zJobServer.__str__c                 C   s   | j |jkS r:   )Zendpintra   r@   jr
   r
   r   rG      rZ   zJobServer.__eq__c                 C   rH   r:   r
   rl   r
   r
   r   rI      rJ   zJobServer.__ne__N)rK   rL   rM   rA   rD   rG   rI   r
   r
   r
   r   rj      s
    rj   c                   @   r8   )Trainerc                 C   s   g | _ d | _d | _d S r:   r6   ra   rankr?   r
   r
   r   rA      rB   zTrainer.__init__c                 C   s   d| j  d| j d| j S )Nzgpu:z
 endpoint:z rank:ro   r?   r
   r
   r   rD      s   zTrainer.__str__c                 C   s^   t | jt |jkrdS | j|jks| j|jkrdS t| j|jD ]\}}||kr, dS q!dS rT   )r   r6   ra   rp   rU   )r@   rc   rW   rX   r
   r
   r   rG      s   zTrainer.__eq__c                 C   rH   r:   r
   )r@   rc   r
   r
   r   rI      rJ   zTrainer.__ne__c                 C   s   | j S r:   )rp   r?   r
   r
   r   get_rank   s   zTrainer.get_rankN)rK   rL   rM   rA   rD   rG   rI   rq   r
   r
   r
   r   rn      s    rn   c                   @   s<   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd ZdS )Podc                 C   s(   d | _ d | _d | _d | _g | _g | _d S r:   )rp   rh   re   r   r`   r6   r?   r
   r
   r   rA         
zPod.__init__c              	   C   s*   d | j| j| j| j| jdd | jD S )Nz8rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}c                 S   r,   r
   r-   )r   rc   r
   r
   r   r      r   zPod.__str__.<locals>.<listcomp>)r4   rp   rh   re   r   r6   r`   r?   r
   r
   r   rD      s   zPod.__str__c                 C   s   | j |j ks| j|jks| j|jks| j|jkr%td|  d|  dS t| jt|jkr>td| j d|j  dS tt| jD ] }| j| |j| kretd| j|  d|j|    dS qEdS )Nzpod z != Fz	trainers ztrainer T)	rp   rh   re   r   r   r   r   r`   r!   )r@   rS   ir
   r
   r   rG     s   "z
Pod.__eq__c                 C   rH   r:   r
   )r@   rS   r
   r
   r   rI     rJ   z
Pod.__ne__c                 C   s   d S r:   r
   )r@   Zres_podsr
   r
   r   parse_response  s   zPod.parse_responsec                 C   sF   d}| j D ]	}|| d7 }q|dksJ d|  d|d d }|S )Nr/   r   z	this pod z can't see any gpus)r6   )r@   rb   gr
   r
   r   get_visible_gpus  s   
zPod.get_visible_gpusN)	rK   rL   rM   rA   rD   rG   rI   ru   rx   r
   r
   r
   r   rr      s    rr   c                 C   s   t |tu s
J dtd d}d}t| D ]K\}}t }||_||_|| }	t|	t|ks2J dtt|D ]!}
t	 }|j
||
  d|	|
  |_||_|d7 }|j| q8|j| q| |}||j| fS )Nztrainer_endpoints must be list)rQ   r   zOcurrent trainer_endpoints size should be greater equal than selected_gpus size.%sr   )typer    rN   	enumeraterr   rp   re   r   r!   rn   r6   r"   ra   r`   rP   r   )r&   r   r(   r%   rV   Ztrainer_rankr'   r   rS   Zcur_node_endpointsrt   trainerZpod_rankr
   r
   r   r#   )  s.   

r#   c                 C   s   | D ] }|j  d u r"|j   |jr|j  td|j j  qt	d t
ddD ]*}d}| D ]}|j  d u rGt|j jtj d}q3|sRtd  d S t	d q-td td	 d S )
Nzterminate process id:   r   2   FTzterminate all the procszcan't kill all process and exitr   )procpoll	terminatelog_fncloser   r   pidtimesleepr!   r2   killsignalSIGKILLr5   fatalsysexit)procspstepaliver
   r
   r   terminate_local_procsD  s*   




r   c                  C   s*   zt  } t | }| |fW S    Y d S r:   )socketgethostnamegethostbyname)Z	host_namehost_ipr
   r
   r   get_host_name_ip_  s   

r   c                 K   s6   |t krtn|}|jd|  f|||d d| dS )ab  Add argparse's argument.
    Examples:
        .. code-block:: python

            >>> import argparse
            >>> from paddle.distributed.utils import launch_utils
            >>> parser = argparse.ArgumentParser()
            >>> launch_utils.add_arguments("name", str, "Jonh", "User name.", parser)
            >>> args = parser.parse_args()

    z--z Default: %(default)s.)defaultrz   helpN)boolr   add_argument)argnamerz   r   r   Z	argparserkwargsr
   r
   r   add_argumentsh  s   
r   c                 C   sZ   dd }t  }d}	 | }||vr|| t|| kr|S |d7 }|dkr,td d S q
)Nc                  S   sN   t ttjtj} | d |  d W  d    S 1 s w   Y  d S )N)r/   r   r   )r   r   AF_INETSOCK_STREAMbindgetsockname)sr
   r
   r   __free_port  s   

$z$find_free_ports.<locals>.__free_portr   Tr   d   z?can't find avilable port and use the specified static port now!)setaddr   print)numr   Zport_setr   r   r
   r
   r   r   ~  s    
r   c                 C   sh  |d u rt  }|dkr/dddd |jD  d|j d|j d|   d|  d}|S |dkrWddd	d |jD  d|j d|j d|   d|  d
}|S |dkrtd|j d|j d|   d|  |d}|S |dkrddlm} |	 d }d| ddddd |jD  dd|j dd|j dd|   dd|  i}|S t
d)NZbkclry   r   c                 S   r,   r
   r-   r   rw   r
   r
   r   r     r   z(_prepare_trainer_env.<locals>.<listcomp>z%d)ZFLAGS_selected_xpusPADDLE_TRAINER_IDPADDLE_CURRENT_ENDPOINTPADDLE_TRAINERS_NUMPADDLE_TRAINER_ENDPOINTSZncclc                 S   r,   r
   r-   r   r
   r
   r   r     r   )ZFLAGS_selected_gpusr   r   r   r   Zgloo)r   r   r   r   ZPADDLE_DISTRI_BACKENDZxcclr   r*   ZFLAGS_selected_r   c                 S   r,   r
   r-   r   r
   r
   r   r     r   r   r   r   r   z)backend must be one of 'gloo, nccl, bkcl')r   joinr6   rp   ra   r^   r]   r1   r+   Zget_all_custom_device_type
ValueError)rV   r|   backendproc_envr+   Zcustom_device_namer
   r
   r   _prepare_trainer_env  sV   
)
 


r   c                   @   s   e Zd Zdd ZdS )TrainerProcc                 C   s(   d | _ d | _d | _d | _d | _d | _d S r:   )r   r   
log_offsetrp   
local_rankcmdr?   r
   r
   r   rA     rs   zTrainerProc.__init__N)rK   rL   rM   rA   r
   r
   r
   r   r     s    r   c                 C   s  t  tj  }|dd  |dd  g }t|jD ]q\}}t| |}	||	 t	d|  t
jd|g| }
td|
 d|	  d }|d urdtj|dd td	||f d
}tj|
|||d}ntj|
|d}t }||_|j|_||_||_|r| nd |_|
|_|| q|S )NZ
http_proxyZhttps_proxyztrainer proc env:z-uzstart trainer proc:z env:T)exist_okz%s/workerlog.%drW   )envstdoutstderr)r   )r[   r2   environpopr{   r`   r   updater   r   r   
executabler5   makedirsopen
subprocessPopenr   r   rp   r   r   tellr   r   r"   )rV   rS   Ztraining_scriptZtraining_script_argsZlog_dirZcurrent_envr   idxrc   r   r   fnr   tpr
   r
   r   start_local_trainers  s2   

r   c              
   C   s   | j rIt| j jd5}|| jd |D ]}ztj| W q ty1   tjd| j j  Y qw |	 | _W d    d S 1 sBw   Y  d S d S )Nrb   r   zSUnicodeEncodeError occurs at this line. Please refer to the original log file "%s"
)
r   r   nameseekr   r   r   writeUnicodeEncodeErrorr   )r   Zfinliner
   r
   r   pull_worker_log  s    "r   c              	   C   s   z@d}g }d}| D ]&}|j r|jdkrt| |j }|d u r#d}q	|dkr/d}||j q	|r>t|  t	d W |S W |S  t
yQ   td t|    tye   td|| t|      td|| t|   )NFr   Tr   zKeyboardInterrupt, exitzdABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.)r   r   r   r   r   r"   rp   r   r   r   KeyboardInterruptr   warning
SystemExiterrorr4   )r   Znranksr   Z
error_rankr   r   retr
   r
   r   watch_local_trainers  sP   

r   c                 C   s@   t d tt|  D ]\}}t | d|  qt d d S )Nz0-----------  Configuration Arguments -----------z: z0------------------------------------------------)r   sortedvarsitems)r$   argvaluer
   r
   r   _print_arguments:  s   r   r:   )"r[   r2   r   r   r   r   r   
contextlibr   Zdistutils.utilr   Z%paddle.distributed.fleet.launch_utilsr   Zutils.log_utilsr   r   r)   r7   r9   rN   rj   rn   rr   r#   r   r   r   r   r   r   r   r   r   r   r
   r
   r
   r   <module>   s>   
!#B;	
0
+-