o
    "j                     @   sH   d dl Z d dlZd dlZd dlmZ ddlmZmZ G dd deZdS )    N)	Container   )CollectiveControllerControleModec                   @   sL   e Zd Zedd Zdd Zdd Zdd Zd	i d
d	d	fddZdd Z	d	S )IPUControllerc                 C   s2   |j jdkr|j| j d tj|j _dS dS )NZipuz enabledTF)argstraining_scriptloggerdebug__name__r   ZIPUZrun_mode)clsctx r   u/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddle/distributed/launch/controllers/ipu_controller.pyenable   s
   
zIPUController.enablec                 C   s   t  }|jdtdd |jdtdd |jdtdd |jdtd	d |jd
tdd |jdtdd |jdt jd ||S )Nz--hostsz&The hosts for IPU distributd training.)typehelpz--nproc_per_hostz*The number of processes launched per host.z--ipus_per_replicaz)The number of IPUs requested per replica.z--ipu_partitionz"The partition name of IPU devices.z--vipu_serverz!The ip of the IPU device manager.r   zoThe full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``.training_script_args)nargs)argparseArgumentParseradd_argumentstrint	REMAINDER
parse_args)selfZ	args_listparserr   r   r   parse_ipu_args"   s:   
zIPUController.parse_ipu_argsc              	   C   sf  d| j j_| | j jj}t| j jj}||j dks$J d||j||j }| j j	
d| d t|jd}||j }| j j	
d| d || dksZJ d|||jd	d
d}dd |D }g }|d|  |d|  |d|j  |dd| |d|j  |d|j  |g d d}	tdd }
|
r|	d|
 d	7 }	|	d|d|7 }	|	d7 }	||	 t|D ]}|||j  }||j }|d|||| q|tj ||j ||j td td tt|d D ]}td||   qtd|t|d    td || j j_d S ) NZpoprunr   zEThe number of IPUs:{} mod the number of IPUs per replica:{} must == 0z The number of total replicas is .,z!The number of total processes is zBThe number of replicas:{} mod the number of processes:{} must == 0  c                 S   s   g | ]}|d  qS )z:8090r   ).0xr   r   r   
<listcomp>\   s    z9IPUController.replace_training_script.<locals>.<listcomp>z--num-instances=z--num-replicas=z--ipus-per-replica=z	--host={}z--vipu-partition=z--vipu-server-host=)z--update-partition=noz--vipu-server-timeout=120z--print-topology=yesz--numa-aware=yesz--mpi-local-args='ZPOPART_LOG_LEVELz-x POPART_LOG_LEVEL=z8-x PADDLE_TRAINERS_NUM={} -x PADDLE_TRAINER_ENDPOINTS={}'zn--instance-mpi-local-args={}:"-x PADDLE_TRAINER_ID={} -x PADDLE_CURRENT_ENDPOINT={} -x PADDLE_RANK_IN_NODE={}"z'-----------  PopRun Command -----------zpoprun \r   z%s \z%sz'---------------------------------------)r   r   r   r   r   r   ZdevicesZipus_per_replicaformatr	   infolenhostssplitZnproc_per_hostreplaceappendjoinZipu_partitionZvipu_serverextendosgetenvrangesys
executableprint)r   Zpoprun_argsZnum_ipusZnum_replicasZ	num_nodesZ	num_procsr*   Z	endpointsZpoprun_commandZglobal_envsZ	log_levelidxZcur_endpointZrank_in_nodeir   r   r   replace_training_scriptA   s   







z%IPUController.replace_training_scriptc                 C   s,   | j jjg}|| j jj d|g}|S )Nr!   )r   r   r   r/   r   r.   )r   
entrypointr   r   r   _get_entrypoint   s   zIPUController._get_entrypointNTc                 C   sJ   t |p|  |r| j ni d}| ||\|_|_|| d|_|S )N)r9   envT)	r   r:   r   Zget_envsZ_get_out_err_fileoutfileZerrfileZ
update_envshell)r   r9   ZenvsZuse_ctx_envouterrcr   r   r   new_container   s   

zIPUController.new_containerc                 C   s,   |    |   |   |   |   d S )N)r8   Z	build_jobZ	build_podZ
deploy_podwatch)r   r   r   r   run   s
   zIPUController.run)
r   
__module____qualname__classmethodr   r   r8   r:   rA   rC   r   r   r   r   r      s    
[

r   )	r   r0   r3   Z'paddle.distributed.launch.job.containerr   Z
collectiver   r   r   r   r   r   r   <module>   s   