# paddle/distributed/launch/main.py

from .context import Context

ctx = None


def launch():
    """
    Paddle distributed training entry ``python -m paddle.distributed.launch``.

    Usage:
        .. code-block:: bash
            :name: code-block-bash1

            python -m paddle.distributed.launch [-h] [--master MASTER] [--rank RANK]
                   [--log_level LOG_LEVEL] [--nnodes NNODES]
                   [--nproc_per_node NPROC_PER_NODE] [--log_dir LOG_DIR]
                   [--run_mode RUN_MODE] [--job_id JOB_ID] [--devices DEVICES]
                   [--host HOST] [--servers SERVERS] [--trainers TRAINERS]
                   [--trainer_num TRAINER_NUM] [--server_num SERVER_NUM]
                   [--gloo_port GLOO_PORT] [--with_gloo WITH_GLOO]
                   [--max_restart MAX_RESTART] [--elastic_level ELASTIC_LEVEL]
                   [--elastic_timeout ELASTIC_TIMEOUT]
                   training_script ...


    Base Parameters:
        - ``--master``: The master/rendezvous server; supports ``http://`` and ``etcd://``, defaulting to ``http://``. e.g., ``--master=127.0.0.1:8080``. Default ``--master=None``.

        - ``--rank``: The rank of the node; can be auto-assigned by the master. Default ``--rank=-1``.

        - ``--log_level``: The log level passed to ``logging.setLevel``, one of CRITICAL/ERROR/WARNING/INFO/DEBUG/NOTSET (case insensitive). Default ``--log_level=INFO``.

        - ``--nnodes``: The number of nodes for a distributed job; it can be a range in elastic mode, e.g., ``--nnodes=2:3``. Default ``--nnodes=1``.

        - ``--nproc_per_node``: The number of processes to launch on a node. In GPU training, it should be less than or equal to the number of GPUs on your system. e.g., ``--nproc_per_node=8``

        - ``--log_dir``: The path for each process's log. e.g., ``--log_dir=output_dir``. Default ``--log_dir=log``.

        - ``--run_mode``: The run mode of the job; one of collective/ps/ps-heter/rpc. e.g., ``--run_mode=ps``. Default ``--run_mode=collective``.

        - ``--job_id``: The unique job id; it affects the log files' names. e.g., ``--job_id=job1``. Default ``--job_id=default``.

        - ``--devices``: The selected accelerator devices on the nodes, such as gpu/xpu. e.g., ``--devices=0,1,2,3`` will launch four training processes, each bound to one device.

        - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py``

        - ``training_script_args``: The arguments of ``training_script``, e.g., ``--lr=0.1``. A sketch combining several of these parameters is shown below.
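
        A minimal invocation sketch combining several of the base parameters above; ``train.py``, its ``--lr`` flag and the paths are placeholders, not required names:

        .. code-block:: bash
            :name: code-block-example-base-params

            python -m paddle.distributed.launch --devices=0,1,2,3 --job_id=job1 \
                   --log_dir=output/logs --log_level=DEBUG train.py --lr=0.01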

    Collective Parameters:
        - ``--ips``: [DEPRECATED] Paddle cluster node IPs, e.g., ``--ips=192.168.0.16,192.168.0.17``. Default ``--ips=127.0.0.1``.

    Parameter-Server Parameters:
        - ``--servers``: User defined servers ip:port, e.g., ``--servers="192.168.0.16:6170,192.168.0.17:6170"``

        - ``--trainers``: User defined trainers ip:port, e.g., ``--trainers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172"``

        - ``--workers``: [DEPRECATED] The same as trainers.

        - ``--trainer_num``: Number of trainers on each node, can be 0.

        - ``--worker_num``: [DEPRECATED] The same as trainer_num.

        - ``--server_num``: Number of servers on each node, can be 0.

        - ``--heter_workers``: User defined heter workers ip1:port1;ip2:port2, e.g., ``--heter_workers="192.168.0.16:6172;192.168.0.17:6172"``

        - ``--heter_worker_num``: Number of heter_workers in each stage (recommended to set when emulating a distributed environment on a single node)

        - ``--heter_devices``: Type of heter_device in each stage

        - ``--gloo_port``: Gloo HTTP port. Default ``--gloo_port=6767``.

        - ``--with_gloo``: Whether to use gloo. Default ``--with_gloo=0``.

    Elastic Parameters:
        - ``--max_restart``: The maximum number of restarts for an elastic job. Default ``--max_restart=3``.

        - ``--elastic_level``: The elastic level: -1: disabled; 0: failed node exits while peers hold; 1: internal restart. Default ``--elastic_level=-1``.

        - ``--elastic_timeout``: Seconds to wait before an elastic job begins to train. Default ``--elastic_timeout=30``. A combined example is shown below.
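
        A sketch combining the elastic parameters with an etcd master; the etcd address and ``train.py`` are placeholders:

        .. code-block:: bash
            :name: code-block-example-elastic-params

            python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 \
                   --max_restart 3 --elastic_level 1 --elastic_timeout 60 train.py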

    IPU Parameters:
        IPU distributed launch only requires and allows three arguments ``--devices``, ``training_script`` and ``training_script_args``.
        The ``--devices`` is the number of IPU devices. e.g., ``--devices=4`` will launch the training program with four IPU devices.
        The ``training_script`` must be set to ``ipu``.
        The ``training_script_args`` includes the arguments required by IPU distributed launch, illustrated below.
        ``Examples 10`` provides an example of paddle.distributed.launch with IPUs.

        - ``--hosts``: The hosts for IPU distributed training. Each host is able to include multiple processes.

        - ``--nproc_per_host``: The number of processes launched per host. Each process is able to include multiple replicas.

        - ``--ipus_per_replica``: The number of IPUs requested per replica. Each replica is able to include multiple IPUs.

        - ``--ipu_partition``: The partition name of IPU devices.

        - ``--vipu_server``: The IP of the IPU device manager.

        - ``training_script``: The full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``.

        - ``training_script_args``: The args of the IPU distributed training program/script. e.g., ``--lr=0.1``.

    Returns:
        - ``None``

    Examples 0 (master, ip/port auto detection):
        .. code-block:: bash
            :name: code-block-example-bash0

            # For training on multiple nodes, run the following command on one of the nodes

            python -m paddle.distributed.launch --nnodes 2 train.py

            # Then the following info will be printed

            # Copy the following command to other nodes to run.
            # --------------------------------------------------------------------------------
            # python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
            # --------------------------------------------------------------------------------

            # Following the instruction above and pasting the command on the other nodes launches a multi-node training job.

            # There are two ways to launch a job with the same command for multi-node training
            # 1) use the following command on every node; make sure the ip belongs to one of the training nodes and the port is available on that node
            # python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
            # 2) use the following command on every node with an independent etcd service
            # python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2 train.py

            # This functionality works well for both collective and ps mode, even with other arguments.


    Examples 1 (collective, single node):
        .. code-block:: bash
            :name: code-block-example-bash1

            # For training on a single node using 4 GPUs.

            python -m paddle.distributed.launch --devices=0,1,2,3 train.py --lr=0.01
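
        A minimal ``train.py`` sketch matching the command above; the model, data and optimizer are placeholders, and it assumes the standard dynamic-graph data-parallel API:

        .. code-block:: python
            :name: code-block-example-collective-train-sketch

            import paddle
            import paddle.distributed as dist

            def main():
                # pick up the ranks/endpoints that the launcher sets via env vars
                dist.init_parallel_env()

                model = paddle.DataParallel(paddle.nn.Linear(10, 10))
                opt = paddle.optimizer.SGD(
                    learning_rate=0.01, parameters=model.parameters())

                for _ in range(10):
                    loss = model(paddle.randn([8, 10])).mean()
                    loss.backward()
                    opt.step()
                    opt.clear_grad()

            if __name__ == '__main__':
                main()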

    Examples 2 (collective, multi node):
        .. code-block:: bash
            :name: code-block-example-bash2

            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17

            # On 192.168.0.16:

            python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01

            # On 192.168.0.17:
            python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01

    Examples 3 (ps, cpu, single node):
        .. code-block:: bash
            :name: code-block-example-bash3

            # To simulate a distributed environment using a single node, e.g., 2 servers and 4 workers.

            python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
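
        A minimal parameter-server ``train.py`` sketch for the command above, using the ``fleet`` API in static-graph mode; the model, data layout and strategy settings are placeholder assumptions, not required names:

        .. code-block:: python
            :name: code-block-example-ps-train-sketch

            import paddle
            from paddle.distributed import fleet

            def main():
                paddle.enable_static()
                fleet.init(is_collective=False)

                x = paddle.static.data(name='x', shape=[None, 10], dtype='float32')
                y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
                pred = paddle.static.nn.fc(x, size=1)
                loss = paddle.mean(
                    paddle.nn.functional.square_error_cost(pred, y))

                strategy = fleet.DistributedStrategy()
                strategy.a_sync = True  # asynchronous parameter-server training
                opt = fleet.distributed_optimizer(
                    paddle.optimizer.SGD(learning_rate=0.01), strategy)
                opt.minimize(loss)

                if fleet.is_server():
                    fleet.init_server()
                    fleet.run_server()
                else:
                    fleet.init_worker()
                    # ... feed data and run the main program with an Executor ...
                    fleet.stop_worker()

            if __name__ == '__main__':
                main()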

    Examples 4 (ps, cpu, multi node):
        .. code-block:: bash
            :name: code-block-example-bash4

            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17, where each node runs 1 server and 2 workers.

            # On 192.168.0.16:

            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01

            # On 192.168.0.17:

            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01

            # Or with a master, the following command runs 2 servers and 2 trainers on each node.

            python -m paddle.distributed.launch --master 192.168.0.16:9090 --server_num=2 --trainer_num=2 --nnodes 2 train.py


    Examples 5 (ps, gpu, single node):
        .. code-block:: bash
            :name: code-block-example-bash5

            # To simulate a distributed environment using a single node, e.g., 2 servers and 4 workers, each worker using a single GPU.

            export CUDA_VISIBLE_DEVICES=0,1,2,3
            python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01

    Examples 6 (ps, gpu, multi node):
        .. code-block:: bash
            :name: code-block-example-bash6

            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17, where each node runs 1 server and 2 workers.

            # On 192.168.0.16:

            export CUDA_VISIBLE_DEVICES=0,1
            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01

            # On 192.168.0.17:

            export CUDA_VISIBLE_DEVICES=0,1
            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01

    Examples 7 (ps-heter, cpu + gpu, single node):
        .. code-block:: bash
            :name: code-block-example-bash7

            # To simulate a distributed environment using a single node, e.g., 2 servers and 4 workers, two workers using GPU and two using CPU.

            export CUDA_VISIBLE_DEVICES=0,1
            python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01

    Examples 8 (ps-heter, cpu + gpu, multi node):
        .. code-block:: bash
            :name: code-block-example-bash8

            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17, where each node runs 1 server, 1 GPU worker and 1 CPU worker.

            # On 192.168.0.16:

            export CUDA_VISIBLE_DEVICES=0
            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01

            # On 192.168.0.17:

            export CUDA_VISIBLE_DEVICES=0
            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01

    Examples 9 (elastic):
        .. code-block:: bash
            :name: code-block-example-bash9

            # With the following command, the job will begin to run immediately if 4 nodes are ready,
            # or it will run after elastic_timeout if only 2 or 3 nodes are ready
            python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py

            # Once the number of nodes changes within 2:4 during training, the same strategy still holds

    Examples 10 (ipu):
        .. code-block:: bash
            :name: code-block-example-bash10

            # With the following command, the job will begin to run the distributed program with IPUs
            # Require `devices` as the number of IPUs
            # Require `training_script` to be set as `ipu`
            # Require `training_script_args` as the arguments of IPU distributed training instead of the arguments of the training program/script
            # Please check the `IPU Parameters` for details
            python -m paddle.distributed.launch --devices 4 ipu --hosts=localhost --nproc_per_host=2 --ipus_per_replica=1 --ipu_partition=pod16 --vipu_server=127.0.0.1 train.py

    Examples 11 (rpc, cpu, single node):
        .. code-block:: bash
            :name: code-block-example-bash11

            # Training on a single node with two local servers
            python -m paddle.distributed.launch --master 127.0.0.1:8765 --nnodes 1 --nproc_per_node 2 --rank 0 --run_mode rpc train.py

    Examples 12 (rpc, cpu, multi node):
        .. code-block:: bash
            :name: code-block-example-bash12

            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17, where each node runs 2 servers.

            # On 192.168.0.16

            python -m paddle.distributed.launch --master 192.168.0.16:8765 --nnodes 2 --nproc_per_node 2 --rank 0 --run_mode rpc train.py

            # On 192.168.0.17

            python -m paddle.distributed.launch --master 192.168.0.16:8765 --nnodes 2 --nproc_per_node 2 --rank 1 --run_mode rpc train.py

    """
    # Body elided: based on the parsed arguments, the launcher dispatches to the
    # legacy ``paddle.distributed.fleet`` entry point, to the auto-tuner flow
    # (JSON config, etcd-based coordination, per-task retry and metric collection),
    # or to the controllers for collective/ps/rpc jobs.
    ...


if __name__ == '__main__':
    launch()