#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Distributed process launcher.
This code is modified from https://github.com/pytorch/pytorch/blob/v1.3.0/torch/distributed/launch.py.
"""
import os
import subprocess
import sys
from argparse import ArgumentParser
from argparse import REMAINDER


def parse_args():
    """Parse command-line arguments."""
    parser = ArgumentParser(
        description="PyTorch distributed training launch "
        "helper utility that will spawn "
        "multiple distributed processes"
    )
    # Optional arguments for the launch helper
    parser.add_argument(
        "--nnodes",
        type=int,
        default=1,
        help="The number of nodes to use for distributed training",
    )
    parser.add_argument(
        "--node_rank",
        type=int,
        default=0,
        help="The rank of the node for multi-node distributed training",
    )
    parser.add_argument(
        "--nproc_per_node",
        type=int,
        default=1,
        help="The number of processes to launch on each node. "
        "For GPU training, this is recommended to be set "
        "to the number of GPUs in your system so that "
        "each process can be bound to a single GPU.",
    )
    parser.add_argument(
        "--master_addr",
        default="127.0.0.1",
        type=str,
        help="Master node (rank 0)'s address. It should be either "
        "the IP address or the hostname of node 0. For "
        "single-node multi-process training, "
        "--master_addr can simply be 127.0.0.1.",
    )
    parser.add_argument(
        "--master_port",
        default=29500,
        type=int,
        help="Master node (rank 0)'s free port that needs to "
        "be used for communication during distributed "
        "training",
    )
    parser.add_argument(
        "--use_env",
        default=False,
        action="store_true",
        help="Use an environment variable to pass the "
        "local rank. For legacy reasons, the default value is False. "
        "If set to True, the script will not pass "
        "--local_rank as an argument, and will instead set LOCAL_RANK.",
    )
    parser.add_argument(
        "-m",
        "--module",
        default=False,
        action="store_true",
        help="Changes each process to interpret the launch script "
        "as a Python module, executing with the same behavior as "
        "'python -m'.",
    )
    parser.add_argument(
        "-c",
        "--command",
        default=False,
        action="store_true",
        help="Changes each process to interpret the launch script "
        "as a command.",
    )
    # positional arguments
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single-GPU training "
        "program/script/command to be launched in parallel, "
        "followed by all the arguments for the "
        "training script",
    )
    # the rest of the arguments are passed through to the training script
    parser.add_argument("training_script_args", nargs=REMAINDER)
    return parser.parse_args()
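
# Note: argparse.REMAINDER makes the parser stop at the positional
# training_script and collect everything after it verbatim. For example
# (hypothetical script and flags),
#   python launch.py --nproc_per_node=2 train.py --lr 0.1
# yields args.training_script == "train.py" and
# args.training_script_args == ["--lr", "0.1"].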


def main():
    """Launch distributed processes."""
    args = parse_args()

    # world size in terms of the number of processes across all nodes
    dist_world_size = args.nproc_per_node * args.nnodes

    # set PyTorch distributed related environment variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    processes = []

    if "OMP_NUM_THREADS" not in os.environ and args.nproc_per_node > 1:
        current_env["OMP_NUM_THREADS"] = str(1)
        print(
            "*****************************************\n"
            "Setting OMP_NUM_THREADS environment variable for each process "
            "to {} by default to avoid your system being overloaded. "
            "Please further tune the variable for optimal performance in "
            "your application as needed.\n"
            "*****************************************".format(
                current_env["OMP_NUM_THREADS"]
            )
        )

    for local_rank in range(0, args.nproc_per_node):
        # each process's global rank across all nodes
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)
        current_env["LOCAL_RANK"] = str(local_rank)

        # build the command: either run the script directly as a command,
        # or run it through the Python interpreter (optionally as a module)
        if args.command:
            cmd = [args.training_script]
        else:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
            cmd.append(args.training_script)

        if not args.use_env:
            cmd.append("--local_rank={}".format(local_rank))

        cmd.extend(args.training_script_args)

        # spawn the process; Popen snapshots the environment at spawn time,
        # so mutating current_env on the next iteration does not affect it
        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)

    for process in processes:
        process.wait()
        if process.returncode != 0:
            # report the failed process's own command line
            # (the local variable cmd would refer to the last command spawned)
            raise subprocess.CalledProcessError(
                returncode=process.returncode, cmd=process.args
            )


if __name__ == "__main__":
    main()
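
# For reference, a training script launched by this helper would typically
# pick up the rank information along these lines (a minimal sketch assuming
# PyTorch is available; this launcher itself does not import torch):
#
#   import os
#   import torch.distributed as dist
#
#   local_rank = int(os.environ.get("LOCAL_RANK", 0))  # set when --use_env is given
#   dist.init_process_group(backend="nccl", init_method="env://")
#   # init_method="env://" reads MASTER_ADDR, MASTER_PORT, WORLD_SIZE and RANK,
#   # exactly the variables exported by main() above.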