# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import torch
def distributed(opt, is_nocuda):
    """Set up single-node distributed training state for this process.

    Reads the OpenMPI environment variables (``OMPI_COMM_WORLD_*``) to
    determine the world/local sizes and ranks, pins the process to its
    local GPU, and initializes the NCCL process group when more than one
    process is running. Falls back to CPU when CUDA is unavailable or
    explicitly disabled.

    Args:
        opt: Configuration dict. Optional key "master_port" overrides the
            default rendezvous port "35551".
        is_nocuda: When True, force CPU mode even if CUDA is available.

    Returns:
        Tuple of ``(device, n_gpu, world_size, local_size, rank,
        local_rank, is_master, run)``; ``run`` is always ``None`` (kept
        so the caller-visible tuple shape is unchanged).
    """
    world_size = 1
    local_size = 1
    rank = 0
    local_rank = 0
    is_master = True
    run = None

    if is_nocuda or not torch.cuda.is_available():
        device = torch.device("cpu")
        n_gpu = 0
    else:
        # Launched under mpirun: pick up process topology from OpenMPI.
        if "OMPI_COMM_WORLD_SIZE" in os.environ:
            world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
            local_size = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"])
            rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
            local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
            is_master = rank == 0
        # Bind this process to its GPU before any CUDA work happens.
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # The following assumes that all processes run on a single node.
        if torch.distributed.is_available() and world_size > 1:
            # Environment-variable initialization: WORLD_SIZE/RANK plus a
            # localhost rendezvous address synchronize the per-GPU workers.
            os.environ["WORLD_SIZE"] = str(world_size)
            os.environ["RANK"] = str(rank)
            os.environ["MASTER_ADDR"] = "127.0.0.1"
            # str() guards against a numeric port in the config; os.environ
            # values must be strings.
            os.environ["MASTER_PORT"] = str(opt.get("master_port", "35551"))
            torch.distributed.init_process_group(
                backend="nccl"
            )  # using environment variable initialization
            print("Distributed package is available. Process group initialized.")

    return device, n_gpu, world_size, local_size, rank, local_rank, is_master, run