# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import torch
from torch.utils.tensorboard import SummaryWriter
import random
import numpy as np
from pkg_resources import parse_version
from model.third_party.HMNet.Models.Trainers.BaseTrainer import BaseTrainer
from model.third_party.HMNet.Utils.GeneralUtils import bcolors
from model.third_party.HMNet.Utils.distributed import distributed


class DistributedTrainer(BaseTrainer):
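    """BaseTrainer specialized for multi-process (DDP-style) training.

    Seeds the RNGs of each process, unpacks device / world-size / rank
    metadata from the distributed() helper, and gives every rank its own
    log file, while TensorBoard output, run-folder creation, and config
    saving are done by rank 0 only.
    """
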
    def __init__(self, opt):
        super().__init__(opt)

        self.seed = int(self.opt["SEED"]) if "SEED" in self.opt else 0
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
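
        # Unpack the process-group description (device, world size, local size,
        # rank, local rank, run context) returned by the distributed() helper;
        # the second argument flags the no-CUDA case.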
        (
            self.opt["device"],
            _,
            self.opt["world_size"],
            self.opt["local_size"],
            self.opt["rank"],
            self.opt["local_rank"],
            _,
            self.opt["run"],
        ) = distributed(opt, not self.use_cuda)

        self.getSaveFolder()
        self.opt["logFile"] = f"log_{self.opt['rank']}.txt"
        self.saveConf()

        self.high_pytorch_version = parse_version(torch.__version__) >= parse_version(
            "1.2.0"
        )
        if self.opt["rank"] == 0:
            print(
                bcolors.OKGREEN,
                torch.__version__,
                bcolors.ENDC,
                "is",
                "high" if self.high_pytorch_version else "low",
            )

        if self.use_cuda:
            # torch.cuda.manual_seed_all(self.seed)
            # ddp: only set the seed on the GPU associated with this process
            torch.cuda.manual_seed(self.seed)

        # ddp: print stats and update learning rate
        if self.opt["rank"] == 0:
            print(
                "Number of GPUs is",
                bcolors.OKGREEN,
                self.opt["world_size"],
                bcolors.ENDC,
            )
            # print('Boost learning rate from', bcolors.OKGREEN, self.opt['START_LEARNING_RATE'], bcolors.ENDC, 'to',
            #       bcolors.OKGREEN, self.opt['START_LEARNING_RATE'] * self.opt['world_size'], bcolors.ENDC)
            print(
                "Effective batch size is increased from",
                bcolors.OKGREEN,
                self.opt["MINI_BATCH"],
                bcolors.ENDC,
                "to",
                bcolors.OKGREEN,
                self.opt["MINI_BATCH"] * self.opt["world_size"],
                bcolors.ENDC,
            )
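
        # Gradient accumulation multiplies the effective batch size further;
        # default to a single accumulation step when GRADIENT_ACCUMULATE_STEP
        # is not configured.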
        self.grad_acc_steps = 1
        if "GRADIENT_ACCUMULATE_STEP" in self.opt:
            if self.opt["rank"] == 0:
                print(
                    "Gradient accumulation steps =",
                    bcolors.OKGREEN,
                    self.opt["GRADIENT_ACCUMULATE_STEP"],
                    bcolors.ENDC,
                )
                # print('Boost learning rate from', bcolors.OKGREEN, self.opt['START_LEARNING_RATE'], bcolors.ENDC, 'to',
                #       bcolors.OKGREEN, self.opt['START_LEARNING_RATE'] * self.opt['world_size'] * self.opt['GRADIENT_ACCUMULATE_STEP'], bcolors.ENDC)
                print(
                    "Effective batch size =",
                    bcolors.OKGREEN,
                    self.opt["MINI_BATCH"]
                    * self.opt["world_size"]
                    * self.opt["GRADIENT_ACCUMULATE_STEP"],
                    bcolors.ENDC,
                )
            self.grad_acc_steps = int(self.opt["GRADIENT_ACCUMULATE_STEP"])
            # self.opt['START_LEARNING_RATE'] *= self.opt['world_size'] * self.grad_acc_steps

    def tb_log_scalar(self, name, value, step):
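        # TensorBoard scalars are written by rank 0 only; the SummaryWriter is
        # created lazily under <saveFolder>/tensorboard on first use.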
if self.opt["rank"] == 0:
if self.tb_writer is None:
self.tb_writer = SummaryWriter(
os.path.join(self.saveFolder, "tensorboard")
)
self.tb_writer.add_scalar(name, value, step)

    def log(self, s):
        # When the 'OFFICIAL' flag is set in the config file, the program does not output logs
        if self.is_official:
            return
        try:
            if self.logFileHandle is None:
                self.logFileHandle = open(
                    os.path.join(self.saveFolder, self.opt["logFile"]), "a"
                )
            self.logFileHandle.write(s + "\n")
        except Exception as e:
            print("ERROR while writing log file:", e)
        print(s)

    def getSaveFolder(self):
        runid = 1
        while True:
            saveFolder = os.path.join(
                self.opt["datadir"],
                self.opt["basename"] + "_conf~",
                "run_" + str(runid),
            )
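            # Only rank 0 creates the fresh run_<runid> folder; the barriers
            # keep the ranks in step so every process settles on the same
            # run folder before returning.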
            if not os.path.isdir(saveFolder):
                if self.opt["world_size"] > 1:
                    torch.distributed.barrier()
                if self.opt["rank"] == 0:
                    os.makedirs(saveFolder)
                self.saveFolder = saveFolder
                if self.opt["world_size"] > 1:
                    torch.distributed.barrier()
                print(
                    "Saving logs, model, checkpoint, and evaluation in "
                    + self.saveFolder
                )
                return
            runid = runid + 1

    def saveConf(self):
        if self.opt["rank"] == 0:
            super().saveConf()