import timeit
import os.path as osp

import torch
import torch.distributed as dist

from .cfg_holder import cfg_unique_holder as cfguh
from . import sync

# When True, only the local-rank-0 process prints to the console.
print_console_local_rank0_only = True

def print_log(*console_info):
    """Print to the console (local rank 0 only by default) and append the
    same line to the log file configured in cfg.train or cfg.eval."""
    local_rank = sync.get_rank('local')
    if print_console_local_rank0_only and (local_rank != 0):
        return
    console_info = ' '.join(str(i) for i in console_info)
    print(console_info)

    # Only local rank 0 writes to the log file.
    if local_rank != 0:
        return
    log_file = None
    try:
        log_file = cfguh().cfg.train.log_file
    except Exception:
        try:
            log_file = cfguh().cfg.eval.log_file
        except Exception:
            return
    if log_file is not None:
        with open(log_file, 'a') as f:
            f.write(console_info + '\n')
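
# A minimal usage sketch (the values are illustrative; assumes the cfg holder
# has been populated so cfg.train.log_file resolves):
#
#     print_log('Iter:', 100, 'Loss:', 0.2573)
#
# prints "Iter: 100 Loss: 0.2573" on local rank 0 and appends the same line
# to the configured log file.
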
class distributed_log_manager(object):
    def __init__(self):
        self.sum = {}
        self.cnt = {}
        self.time_check = timeit.default_timer()

        cfgt = cfguh().cfg.train
        use_tensorboard = getattr(cfgt, 'log_tensorboard', False)

        self.ddp = sync.is_ddp()
        self.rank = sync.get_rank('local')
        self.world_size = sync.get_world_size('local')

        # Only local rank 0 writes TensorBoard events.
        self.tb = None
        if use_tensorboard and (self.rank == 0):
            import tensorboardX
            monitoring_dir = osp.join(cfguh().cfg.train.log_dir, 'tensorboard')
            self.tb = tensorboardX.SummaryWriter(monitoring_dir)
    def accumulate(self, n, **data):
        """Accumulate per-sample values over n samples, so the mean reported
        later is the sample-weighted average (sum/cnt per key)."""
        if n < 0:
            raise ValueError('n must be a non-negative sample count')
        for itemn, di in data.items():
            if itemn in self.sum:
                self.sum[itemn] += di * n
                self.cnt[itemn] += n
            else:
                self.sum[itemn] = di * n
                self.cnt[itemn] = n
    def get_mean_value_dict(self):
        # Per-key sample-weighted means, in sorted key order so every rank
        # reduces the same tensor layout.
        value_gather = [
            self.sum[itemn] / self.cnt[itemn]
            for itemn in sorted(self.sum.keys())]
        # Assumes one CUDA device per local rank.
        value_gather_tensor = torch.FloatTensor(value_gather).to(self.rank)
        if self.ddp:
            # Sum across ranks, then divide to get the cross-rank average.
            dist.all_reduce(value_gather_tensor, op=dist.ReduceOp.SUM)
            value_gather_tensor /= self.world_size
        mean = {}
        for idx, itemn in enumerate(sorted(self.sum.keys())):
            mean[itemn] = value_gather_tensor[idx].item()
        return mean
    def tensorboard_log(self, step, data, mode='train', **extra):
        if self.tb is None:
            return
        if mode == 'train':
            self.tb.add_scalar('other/epochn', extra['epochn'], step)
            if 'lr' in extra:
                self.tb.add_scalar('other/lr', extra['lr'], step)
            for itemn, di in data.items():
                # Keys prefixed with 'loss' go under loss/, the total 'Loss'
                # gets its own tag, everything else goes under other/.
                if itemn.startswith('loss'):
                    self.tb.add_scalar('loss/' + itemn, di, step)
                elif itemn == 'Loss':
                    self.tb.add_scalar('Loss', di, step)
                else:
                    self.tb.add_scalar('other/' + itemn, di, step)
        elif mode == 'eval':
            if isinstance(data, dict):
                for itemn, di in data.items():
                    self.tb.add_scalar('eval/' + itemn, di, step)
            else:
                self.tb.add_scalar('eval', data, step)
    def train_summary(self, itern, epochn, samplen, lr, tbstep=None):
        console_info = [
            'Iter:{}'.format(itern),
            'Epoch:{}'.format(epochn),
            'Sample:{}'.format(samplen)]
        if lr is not None:
            console_info += ['LR:{:.4E}'.format(lr)]

        mean = self.get_mean_value_dict()

        tbstep = itern if tbstep is None else tbstep
        self.tensorboard_log(
            tbstep, mean, mode='train',
            itern=itern, epochn=epochn, lr=lr)

        # The accumulated total 'Loss' is printed first, followed by the
        # individual 'loss*' components.
        loss = mean.pop('Loss')
        mean_info = ['Loss:{:.4f}'.format(loss)] + [
            '{}:{:.4f}'.format(itemn, mean[itemn])
            for itemn in sorted(mean.keys())
            if itemn.startswith('loss')]
        console_info += mean_info
        console_info.append('Time:{:.2f}s'.format(
            timeit.default_timer() - self.time_check))
        return ' , '.join(console_info)
    def clear(self):
        self.sum = {}
        self.cnt = {}
        self.time_check = timeit.default_timer()

    def tensorboard_close(self):
        if self.tb is not None:
            self.tb.close()
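
# A sketch of the intended training-loop usage (`loader`, `bs`, `log_every`
# and the loss values are illustrative, not part of this module):
#
#     logm = distributed_log_manager()
#     for itern, batch in enumerate(loader):
#         ...
#         logm.accumulate(bs, Loss=loss.item(), loss_ce=ce.item())
#         if (itern + 1) % log_every == 0:
#             print_log(logm.train_summary(itern, epochn, samplen, lr))
#             logm.clear()  # reset the running sums and the timer
#     logm.tensorboard_close()
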
# ----- also include some small utils -----
def torch_to_numpy(*argv):
    """Recursively convert tensors (including tensors nested inside lists,
    tuples and dicts) to numpy arrays; non-tensor leaves pass through."""
    if len(argv) > 1:
        data = list(argv)
    else:
        data = argv[0]

    if isinstance(data, torch.Tensor):
        return data.to('cpu').detach().numpy()
    elif isinstance(data, (list, tuple)):
        return [torch_to_numpy(di) for di in data]
    elif isinstance(data, dict):
        return {ni: torch_to_numpy(di) for ni, di in data.items()}
    else:
        return data
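
# Examples (tx, ty, ta, tb stand for arbitrary torch tensors):
#     x, y = torch_to_numpy(tx, ty)                    # several tensors -> list
#     out = torch_to_numpy({'a': ta, 'b': [tb, 1.0]})  # nested containers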