Fix redundant outputs via Logging in DDP training (#500)
* Change print to logging
* Clean function set_logging
* Add line spacing
* Change leftover prints to log
* Fix scanning labels output
* Fix rank naming
* Change leftover print to logging
* Reorganize DDP variables
* Fix type error
* Make quotes consistent
* Fix spelling
* Clean function call
* Add line spacing
* Update datasets.py
* Update train.py
Co-authored-by: Glenn Jocher <[email protected]>
- models/yolo.py +4 -2
- train.py +26 -24
- utils/datasets.py +13 -9
- utils/general.py +7 -0
- utils/torch_utils.py +6 -4
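Why the outputs were redundant: under DDP, `python -m torch.distributed.launch` starts one Python process per GPU, and every process runs the same script, so each unguarded `print` appears once per GPU. A minimal standalone sketch (illustrative only, not part of this diff) of the failure mode and the rank guard:

```python
# Hypothetical snippet: why unguarded output duplicates under DDP.
# torch.distributed.launch exports RANK per process; without a launcher
# the variable is absent and we fall back to -1 (single-process run).
import os

rank = int(os.environ.get('RANK', -1))

print('hello')           # emitted once per process: N GPUs -> N duplicate lines
if rank in [-1, 0]:
    print('hello once')  # emitted only by the master (or single-GPU) process
```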
models/yolo.py
```diff
@@ -1,5 +1,6 @@
 import argparse
 import math
+import logging
 from copy import deepcopy
 from pathlib import Path
 
@@ -12,6 +13,7 @@ from utils.general import check_anchor_order, make_divisible, check_file
 from utils.torch_utils import (
     time_synchronized, fuse_conv_and_bn, model_info, scale_img, initialize_weights, select_device)
 
+logger = logging.getLogger(__name__)
 
 class Detect(nn.Module):
     def __init__(self, nc=80, anchors=(), ch=()):  # detection layer
@@ -169,7 +171,7 @@ class Model(nn.Module):
 
 
 def parse_model(d, ch):  # model_dict, input_channels(3)
-    print('\n%3s%18s%3s%10s  %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
+    logger.info('\n%3s%18s%3s%10s  %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
     anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
     na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
     no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)
@@ -224,7 +226,7 @@ def parse_model(d, ch):  # model_dict, input_channels(3)
         t = str(m)[8:-2].replace('__main__.', '')  # module type
         np = sum([x.numel() for x in m_.parameters()])  # number params
         m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
-        print('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n, np, t, args))  # print
+        logger.info('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n, np, t, args))  # print
         save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
         layers.append(m_)
         ch.append(c2)
```
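All five files adopt the same standard-library idiom: a module-level named logger with no handlers of its own, so records propagate to the root logger and visibility is decided centrally by whoever configures it (here, `set_logging` in utils/general.py). A self-contained sketch of the idiom:

```python
# Sketch of the module-level logger idiom used above. The named logger
# attaches no handlers; a single logging.basicConfig(...) call in the
# entry script decides what is shown across every module.
import logging

logger = logging.getLogger(__name__)

def demo():
    logger.info('shown only if the root level is INFO or lower')

if __name__ == '__main__':
    logging.basicConfig(format="%(message)s", level=logging.INFO)
    demo()
```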
train.py
```diff
@@ -3,6 +3,7 @@ import math
 import os
 import random
 import time
+import logging
 from pathlib import Path
 
 import numpy as np
@@ -23,13 +24,14 @@ from utils.datasets import create_dataloader
 from utils.general import (
     torch_distributed_zero_first, labels_to_class_weights, plot_labels, check_anchors, labels_to_image_weights,
     compute_loss, plot_images, fitness, strip_optimizer, plot_results, get_latest_run, check_dataset, check_file,
-    check_git_status, check_img_size, increment_dir, print_mutation, plot_evolution)
+    check_git_status, check_img_size, increment_dir, print_mutation, plot_evolution, set_logging)
 from utils.google_utils import attempt_download
 from utils.torch_utils import init_seeds, ModelEMA, select_device, intersect_dicts
 
+logger = logging.getLogger(__name__)
 
 def train(hyp, opt, device, tb_writer=None):
-    print(f'Hyperparameters {hyp}')
+    logger.info(f'Hyperparameters {hyp}')
     log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
     wdir = str(log_dir / 'weights') + os.sep  # weights directory
     os.makedirs(wdir, exist_ok=True)
@@ -69,7 +71,7 @@ def train(hyp, opt, device, tb_writer=None):
         state_dict = ckpt['model'].float().state_dict()  # to FP32
         state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
         model.load_state_dict(state_dict, strict=False)  # load
-        print('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
+        logging.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
     else:
         model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create
 
@@ -103,7 +105,7 @@ def train(hyp, opt, device, tb_writer=None):
 
     optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
     optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
-    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
+    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2
 
     # Scheduler https://arxiv.org/pdf/1812.01187.pdf
@@ -128,7 +130,7 @@ def train(hyp, opt, device, tb_writer=None):
         # Epochs
         start_epoch = ckpt['epoch'] + 1
         if epochs < start_epoch:
-            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
+            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                   (weights, ckpt['epoch'], epochs))
             epochs += ckpt['epoch']  # finetune additional epochs
 
@@ -145,7 +147,7 @@ def train(hyp, opt, device, tb_writer=None):
     # SyncBatchNorm
     if opt.sync_bn and cuda and rank != -1:
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
-        print('Using SyncBatchNorm()')
+        logger.info('Using SyncBatchNorm()')
 
     # Exponential moving average
     ema = ModelEMA(model) if rank in [-1, 0] else None
@@ -156,7 +158,7 @@ def train(hyp, opt, device, tb_writer=None):
 
     # Trainloader
     dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True,
-                                            cache=opt.cache_images, rect=opt.rect, local_rank=rank,
+                                            cache=opt.cache_images, rect=opt.rect, rank=rank,
                                             world_size=opt.world_size)
     mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
     nb = len(dataloader)  # number of batches
@@ -166,7 +168,7 @@ def train(hyp, opt, device, tb_writer=None):
     if rank in [-1, 0]:
         # local_rank is set to -1. Because only the first process is expected to do evaluation.
         testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False,
-                                       cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0]
+                                       cache=opt.cache_images, rect=True, rank=-1, world_size=opt.world_size)[0]
 
     # Model parameters
     hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
@@ -199,10 +201,9 @@ def train(hyp, opt, device, tb_writer=None):
     results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
     scheduler.last_epoch = start_epoch - 1  # do not move
     scaler = amp.GradScaler(enabled=cuda)
-    if rank in [0, -1]:
-        print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
-        print('Using %g dataloader workers' % dataloader.num_workers)
-        print('Starting training for %g epochs...' % epochs)
+    logger.info('Image sizes %g train, %g test' % (imgsz, imgsz_test))
+    logger.info('Using %g dataloader workers' % dataloader.num_workers)
+    logger.info('Starting training for %g epochs...' % epochs)
     # torch.autograd.set_detect_anomaly(True)
     for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
         model.train()
@@ -232,8 +233,8 @@ def train(hyp, opt, device, tb_writer=None):
         if rank != -1:
             dataloader.sampler.set_epoch(epoch)
         pbar = enumerate(dataloader)
+        logging.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
         if rank in [-1, 0]:
-            print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
             pbar = tqdm(pbar, total=nb)  # progress bar
         optimizer.zero_grad()
         for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
@@ -269,7 +270,7 @@ def train(hyp, opt, device, tb_writer=None):
                 if rank != -1:
                     loss *= opt.world_size  # gradient averaged between devices in DDP mode
                 # if not torch.isfinite(loss):
-                #     print('WARNING: non-finite loss, ending training ', loss_items)
+                #     logger.info('WARNING: non-finite loss, ending training ', loss_items)
                 #     return results
 
                 # Backward
@@ -369,7 +370,7 @@ def train(hyp, opt, device, tb_writer=None):
     # Finish
     if not opt.evolve:
         plot_results(save_dir=log_dir)  # save as results.png
-        print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
+        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
 
     dist.destroy_process_group() if rank not in [-1, 0] else None
     torch.cuda.empty_cache()
@@ -404,13 +405,19 @@ if __name__ == '__main__':
     parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
     opt = parser.parse_args()
 
+    # Set DDP variables
+    opt.total_batch_size = opt.batch_size
+    opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
+    opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
+    set_logging(opt.global_rank)
+
     # Resume
     if opt.resume:
         last = get_latest_run() if opt.resume == 'get_last' else opt.resume  # resume from most recent run
         if last and not opt.weights:
-            print(f'Resuming training from {last}')
+            logger.info(f'Resuming training from {last}')
         opt.weights = last if opt.resume and not opt.weights else opt.weights
-    if opt.local_rank == -1 or ("RANK" in os.environ and os.environ["RANK"] == "0"):
+    if opt.global_rank in [-1,0]:
         check_git_status()
 
     opt.hyp = opt.hyp or ('data/hyp.finetune.yaml' if opt.weights else 'data/hyp.scratch.yaml')
@@ -419,9 +426,6 @@ if __name__ == '__main__':
 
     opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
     device = select_device(opt.device, batch_size=opt.batch_size)
-    opt.total_batch_size = opt.batch_size
-    opt.world_size = 1
-    opt.global_rank = -1
 
     # DDP mode
     if opt.local_rank != -1:
@@ -429,12 +433,10 @@ if __name__ == '__main__':
         torch.cuda.set_device(opt.local_rank)
         device = torch.device('cuda', opt.local_rank)
         dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
-        opt.world_size = dist.get_world_size()
-        opt.global_rank = dist.get_rank()
         assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
         opt.batch_size = opt.total_batch_size // opt.world_size
 
-    print(opt)
+    logger.info(opt)
     with open(opt.hyp) as f:
         hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps
 
@@ -442,7 +444,7 @@ if __name__ == '__main__':
     if not opt.evolve:
         tb_writer = None
         if opt.global_rank in [-1, 0]:
-            print('Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir)
+            logger.info('Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir)
             tb_writer = SummaryWriter(log_dir=increment_dir(Path(opt.logdir) / 'exp', opt.name))  # runs/exp
 
     train(hyp, opt, device, tb_writer)
```
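Reading `WORLD_SIZE` and `RANK` from the environment, rather than from `dist.get_world_size()`/`dist.get_rank()` after `init_process_group`, makes the rank available before any distributed setup, so `set_logging(opt.global_rank)` can silence non-master ranks from the very first line of output. A standalone check of those reads (hypothetical file name; `torch.distributed.launch` exports both variables per process):

```python
# check_ddp_env.py -- illustrative sketch of the environment reads the diff
# adds to train.py. Without a DDP launcher the variables are absent and the
# defaults below reproduce single-process behavior.
import os

world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
print(f'world_size={world_size} global_rank={global_rank}')
```

Note that two calls remain as `logging.info(...)` rather than `logger.info(...)`; both route through the root logger that `set_logging` configures, so the printed output is the same either way.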
utils/datasets.py
```diff
@@ -47,9 +47,9 @@ def exif_size(img):
 
 
 def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False,
-                      local_rank=-1, world_size=1):
+                      rank=-1, world_size=1):
     # Make sure only the first process in DDP process the dataset first, and the following others can use the cache.
-    with torch_distributed_zero_first(local_rank):
+    with torch_distributed_zero_first(rank):
         dataset = LoadImagesAndLabels(path, imgsz, batch_size,
                                       augment=augment,  # augment images
                                       hyp=hyp,  # augmentation hyperparameters
@@ -57,11 +57,12 @@ def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False,
                                       cache_images=cache,
                                       single_cls=opt.single_cls,
                                       stride=int(stride),
-                                      pad=pad)
+                                      pad=pad,
+                                      rank=rank)
 
     batch_size = min(batch_size, len(dataset))
     nw = min([os.cpu_count() // world_size, batch_size if batch_size > 1 else 0, 8])  # number of workers
-    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if local_rank != -1 else None
+    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if rank != -1 else None
     dataloader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              num_workers=nw,
@@ -292,7 +293,7 @@ class LoadStreams:  # multiple IP or RTSP cameras
 
 class LoadImagesAndLabels(Dataset):  # for training/testing
     def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
-                 cache_images=False, single_cls=False, stride=32, pad=0.0):
+                 cache_images=False, single_cls=False, stride=32, pad=0.0, rank=-1):
         try:
             f = []  # image files
             for p in path if isinstance(path, list) else [path]:
@@ -372,8 +373,10 @@ class LoadImagesAndLabels(Dataset):  # for training/testing
         # Cache labels
         create_datasubset, extract_bounding_boxes, labels_loaded = False, False, False
         nm, nf, ne, ns, nd = 0, 0, 0, 0, 0  # number missing, found, empty, datasubset, duplicate
-        pbar = tqdm(self.label_files)
-        for i, file in enumerate(pbar):
+        pbar = enumerate(self.label_files)
+        if rank in [-1, 0]:
+            pbar = tqdm(pbar)
+        for i, file in pbar:
             l = self.labels[i]  # label
             if l is not None and l.shape[0]:
                 assert l.shape[1] == 5, '> 5 label columns: %s' % file
@@ -420,8 +423,9 @@ class LoadImagesAndLabels(Dataset):  # for training/testing
                 ne += 1  # print('empty labels for image %s' % self.img_files[i])  # file empty
                 # os.system("rm '%s' '%s'" % (self.img_files[i], self.label_files[i]))  # remove
 
-            pbar.desc = 'Scanning labels %s (%g found, %g missing, %g empty, %g duplicate, for %g images)' % (
-                cache_path, nf, nm, ne, nd, n)
+            if rank in [-1,0]:
+                pbar.desc = 'Scanning labels %s (%g found, %g missing, %g empty, %g duplicate, for %g images)' % (
+                    cache_path, nf, nm, ne, nd, n)
         if nf == 0:
             s = 'WARNING: No labels found in %s. See %s' % (os.path.dirname(file) + os.sep, help_url)
             print(s)
```
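The label-scanning change is the usual rank-gated progress-bar pattern: every rank iterates the same sequence, but only the master wraps it in tqdm, so a single bar is drawn instead of one per GPU. A minimal sketch of the pattern (illustrative function, not the repo's API):

```python
# Rank-gated progress bar, as used in LoadImagesAndLabels above: all ranks
# do the same per-item work, but only rank -1 (no DDP) or 0 draws the bar.
from tqdm import tqdm

def scan(items, rank=-1):
    pbar = enumerate(items)
    if rank in [-1, 0]:
        pbar = tqdm(pbar, total=len(items))  # master process: wrap in a bar
    for i, item in pbar:
        # ... per-item work runs on every rank ...
        if rank in [-1, 0]:
            pbar.desc = 'Scanning %d/%d' % (i + 1, len(items))  # tqdm-only attribute

scan(list(range(100)), rank=-1)  # draws a bar
scan(list(range(100)), rank=1)   # silent
```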
utils/general.py
```diff
@@ -5,6 +5,7 @@ import random
 import shutil
 import subprocess
 import time
+import logging
 from contextlib import contextmanager
 from copy import copy
 from pathlib import Path
@@ -45,6 +46,12 @@ def torch_distributed_zero_first(local_rank: int):
         torch.distributed.barrier()
 
 
+def set_logging(rank=-1):
+    logging.basicConfig(
+        format="%(message)s",
+        level=logging.INFO if rank in [-1, 0] else logging.WARN)
+
+
 def init_seeds(seed=0):
     random.seed(seed)
     np.random.seed(seed)
```
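`set_logging` is the single switch behind the whole fix: the root logger is set to INFO on rank -1/0 and WARN everywhere else, so every `logger.info` in the repo goes quiet on non-master ranks while warnings still surface. A quick behavioral check using the function exactly as defined above:

```python
import logging

def set_logging(rank=-1):
    logging.basicConfig(
        format="%(message)s",
        level=logging.INFO if rank in [-1, 0] else logging.WARN)

set_logging(rank=1)               # pretend to be a non-master DDP rank
log = logging.getLogger(__name__)
log.info('suppressed on rank 1')  # below WARN, not printed
log.warning('still printed')      # warnings surface on every rank
```

Since `logging.basicConfig` is a no-op once the root logger already has handlers, it must run once per process before any logging call, which is why train.py invokes it immediately after parsing arguments.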
utils/torch_utils.py
```diff
@@ -1,6 +1,7 @@
 import math
 import os
 import time
+import logging
 from copy import deepcopy
 
 import torch
@@ -9,6 +10,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torchvision.models as models
 
+logger = logging.getLogger(__name__)
 
 def init_seeds(seed=0):
     torch.manual_seed(seed)
@@ -40,12 +42,12 @@ def select_device(device='', batch_size=None):
         for i in range(0, ng):
             if i == 1:
                 s = ' ' * len(s)
-            print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
+            logger.info("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
                   (s, i, x[i].name, x[i].total_memory / c))
     else:
-        print('Using CPU')
+        logger.info('Using CPU')
 
-    print('')  # skip a line
+    logger.info('')  # skip a line
     return torch.device('cuda:0' if cuda else 'cpu')
 
 
@@ -142,7 +144,7 @@ def model_info(model, verbose=False):
     except:
         fs = ''
 
-    print('Model Summary: %g layers, %g parameters, %g gradients%s' % (len(list(model.parameters())), n_p, n_g, fs))
+    logger.info('Model Summary: %g layers, %g parameters, %g gradients%s' % (len(list(model.parameters())), n_p, n_g, fs))
 
 
 def load_classifier(name='resnet101', n=2):
```
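Because the configured format is the bare `%(message)s`, `logger.info` output is character-for-character what `print` produced, including the blank line from `logger.info('')`, so master-rank console output is unchanged. A small sketch confirming that formatting assumption:

```python
# With format="%(message)s" there are no level/name prefixes, so swapping
# print for logger.info keeps the console identical on the master rank.
import logging

logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info('Using CPU')  # prints exactly: Using CPU
logger.info('')           # prints a blank line, like print('')
```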