Nanobit and glenn-jocher committed
Commit 4949401 · unverified · 1 parent: 542833c

Fix redundant outputs via Logging in DDP training (#500)


* Change print to logging

* Clean function set_logging

* Add line spacing

* Change leftover prints to log

* Fix scanning labels output

* Fix rank naming

* Change leftover print to logging

* Reorganized DDP variables

* Fix type error

* Make quotes consistent

* Fix spelling

* Clean function call

* Add line spacing

* Update datasets.py

* Update train.py

Co-authored-by: Glenn Jocher <[email protected]>

Files changed (5)
  1. models/yolo.py +4 -2
  2. train.py +26 -24
  3. utils/datasets.py +13 -9
  4. utils/general.py +7 -0
  5. utils/torch_utils.py +6 -4
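
In short: every informational print() becomes a logging call through a module-level logger, and a new set_logging() helper in utils/general.py sets the log level by process rank — INFO on the master process (rank -1 or 0), WARN on all other DDP workers — so startup and progress messages are emitted once instead of once per GPU. train.py now reads WORLD_SIZE and RANK from the environment immediately after argument parsing, so logging is configured before dist.init_process_group() runs. A minimal, self-contained sketch of the pattern (module names match the diffs below; the __main__ demo is illustrative only):

    import logging
    import os

    logger = logging.getLogger(__name__)  # per-module logger, as each changed file now creates


    def set_logging(rank=-1):
        # INFO only on the master process; other ranks are raised to WARN,
        # which silences their copies of informational output
        logging.basicConfig(format="%(message)s",
                            level=logging.INFO if rank in [-1, 0] else logging.WARN)


    if __name__ == '__main__':
        # torch.distributed.launch exports RANK/WORLD_SIZE into each worker's environment
        rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
        set_logging(rank)
        logger.info('emitted by the master process only')
        logger.warning('emitted by every process')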
models/yolo.py CHANGED
@@ -1,5 +1,6 @@
 import argparse
 import math
+import logging
 from copy import deepcopy
 from pathlib import Path
 
@@ -12,6 +13,7 @@ from utils.general import check_anchor_order, make_divisible, check_file
 from utils.torch_utils import (
     time_synchronized, fuse_conv_and_bn, model_info, scale_img, initialize_weights, select_device)
 
+logger = logging.getLogger(__name__)
 
 class Detect(nn.Module):
     def __init__(self, nc=80, anchors=(), ch=()):  # detection layer
@@ -169,7 +171,7 @@ class Model(nn.Module):
 
 
 def parse_model(d, ch):  # model_dict, input_channels(3)
-    print('\n%3s%18s%3s%10s  %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
+    logger.info('\n%3s%18s%3s%10s  %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
     anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
     na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
     no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)
@@ -224,7 +226,7 @@ def parse_model(d, ch):  # model_dict, input_channels(3)
         t = str(m)[8:-2].replace('__main__.', '')  # module type
         np = sum([x.numel() for x in m_.parameters()])  # number params
         m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
-        print('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n, np, t, args))  # print
+        logger.info('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n, np, t, args))  # print
         save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
         layers.append(m_)
         ch.append(c2)
train.py CHANGED
@@ -3,6 +3,7 @@ import math
 import os
 import random
 import time
+import logging
 from pathlib import Path
 
 import numpy as np
@@ -23,13 +24,14 @@ from utils.datasets import create_dataloader
 from utils.general import (
     torch_distributed_zero_first, labels_to_class_weights, plot_labels, check_anchors, labels_to_image_weights,
     compute_loss, plot_images, fitness, strip_optimizer, plot_results, get_latest_run, check_dataset, check_file,
-    check_git_status, check_img_size, increment_dir, print_mutation, plot_evolution)
+    check_git_status, check_img_size, increment_dir, print_mutation, plot_evolution, set_logging)
 from utils.google_utils import attempt_download
 from utils.torch_utils import init_seeds, ModelEMA, select_device, intersect_dicts
 
+logger = logging.getLogger(__name__)
 
 def train(hyp, opt, device, tb_writer=None):
-    print(f'Hyperparameters {hyp}')
+    logger.info(f'Hyperparameters {hyp}')
     log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
     wdir = str(log_dir / 'weights') + os.sep  # weights directory
     os.makedirs(wdir, exist_ok=True)
@@ -69,7 +71,7 @@ def train(hyp, opt, device, tb_writer=None):
         state_dict = ckpt['model'].float().state_dict()  # to FP32
         state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
         model.load_state_dict(state_dict, strict=False)  # load
-        print('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
+        logging.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
     else:
         model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create
 
@@ -103,7 +105,7 @@ def train(hyp, opt, device, tb_writer=None):
 
     optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
     optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
-    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
+    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2
 
     # Scheduler https://arxiv.org/pdf/1812.01187.pdf
@@ -128,7 +130,7 @@ def train(hyp, opt, device, tb_writer=None):
         # Epochs
         start_epoch = ckpt['epoch'] + 1
         if epochs < start_epoch:
-            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
-                  (weights, ckpt['epoch'], epochs))
+            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
+                        (weights, ckpt['epoch'], epochs))
             epochs += ckpt['epoch']  # finetune additional epochs
 
@@ -145,7 +147,7 @@ def train(hyp, opt, device, tb_writer=None):
     # SyncBatchNorm
     if opt.sync_bn and cuda and rank != -1:
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
-        print('Using SyncBatchNorm()')
+        logger.info('Using SyncBatchNorm()')
 
     # Exponential moving average
     ema = ModelEMA(model) if rank in [-1, 0] else None
@@ -156,7 +158,7 @@ def train(hyp, opt, device, tb_writer=None):
 
     # Trainloader
     dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True,
-                                            cache=opt.cache_images, rect=opt.rect, local_rank=rank,
+                                            cache=opt.cache_images, rect=opt.rect, rank=rank,
                                             world_size=opt.world_size)
     mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
     nb = len(dataloader)  # number of batches
@@ -166,7 +168,7 @@ def train(hyp, opt, device, tb_writer=None):
     if rank in [-1, 0]:
         # local_rank is set to -1. Because only the first process is expected to do evaluation.
         testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False,
-                                       cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0]
+                                       cache=opt.cache_images, rect=True, rank=-1, world_size=opt.world_size)[0]
 
     # Model parameters
     hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
@@ -199,10 +201,9 @@ def train(hyp, opt, device, tb_writer=None):
     results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
     scheduler.last_epoch = start_epoch - 1  # do not move
     scaler = amp.GradScaler(enabled=cuda)
-    if rank in [0, -1]:
-        print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
-        print('Using %g dataloader workers' % dataloader.num_workers)
-        print('Starting training for %g epochs...' % epochs)
+    logger.info('Image sizes %g train, %g test' % (imgsz, imgsz_test))
+    logger.info('Using %g dataloader workers' % dataloader.num_workers)
+    logger.info('Starting training for %g epochs...' % epochs)
     # torch.autograd.set_detect_anomaly(True)
     for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
         model.train()
@@ -232,8 +233,8 @@ def train(hyp, opt, device, tb_writer=None):
         if rank != -1:
             dataloader.sampler.set_epoch(epoch)
         pbar = enumerate(dataloader)
+        logging.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
         if rank in [-1, 0]:
-            print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
             pbar = tqdm(pbar, total=nb)  # progress bar
         optimizer.zero_grad()
         for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
@@ -269,7 +270,7 @@ def train(hyp, opt, device, tb_writer=None):
                 if rank != -1:
                     loss *= opt.world_size  # gradient averaged between devices in DDP mode
                 # if not torch.isfinite(loss):
-                #     print('WARNING: non-finite loss, ending training ', loss_items)
+                #     logger.info('WARNING: non-finite loss, ending training ', loss_items)
                 #     return results
 
             # Backward
@@ -369,7 +370,7 @@ def train(hyp, opt, device, tb_writer=None):
     # Finish
     if not opt.evolve:
         plot_results(save_dir=log_dir)  # save as results.png
-    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
+    logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
 
     dist.destroy_process_group() if rank not in [-1, 0] else None
     torch.cuda.empty_cache()
@@ -404,13 +405,19 @@ if __name__ == '__main__':
     parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
     opt = parser.parse_args()
 
+    # Set DDP variables
+    opt.total_batch_size = opt.batch_size
+    opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
+    opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
+    set_logging(opt.global_rank)
+
     # Resume
     if opt.resume:
         last = get_latest_run() if opt.resume == 'get_last' else opt.resume  # resume from most recent run
         if last and not opt.weights:
-            print(f'Resuming training from {last}')
+            logger.info(f'Resuming training from {last}')
         opt.weights = last if opt.resume and not opt.weights else opt.weights
-    if opt.local_rank == -1 or ("RANK" in os.environ and os.environ["RANK"] == "0"):
+    if opt.global_rank in [-1, 0]:
         check_git_status()
 
     opt.hyp = opt.hyp or ('data/hyp.finetune.yaml' if opt.weights else 'data/hyp.scratch.yaml')
@@ -419,9 +426,6 @@ if __name__ == '__main__':
 
     opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
     device = select_device(opt.device, batch_size=opt.batch_size)
-    opt.total_batch_size = opt.batch_size
-    opt.world_size = 1
-    opt.global_rank = -1
 
     # DDP mode
     if opt.local_rank != -1:
@@ -429,12 +433,10 @@ if __name__ == '__main__':
         torch.cuda.set_device(opt.local_rank)
         device = torch.device('cuda', opt.local_rank)
         dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
-        opt.world_size = dist.get_world_size()
-        opt.global_rank = dist.get_rank()
         assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
         opt.batch_size = opt.total_batch_size // opt.world_size
 
-    print(opt)
+    logger.info(opt)
     with open(opt.hyp) as f:
         hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps
 
@@ -442,7 +444,7 @@ if __name__ == '__main__':
     if not opt.evolve:
         tb_writer = None
         if opt.global_rank in [-1, 0]:
-            print('Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir)
+            logger.info('Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir)
             tb_writer = SummaryWriter(log_dir=increment_dir(Path(opt.logdir) / 'exp', opt.name))  # runs/exp
 
         train(hyp, opt, device, tb_writer)
utils/datasets.py CHANGED
@@ -47,9 +47,9 @@ def exif_size(img):
 
 
 def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False,
-                      local_rank=-1, world_size=1):
+                      rank=-1, world_size=1):
     # Make sure only the first process in DDP process the dataset first, and the following others can use the cache.
-    with torch_distributed_zero_first(local_rank):
+    with torch_distributed_zero_first(rank):
         dataset = LoadImagesAndLabels(path, imgsz, batch_size,
                                       augment=augment,  # augment images
                                       hyp=hyp,  # augmentation hyperparameters
@@ -57,11 +57,12 @@ def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False,
                                       cache_images=cache,
                                       single_cls=opt.single_cls,
                                       stride=int(stride),
-                                      pad=pad)
+                                      pad=pad,
+                                      rank=rank)
 
     batch_size = min(batch_size, len(dataset))
     nw = min([os.cpu_count() // world_size, batch_size if batch_size > 1 else 0, 8])  # number of workers
-    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if local_rank != -1 else None
+    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if rank != -1 else None
     dataloader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              num_workers=nw,
@@ -292,7 +293,7 @@ class LoadStreams:  # multiple IP or RTSP cameras
 
 class LoadImagesAndLabels(Dataset):  # for training/testing
     def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
-                 cache_images=False, single_cls=False, stride=32, pad=0.0):
+                 cache_images=False, single_cls=False, stride=32, pad=0.0, rank=-1):
         try:
             f = []  # image files
             for p in path if isinstance(path, list) else [path]:
@@ -372,8 +373,10 @@ class LoadImagesAndLabels(Dataset):  # for training/testing
         # Cache labels
         create_datasubset, extract_bounding_boxes, labels_loaded = False, False, False
         nm, nf, ne, ns, nd = 0, 0, 0, 0, 0  # number missing, found, empty, datasubset, duplicate
-        pbar = tqdm(self.label_files)
-        for i, file in enumerate(pbar):
+        pbar = enumerate(self.label_files)
+        if rank in [-1, 0]:
+            pbar = tqdm(pbar)
+        for i, file in pbar:
             l = self.labels[i]  # label
             if l is not None and l.shape[0]:
                 assert l.shape[1] == 5, '> 5 label columns: %s' % file
@@ -420,8 +423,9 @@ class LoadImagesAndLabels(Dataset):  # for training/testing
                 ne += 1  # print('empty labels for image %s' % self.img_files[i])  # file empty
                 # os.system("rm '%s' '%s'" % (self.img_files[i], self.label_files[i]))  # remove
 
-            pbar.desc = 'Scanning labels %s (%g found, %g missing, %g empty, %g duplicate, for %g images)' % (
-                cache_path, nf, nm, ne, nd, n)
+            if rank in [-1, 0]:
+                pbar.desc = 'Scanning labels %s (%g found, %g missing, %g empty, %g duplicate, for %g images)' % (
+                    cache_path, nf, nm, ne, nd, n)
         if nf == 0:
             s = 'WARNING: No labels found in %s. See %s' % (os.path.dirname(file) + os.sep, help_url)
             print(s)
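
The label-scanning fix above applies the same rank gating to tqdm: every process still consumes the iterator (so the caching work happens everywhere), but only the master wraps it in a progress bar, and writes to pbar.desc must also be guarded because a plain enumerate object has no desc attribute. A small sketch of the pattern, with a hypothetical items list standing in for self.label_files:

    import os
    from tqdm import tqdm

    rank = int(os.environ.get('RANK', -1))  # -1 when not launched under DDP
    items = ['a.txt', 'b.txt', 'c.txt']  # hypothetical stand-in for self.label_files

    pbar = enumerate(items)  # every rank iterates
    if rank in [-1, 0]:
        pbar = tqdm(pbar, total=len(items))  # progress bar on the master only
    for i, f in pbar:
        if rank in [-1, 0]:
            pbar.desc = 'Scanning %s' % f  # .desc exists only on the tqdm wrapper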
utils/general.py CHANGED
@@ -5,6 +5,7 @@ import random
 import shutil
 import subprocess
 import time
+import logging
 from contextlib import contextmanager
 from copy import copy
 from pathlib import Path
@@ -45,6 +46,12 @@ def torch_distributed_zero_first(local_rank: int):
     torch.distributed.barrier()
 
 
+def set_logging(rank=-1):
+    logging.basicConfig(
+        format="%(message)s",
+        level=logging.INFO if rank in [-1, 0] else logging.WARN)
+
+
 def init_seeds(seed=0):
     random.seed(seed)
     np.random.seed(seed)
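
set_logging() configures the root logger once; the per-module logging.getLogger(__name__) loggers added in yolo.py, train.py and torch_utils.py carry no level of their own, so they resolve their effective level from the root, and the single call in train.py's __main__ block governs all of them. A hedged usage sketch (assuming the repo layout above, run from the repo root):

    import logging

    from utils.general import set_logging  # helper added in this commit

    set_logging(rank=1)  # pretend we are a non-master DDP worker
    logging.getLogger('utils.torch_utils').info('suppressed: INFO is below the WARN threshold')
    logging.getLogger('utils.torch_utils').warning('shown: warnings still surface on every rank')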
utils/torch_utils.py CHANGED
@@ -1,6 +1,7 @@
 import math
 import os
 import time
+import logging
 from copy import deepcopy
 
 import torch
@@ -9,6 +10,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torchvision.models as models
 
+logger = logging.getLogger(__name__)
 
 def init_seeds(seed=0):
     torch.manual_seed(seed)
@@ -40,12 +42,12 @@ def select_device(device='', batch_size=None):
         for i in range(0, ng):
             if i == 1:
                 s = ' ' * len(s)
-            print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
-                  (s, i, x[i].name, x[i].total_memory / c))
+            logger.info("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
+                        (s, i, x[i].name, x[i].total_memory / c))
     else:
-        print('Using CPU')
+        logger.info('Using CPU')
 
-    print('')  # skip a line
+    logger.info('')  # skip a line
     return torch.device('cuda:0' if cuda else 'cpu')
 
 
@@ -142,7 +144,7 @@ def model_info(model, verbose=False):
     except:
         fs = ''
 
-    print('Model Summary: %g layers, %g parameters, %g gradients%s' % (len(list(model.parameters())), n_p, n_g, fs))
+    logger.info('Model Summary: %g layers, %g parameters, %g gradients%s' % (len(list(model.parameters())), n_p, n_g, fs))
 
 
 def load_classifier(name='resnet101', n=2):