glenn-jocher committed on
Commit
0fef3f6
·
unverified ·
2 Parent(s): 16f6834 dc5e183

Merge pull request #104 from alexstoken/advanced_logging

Browse files

Log command line options, hyperparameters, and weights per run in `runs/`

Files changed (3) hide show
  1. test.py +7 -5
  2. train.py +43 -30
  3. utils/utils.py +13 -7
test.py CHANGED
@@ -17,7 +17,9 @@ def test(data,
17
  verbose=False,
18
  model=None,
19
  dataloader=None,
 
20
  merge=False):
 
21
  # Initialize/load model and set device
22
  training = model is not None
23
  if training: # called by train.py
@@ -28,7 +30,7 @@ def test(data,
28
  merge = opt.merge # use Merge NMS
29
 
30
  # Remove previous
31
- for f in glob.glob('test_batch*.jpg'):
32
  os.remove(f)
33
 
34
  # Load model
@@ -157,10 +159,10 @@ def test(data,
157
 
158
  # Plot images
159
  if batch_i < 1:
160
- f = 'test_batch%g_gt.jpg' % batch_i # filename
161
- plot_images(img, targets, paths, f, names) # ground truth
162
- f = 'test_batch%g_pred.jpg' % batch_i
163
- plot_images(img, output_to_target(output, width, height), paths, f, names) # predictions
164
 
165
  # Compute statistics
166
  stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy
 
17
  verbose=False,
18
  model=None,
19
  dataloader=None,
20
+ save_dir='',
21
  merge=False):
22
+
23
  # Initialize/load model and set device
24
  training = model is not None
25
  if training: # called by train.py
 
30
  merge = opt.merge # use Merge NMS
31
 
32
  # Remove previous
33
+ for f in glob.glob(str(Path(save_dir) / 'test_batch*.jpg')):
34
  os.remove(f)
35
 
36
  # Load model
 
159
 
160
  # Plot images
161
  if batch_i < 1:
162
+ f = Path(save_dir) / ('test_batch%g_gt.jpg' % batch_i) # filename
163
+ plot_images(img, targets, paths, str(f), names) # ground truth
164
+ f = Path(save_dir) / ('test_batch%g_pred.jpg' % batch_i)
165
+ plot_images(img, output_to_target(output, width, height), paths, str(f), names) # predictions
166
 
167
  # Compute statistics
168
  stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy
train.py CHANGED
@@ -20,15 +20,11 @@ except:
20
  print('Apex recommended for faster mixed precision training: https://github.com/NVIDIA/apex')
21
  mixed_precision = False # not installed
22
 
23
- wdir = 'weights' + os.sep # weights dir
24
- os.makedirs(wdir, exist_ok=True)
25
- last = wdir + 'last.pt'
26
- best = wdir + 'best.pt'
27
- results_file = 'results.txt'
28
 
29
  # Hyperparameters
30
- hyp = {'lr0': 0.01, # initial learning rate (SGD=1E-2, Adam=1E-3)
31
- 'momentum': 0.937, # SGD momentum
 
32
  'weight_decay': 5e-4, # optimizer weight decay
33
  'giou': 0.05, # giou loss gain
34
  'cls': 0.58, # cls loss gain
@@ -45,21 +41,17 @@ hyp = {'lr0': 0.01, # initial learning rate (SGD=1E-2, Adam=1E-3)
45
  'translate': 0.0, # image translation (+/- fraction)
46
  'scale': 0.5, # image scale (+/- gain)
47
  'shear': 0.0} # image shear (+/- deg)
48
- print(hyp)
49
 
50
- # Overwrite hyp with hyp*.txt (optional)
51
- f = glob.glob('hyp*.txt')
52
- if f:
53
- print('Using %s' % f[0])
54
- for k, v in zip(hyp.keys(), np.loadtxt(f[0])):
55
- hyp[k] = v
56
 
57
- # Print focal loss if gamma > 0
58
- if hyp['fl_gamma']:
59
- print('Using FocalLoss(gamma=%g)' % hyp['fl_gamma'])
60
 
 
 
 
 
61
 
62
- def train(hyp):
63
  epochs = opt.epochs # 300
64
  batch_size = opt.batch_size # 64
65
  weights = opt.weights # initial training weights
@@ -97,8 +89,11 @@ def train(hyp):
97
  else:
98
  pg0.append(v) # all else
99
 
100
- optimizer = optim.Adam(pg0, lr=hyp['lr0']) if opt.adam else \
101
- optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
 
 
 
102
  optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay
103
  optimizer.add_param_group({'params': pg2}) # add pg2 (biases)
104
  print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
@@ -107,7 +102,7 @@ def train(hyp):
107
  # Scheduler https://arxiv.org/pdf/1812.01187.pdf
108
  lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1 # cosine
109
  scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
110
- # plot_lr_scheduler(optimizer, scheduler, epochs)
111
 
112
  # Load Model
113
  google_utils.attempt_download(weights)
@@ -176,13 +171,19 @@ def train(hyp):
176
  model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
177
  model.names = data_dict['names']
178
 
 
 
 
 
 
 
179
  # Class frequency
180
  labels = np.concatenate(dataset.labels, 0)
181
  c = torch.tensor(labels[:, 0]) # classes
182
  # cf = torch.bincount(c.long(), minlength=nc) + 1.
183
  # model._initialize_biases(cf.to(device))
 
184
  if tb_writer:
185
- plot_labels(labels)
186
  tb_writer.add_histogram('classes', c, 0)
187
 
188
  # Check anchors
@@ -273,7 +274,7 @@ def train(hyp):
273
 
274
  # Plot
275
  if ni < 3:
276
- f = 'train_batch%g.jpg' % ni # filename
277
  result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
278
  if tb_writer and result is not None:
279
  tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
@@ -294,7 +295,8 @@ def train(hyp):
294
  save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
295
  model=ema.ema,
296
  single_cls=opt.single_cls,
297
- dataloader=testloader)
 
298
 
299
  # Write
300
  with open(results_file, 'a') as f:
@@ -346,7 +348,7 @@ def train(hyp):
346
 
347
  # Finish
348
  if not opt.evolve:
349
- plot_results() # save as results.png
350
  print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
351
  dist.destroy_process_group() if device.type != 'cpu' and torch.cuda.device_count() > 1 else None
352
  torch.cuda.empty_cache()
@@ -356,13 +358,14 @@ def train(hyp):
356
  if __name__ == '__main__':
357
  check_git_status()
358
  parser = argparse.ArgumentParser()
 
 
 
359
  parser.add_argument('--epochs', type=int, default=300)
360
  parser.add_argument('--batch-size', type=int, default=16)
361
- parser.add_argument('--cfg', type=str, default='models/yolov5s.yaml', help='*.cfg path')
362
- parser.add_argument('--data', type=str, default='data/coco128.yaml', help='*.data path')
363
  parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
364
  parser.add_argument('--rect', action='store_true', help='rectangular training')
365
- parser.add_argument('--resume', action='store_true', help='resume training from last.pt')
366
  parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
367
  parser.add_argument('--notest', action='store_true', help='only test final epoch')
368
  parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
@@ -372,13 +375,17 @@ if __name__ == '__main__':
372
  parser.add_argument('--weights', type=str, default='', help='initial weights path')
373
  parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied')
374
  parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
375
- parser.add_argument('--adam', action='store_true', help='use adam optimizer')
376
- parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%')
377
  parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
378
  opt = parser.parse_args()
 
 
 
 
379
  opt.weights = last if opt.resume and not opt.weights else opt.weights
380
  opt.cfg = check_file(opt.cfg) # check file
381
  opt.data = check_file(opt.data) # check file
 
382
  print(opt)
383
  opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
384
  device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size)
@@ -388,7 +395,13 @@ if __name__ == '__main__':
388
  # Train
389
  if not opt.evolve:
390
  tb_writer = SummaryWriter(comment=opt.name)
 
 
 
 
 
391
  print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
 
392
  train(hyp)
393
 
394
  # Evolve hyperparameters (optional)
 
20
  print('Apex recommended for faster mixed precision training: https://github.com/NVIDIA/apex')
21
  mixed_precision = False # not installed
22
 
 
 
 
 
 
23
 
24
  # Hyperparameters
25
+ hyp = {'optimizer': 'SGD', # ['adam', 'SGD', None] if none, default is SGD
26
+ 'lr0': 0.01, # initial learning rate (SGD=1E-2, Adam=1E-3)
27
+ 'momentum': 0.937, # SGD momentum/Adam beta1
28
  'weight_decay': 5e-4, # optimizer weight decay
29
  'giou': 0.05, # giou loss gain
30
  'cls': 0.58, # cls loss gain
 
41
  'translate': 0.0, # image translation (+/- fraction)
42
  'scale': 0.5, # image scale (+/- gain)
43
  'shear': 0.0} # image shear (+/- deg)
 
44
 
 
 
 
 
 
 
45
 
46
+ def train(hyp):
47
+ log_dir = tb_writer.log_dir # run directory
48
+ wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory
49
 
50
+ os.makedirs(wdir, exist_ok=True)
51
+ last = wdir + 'last.pt'
52
+ best = wdir + 'best.pt'
53
+ results_file = log_dir + os.sep + 'results.txt'
54
 
 
55
  epochs = opt.epochs # 300
56
  batch_size = opt.batch_size # 64
57
  weights = opt.weights # initial training weights
 
89
  else:
90
  pg0.append(v) # all else
91
 
92
+ if hyp['optimizer'] == 'adam': # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
93
+ optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum
94
+ else:
95
+ optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
96
+
97
  optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay
98
  optimizer.add_param_group({'params': pg2}) # add pg2 (biases)
99
  print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
 
102
  # Scheduler https://arxiv.org/pdf/1812.01187.pdf
103
  lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1 # cosine
104
  scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
105
+ plot_lr_scheduler(optimizer, scheduler, epochs, save_dir=log_dir)
106
 
107
  # Load Model
108
  google_utils.attempt_download(weights)
 
171
  model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
172
  model.names = data_dict['names']
173
 
174
+ # Save run settings
175
+ with open(Path(log_dir) / 'hyp.yaml', 'w') as f:
176
+ yaml.dump(hyp, f, sort_keys=False)
177
+ with open(Path(log_dir) / 'opt.yaml', 'w') as f:
178
+ yaml.dump(vars(opt), f, sort_keys=False)
179
+
180
  # Class frequency
181
  labels = np.concatenate(dataset.labels, 0)
182
  c = torch.tensor(labels[:, 0]) # classes
183
  # cf = torch.bincount(c.long(), minlength=nc) + 1.
184
  # model._initialize_biases(cf.to(device))
185
+ plot_labels(labels, save_dir=log_dir)
186
  if tb_writer:
 
187
  tb_writer.add_histogram('classes', c, 0)
188
 
189
  # Check anchors
 
274
 
275
  # Plot
276
  if ni < 3:
277
+ f = str(Path(log_dir) / ('train_batch%g.jpg' % ni)) # filename
278
  result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
279
  if tb_writer and result is not None:
280
  tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
 
295
  save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
296
  model=ema.ema,
297
  single_cls=opt.single_cls,
298
+ dataloader=testloader,
299
+ save_dir=log_dir)
300
 
301
  # Write
302
  with open(results_file, 'a') as f:
 
348
 
349
  # Finish
350
  if not opt.evolve:
351
+ plot_results(save_dir=log_dir) # save as results.png
352
  print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
353
  dist.destroy_process_group() if device.type != 'cpu' and torch.cuda.device_count() > 1 else None
354
  torch.cuda.empty_cache()
 
358
  if __name__ == '__main__':
359
  check_git_status()
360
  parser = argparse.ArgumentParser()
361
+ parser.add_argument('--cfg', type=str, default='models/yolov5s.yaml', help='model.yaml path')
362
+ parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
363
+ parser.add_argument('--hyp', type=str, default='', help='hyp.yaml path (optional)')
364
  parser.add_argument('--epochs', type=int, default=300)
365
  parser.add_argument('--batch-size', type=int, default=16)
 
 
366
  parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
367
  parser.add_argument('--rect', action='store_true', help='rectangular training')
368
+ parser.add_argument('--resume', nargs='?', const = 'get_last', default=False, help='resume from given path/to/last.pt, or most recent run if blank.')
369
  parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
370
  parser.add_argument('--notest', action='store_true', help='only test final epoch')
371
  parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
 
375
  parser.add_argument('--weights', type=str, default='', help='initial weights path')
376
  parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied')
377
  parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
378
+ parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
 
379
  parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
380
  opt = parser.parse_args()
381
+
382
+ last = get_latest_run() if opt.resume == 'get_last' else opt.resume # resume from most recent run
383
+ if last and not opt.weights:
384
+ print(f'Resuming training from {last}')
385
  opt.weights = last if opt.resume and not opt.weights else opt.weights
386
  opt.cfg = check_file(opt.cfg) # check file
387
  opt.data = check_file(opt.data) # check file
388
+ opt.hyp = check_file(opt.hyp) if opt.hyp else '' # check file
389
  print(opt)
390
  opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
391
  device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size)
 
395
  # Train
396
  if not opt.evolve:
397
  tb_writer = SummaryWriter(comment=opt.name)
398
+ if opt.hyp: # update hyps
399
+ with open(opt.hyp) as f:
400
+ hyp.update(yaml.load(f, Loader=yaml.FullLoader))
401
+
402
+ print(f'Beginning training with {hyp}\n\n')
403
  print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
404
+
405
  train(hyp)
406
 
407
  # Evolve hyperparameters (optional)
utils/utils.py CHANGED
@@ -37,6 +37,12 @@ def init_seeds(seed=0):
37
  torch_utils.init_seeds(seed=seed)
38
 
39
 
 
 
 
 
 
 
40
  def check_git_status():
41
  # Suggest 'git pull' if repo is out of date
42
  if platform in ['linux', 'darwin']:
@@ -1028,7 +1034,7 @@ def plot_images(images, targets, paths=None, fname='images.jpg', names=None, max
1028
  return mosaic
1029
 
1030
 
1031
- def plot_lr_scheduler(optimizer, scheduler, epochs=300):
1032
  # Plot LR simulating training for full epochs
1033
  optimizer, scheduler = copy(optimizer), copy(scheduler) # do not modify originals
1034
  y = []
@@ -1042,7 +1048,7 @@ def plot_lr_scheduler(optimizer, scheduler, epochs=300):
1042
  plt.xlim(0, epochs)
1043
  plt.ylim(0)
1044
  plt.tight_layout()
1045
- plt.savefig('LR.png', dpi=200)
1046
 
1047
 
1048
  def plot_test_txt(): # from utils.utils import *; plot_test()
@@ -1107,7 +1113,7 @@ def plot_study_txt(f='study.txt', x=None): # from utils.utils import *; plot_st
1107
  plt.savefig(f.replace('.txt', '.png'), dpi=200)
1108
 
1109
 
1110
- def plot_labels(labels):
1111
  # plot dataset labels
1112
  c, b = labels[:, 0], labels[:, 1:].transpose() # classees, boxes
1113
 
@@ -1128,7 +1134,7 @@ def plot_labels(labels):
1128
  ax[2].scatter(b[2], b[3], c=hist2d(b[2], b[3], 90), cmap='jet')
1129
  ax[2].set_xlabel('width')
1130
  ax[2].set_ylabel('height')
1131
- plt.savefig('labels.png', dpi=200)
1132
  plt.close()
1133
 
1134
 
@@ -1174,7 +1180,7 @@ def plot_results_overlay(start=0, stop=0): # from utils.utils import *; plot_re
1174
  fig.savefig(f.replace('.txt', '.png'), dpi=200)
1175
 
1176
 
1177
- def plot_results(start=0, stop=0, bucket='', id=(), labels=()): # from utils.utils import *; plot_results()
1178
  # Plot training 'results*.txt' as seen in https://github.com/ultralytics/yolov5#reproduce-our-training
1179
  fig, ax = plt.subplots(2, 5, figsize=(12, 6))
1180
  ax = ax.ravel()
@@ -1184,7 +1190,7 @@ def plot_results(start=0, stop=0, bucket='', id=(), labels=()): # from utils.ut
1184
  os.system('rm -rf storage.googleapis.com')
1185
  files = ['https://storage.googleapis.com/%s/results%g.txt' % (bucket, x) for x in id]
1186
  else:
1187
- files = glob.glob('results*.txt') + glob.glob('../../Downloads/results*.txt')
1188
  for fi, f in enumerate(files):
1189
  try:
1190
  results = np.loadtxt(f, usecols=[2, 3, 4, 8, 9, 12, 13, 14, 10, 11], ndmin=2).T
@@ -1205,4 +1211,4 @@ def plot_results(start=0, stop=0, bucket='', id=(), labels=()): # from utils.ut
1205
 
1206
  fig.tight_layout()
1207
  ax[1].legend()
1208
- fig.savefig('results.png', dpi=200)
 
37
  torch_utils.init_seeds(seed=seed)
38
 
39
 
40
+ def get_latest_run(search_dir = './runs'):
41
+ # Return path to most recent 'last.pt' in /runs (i.e. to --resume from)
42
+ last_list = glob.glob(f'{search_dir}/**/last*.pt', recursive=True)
43
+ return max(last_list, key = os.path.getctime)
44
+
45
+
46
  def check_git_status():
47
  # Suggest 'git pull' if repo is out of date
48
  if platform in ['linux', 'darwin']:
 
1034
  return mosaic
1035
 
1036
 
1037
+ def plot_lr_scheduler(optimizer, scheduler, epochs=300, save_dir=''):
1038
  # Plot LR simulating training for full epochs
1039
  optimizer, scheduler = copy(optimizer), copy(scheduler) # do not modify originals
1040
  y = []
 
1048
  plt.xlim(0, epochs)
1049
  plt.ylim(0)
1050
  plt.tight_layout()
1051
+ plt.savefig(Path(save_dir) / 'LR.png', dpi=200)
1052
 
1053
 
1054
  def plot_test_txt(): # from utils.utils import *; plot_test()
 
1113
  plt.savefig(f.replace('.txt', '.png'), dpi=200)
1114
 
1115
 
1116
+ def plot_labels(labels, save_dir= ''):
1117
  # plot dataset labels
1118
  c, b = labels[:, 0], labels[:, 1:].transpose() # classees, boxes
1119
 
 
1134
  ax[2].scatter(b[2], b[3], c=hist2d(b[2], b[3], 90), cmap='jet')
1135
  ax[2].set_xlabel('width')
1136
  ax[2].set_ylabel('height')
1137
+ plt.savefig(Path(save_dir) / 'labels.png', dpi=200)
1138
  plt.close()
1139
 
1140
 
 
1180
  fig.savefig(f.replace('.txt', '.png'), dpi=200)
1181
 
1182
 
1183
+ def plot_results(start=0, stop=0, bucket='', id=(), labels=(), save_dir= ''): # from utils.utils import *; plot_results()
1184
  # Plot training 'results*.txt' as seen in https://github.com/ultralytics/yolov5#reproduce-our-training
1185
  fig, ax = plt.subplots(2, 5, figsize=(12, 6))
1186
  ax = ax.ravel()
 
1190
  os.system('rm -rf storage.googleapis.com')
1191
  files = ['https://storage.googleapis.com/%s/results%g.txt' % (bucket, x) for x in id]
1192
  else:
1193
+ files = glob.glob(str(Path(save_dir) / 'results*.txt')) + glob.glob('../../Downloads/results*.txt')
1194
  for fi, f in enumerate(files):
1195
  try:
1196
  results = np.loadtxt(f, usecols=[2, 3, 4, 8, 9, 12, 13, 14, 10, 11], ndmin=2).T
 
1211
 
1212
  fig.tight_layout()
1213
  ax[1].legend()
1214
+ fig.savefig(Path(save_dir) / 'results.png', dpi=200)