glenn-jocher committed on
Commit
6b134d9
·
unverified ·
1 Parent(s): 22ab1c2

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +20 -39
train.py CHANGED
@@ -44,11 +44,8 @@ hyp = {'optimizer': 'SGD', # ['adam', 'SGD', None] if none, default is SGD
44
 
45
 
46
  def train(hyp):
47
- #write all results to the tb log_dir, so all data from one run is together
48
- log_dir = tb_writer.log_dir
49
-
50
- #weights dir unique to each experiment
51
- wdir = os.path.join(log_dir, 'weights') + os.sep # weights dir
52
 
53
  os.makedirs(wdir, exist_ok=True)
54
  last = wdir + 'last.pt'
@@ -92,8 +89,8 @@ def train(hyp):
92
  else:
93
  pg0.append(v) # all else
94
 
95
- if hyp['optimizer'] =='adam':
96
- optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) #use default beta2, adjust beta1 for Adam momentum per momentum adjustments in https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
97
  else:
98
  optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
99
 
@@ -148,7 +145,7 @@ def train(hyp):
148
 
149
  scheduler.last_epoch = start_epoch - 1 # do not move
150
  # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
151
- plot_lr_scheduler(optimizer, scheduler, epochs, save_dir = log_dir)
152
 
153
  # Initialize distributed training
154
  if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
@@ -177,11 +174,10 @@ def train(hyp):
177
  model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
178
  model.names = data_dict['names']
179
 
180
- #save hyperparamter and training options in run folder
181
- with open(os.path.join(log_dir, 'hyp.yaml'), 'w') as f:
182
  yaml.dump(hyp, f, sort_keys=False)
183
-
184
- with open(os.path.join(log_dir, 'opt.yaml'), 'w') as f:
185
  yaml.dump(vars(opt), f, sort_keys=False)
186
 
187
  # Class frequency
@@ -189,14 +185,10 @@ def train(hyp):
189
  c = torch.tensor(labels[:, 0]) # classes
190
  # cf = torch.bincount(c.long(), minlength=nc) + 1.
191
  # model._initialize_biases(cf.to(device))
192
-
193
- #always plot labels to log_dir
194
  plot_labels(labels, save_dir=log_dir)
195
-
196
  if tb_writer:
197
  tb_writer.add_histogram('classes', c, 0)
198
 
199
-
200
  # Check anchors
201
  if not opt.noautoanchor:
202
  check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
@@ -284,7 +276,7 @@ def train(hyp):
284
 
285
  # Plot
286
  if ni < 3:
287
- f = os.path.join(log_dir, 'train_batch%g.jpg' % ni) # filename
288
  result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
289
  if tb_writer and result is not None:
290
  tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
@@ -358,7 +350,7 @@ def train(hyp):
358
 
359
  # Finish
360
  if not opt.evolve:
361
- plot_results(save_dir = log_dir) # save as results.png
362
  print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
363
  dist.destroy_process_group() if device.type != 'cpu' and torch.cuda.device_count() > 1 else None
364
  torch.cuda.empty_cache()
@@ -368,14 +360,14 @@ def train(hyp):
368
  if __name__ == '__main__':
369
  check_git_status()
370
  parser = argparse.ArgumentParser()
371
- parser.add_argument('--cfg', type=str, default='models/yolov5s.yaml', help='model cfg path[*.yaml]')
372
- parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data cfg path [*.yaml]')
373
- parser.add_argument('--hyp', type=str, default='',help='hyp cfg path [*.yaml].')
374
  parser.add_argument('--epochs', type=int, default=300)
375
  parser.add_argument('--batch-size', type=int, default=16)
376
- parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes. Assumes square imgs.')
377
  parser.add_argument('--rect', action='store_true', help='rectangular training')
378
- parser.add_argument('--resume', nargs='?', const = 'get_last', default=False, help='resume training from given path/to/last.pt, or most recent run if blank.')
379
  parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
380
  parser.add_argument('--notest', action='store_true', help='only test final epoch')
381
  parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
@@ -387,20 +379,15 @@ if __name__ == '__main__':
387
  parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
388
  parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
389
  parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
390
-
391
  opt = parser.parse_args()
392
-
393
- # use given path/to/last.pt or find most recent run if no path given
394
- last = get_latest_run() if opt.resume == 'get_last' else opt.resume
395
  if last and not opt.weights:
396
  print(f'Resuming training from {last}')
397
  opt.weights = last if opt.resume and not opt.weights else opt.weights
398
-
399
-
400
  opt.cfg = check_file(opt.cfg) # check file
401
  opt.data = check_file(opt.data) # check file
402
- opt.hyp = check_file(opt.hyp) if opt.hyp else '' #check file
403
-
404
  print(opt)
405
  opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
406
  device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size)
@@ -410,16 +397,10 @@ if __name__ == '__main__':
410
  # Train
411
  if not opt.evolve:
412
  tb_writer = SummaryWriter(comment=opt.name)
413
-
414
- #updates hyp defaults from hyp.yaml
415
- if opt.hyp:
416
  with open(opt.hyp) as f:
417
- updated_hyp = yaml.load(f, Loader=yaml.FullLoader)
418
- hyp.update(updated_hyp)
419
 
420
- # Print focal loss if gamma > 0
421
- if hyp['fl_gamma']:
422
- print('Using FocalLoss(gamma=%g)' % hyp['fl_gamma'])
423
  print(f'Beginning training with {hyp}\n\n')
424
  print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
425
 
 
44
 
45
 
46
  def train(hyp):
47
+ log_dir = tb_writer.log_dir # run directory
48
+ wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory
 
 
 
49
 
50
  os.makedirs(wdir, exist_ok=True)
51
  last = wdir + 'last.pt'
 
89
  else:
90
  pg0.append(v) # all else
91
 
92
+ if hyp['optimizer'] == 'adam': # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
93
+ optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum
94
  else:
95
  optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
96
 
 
145
 
146
  scheduler.last_epoch = start_epoch - 1 # do not move
147
  # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
148
+ plot_lr_scheduler(optimizer, scheduler, epochs, save_dir=log_dir)
149
 
150
  # Initialize distributed training
151
  if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
 
174
  model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
175
  model.names = data_dict['names']
176
 
177
+ # Save run settings
178
+ with open(Path(log_dir) / 'hyp.yaml', 'w') as f:
179
  yaml.dump(hyp, f, sort_keys=False)
180
+ with open(Path(log_dir) / 'opt.yaml', 'w') as f:
 
181
  yaml.dump(vars(opt), f, sort_keys=False)
182
 
183
  # Class frequency
 
185
  c = torch.tensor(labels[:, 0]) # classes
186
  # cf = torch.bincount(c.long(), minlength=nc) + 1.
187
  # model._initialize_biases(cf.to(device))
 
 
188
  plot_labels(labels, save_dir=log_dir)
 
189
  if tb_writer:
190
  tb_writer.add_histogram('classes', c, 0)
191
 
 
192
  # Check anchors
193
  if not opt.noautoanchor:
194
  check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
 
276
 
277
  # Plot
278
  if ni < 3:
279
+ f = str(Path(log_dir) / ('train_batch%g.jpg' % ni)) # filename
280
  result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
281
  if tb_writer and result is not None:
282
  tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
 
350
 
351
  # Finish
352
  if not opt.evolve:
353
+ plot_results(save_dir=log_dir) # save as results.png
354
  print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
355
  dist.destroy_process_group() if device.type != 'cpu' and torch.cuda.device_count() > 1 else None
356
  torch.cuda.empty_cache()
 
360
  if __name__ == '__main__':
361
  check_git_status()
362
  parser = argparse.ArgumentParser()
363
+ parser.add_argument('--cfg', type=str, default='models/yolov5s.yaml', help='model.yaml path')
364
+ parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
365
+ parser.add_argument('--hyp', type=str, default='', help='hyp.yaml path (optional)')
366
  parser.add_argument('--epochs', type=int, default=300)
367
  parser.add_argument('--batch-size', type=int, default=16)
368
+ parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
369
  parser.add_argument('--rect', action='store_true', help='rectangular training')
370
+ parser.add_argument('--resume', nargs='?', const = 'get_last', default=False, help='resume from given path/to/last.pt, or most recent run if blank.')
371
  parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
372
  parser.add_argument('--notest', action='store_true', help='only test final epoch')
373
  parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
 
379
  parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
380
  parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
381
  parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
 
382
  opt = parser.parse_args()
383
+
384
+ last = get_latest_run() if opt.resume == 'get_last' else opt.resume # resume from most recent run
 
385
  if last and not opt.weights:
386
  print(f'Resuming training from {last}')
387
  opt.weights = last if opt.resume and not opt.weights else opt.weights
 
 
388
  opt.cfg = check_file(opt.cfg) # check file
389
  opt.data = check_file(opt.data) # check file
390
+ opt.hyp = check_file(opt.hyp) if opt.hyp else '' # check file
 
391
  print(opt)
392
  opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
393
  device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size)
 
397
  # Train
398
  if not opt.evolve:
399
  tb_writer = SummaryWriter(comment=opt.name)
400
+ if opt.hyp: # update hyps
 
 
401
  with open(opt.hyp) as f:
402
+ hyp.update(yaml.load(f, Loader=yaml.FullLoader))
 
403
 
 
 
 
404
  print(f'Beginning training with {hyp}\n\n')
405
  print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
406