Update train.py
Browse files
train.py
CHANGED
@@ -44,11 +44,8 @@ hyp = {'optimizer': 'SGD', # ['adam', 'SGD', None] if none, default is SGD
|
|
44 |
|
45 |
|
46 |
def train(hyp):
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
#weights dir unique to each experiment
|
51 |
-
wdir = os.path.join(log_dir, 'weights') + os.sep # weights dir
|
52 |
|
53 |
os.makedirs(wdir, exist_ok=True)
|
54 |
last = wdir + 'last.pt'
|
@@ -92,8 +89,8 @@ def train(hyp):
|
|
92 |
else:
|
93 |
pg0.append(v) # all else
|
94 |
|
95 |
-
if hyp['optimizer'] =='adam':
|
96 |
-
optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) #
|
97 |
else:
|
98 |
optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
|
99 |
|
@@ -148,7 +145,7 @@ def train(hyp):
|
|
148 |
|
149 |
scheduler.last_epoch = start_epoch - 1 # do not move
|
150 |
# https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
|
151 |
-
plot_lr_scheduler(optimizer, scheduler, epochs, save_dir = log_dir)
|
152 |
|
153 |
# Initialize distributed training
|
154 |
if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
|
@@ -177,11 +174,10 @@ def train(hyp):
|
|
177 |
model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
|
178 |
model.names = data_dict['names']
|
179 |
|
180 |
-
#
|
181 |
-
with open(os.path.join(log_dir, 'hyp.yaml'), 'w') as f:
|
182 |
yaml.dump(hyp, f, sort_keys=False)
|
183 |
-
|
184 |
-
with open(os.path.join(log_dir, 'opt.yaml'), 'w') as f:
|
185 |
yaml.dump(vars(opt), f, sort_keys=False)
|
186 |
|
187 |
# Class frequency
|
@@ -189,14 +185,10 @@ def train(hyp):
|
|
189 |
c = torch.tensor(labels[:, 0]) # classes
|
190 |
# cf = torch.bincount(c.long(), minlength=nc) + 1.
|
191 |
# model._initialize_biases(cf.to(device))
|
192 |
-
|
193 |
-
#always plot labels to log_dir
|
194 |
plot_labels(labels, save_dir=log_dir)
|
195 |
-
|
196 |
if tb_writer:
|
197 |
tb_writer.add_histogram('classes', c, 0)
|
198 |
|
199 |
-
|
200 |
# Check anchors
|
201 |
if not opt.noautoanchor:
|
202 |
check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
|
@@ -284,7 +276,7 @@ def train(hyp):
|
|
284 |
|
285 |
# Plot
|
286 |
if ni < 3:
|
287 |
-
f = os.path.join(log_dir, 'train_batch%g.jpg' % ni) # filename
|
288 |
result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
|
289 |
if tb_writer and result is not None:
|
290 |
tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
|
@@ -358,7 +350,7 @@ def train(hyp):
|
|
358 |
|
359 |
# Finish
|
360 |
if not opt.evolve:
|
361 |
-
plot_results(save_dir = log_dir) # save as results.png
|
362 |
print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
|
363 |
dist.destroy_process_group() if device.type != 'cpu' and torch.cuda.device_count() > 1 else None
|
364 |
torch.cuda.empty_cache()
|
@@ -368,14 +360,14 @@ def train(hyp):
|
|
368 |
if __name__ == '__main__':
|
369 |
check_git_status()
|
370 |
parser = argparse.ArgumentParser()
|
371 |
-
parser.add_argument('--cfg', type=str, default='models/yolov5s.yaml', help='model
|
372 |
-
parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data
|
373 |
-
parser.add_argument('--hyp', type=str, default='',help='hyp
|
374 |
parser.add_argument('--epochs', type=int, default=300)
|
375 |
parser.add_argument('--batch-size', type=int, default=16)
|
376 |
-
parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes
|
377 |
parser.add_argument('--rect', action='store_true', help='rectangular training')
|
378 |
-
parser.add_argument('--resume', nargs='?', const = 'get_last', default=False, help='resume
|
379 |
parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
|
380 |
parser.add_argument('--notest', action='store_true', help='only test final epoch')
|
381 |
parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
|
@@ -387,20 +379,15 @@ if __name__ == '__main__':
|
|
387 |
parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
|
388 |
parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
|
389 |
parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
|
390 |
-
|
391 |
opt = parser.parse_args()
|
392 |
-
|
393 |
-
|
394 |
-
last = get_latest_run() if opt.resume == 'get_last' else opt.resume
|
395 |
if last and not opt.weights:
|
396 |
print(f'Resuming training from {last}')
|
397 |
opt.weights = last if opt.resume and not opt.weights else opt.weights
|
398 |
-
|
399 |
-
|
400 |
opt.cfg = check_file(opt.cfg) # check file
|
401 |
opt.data = check_file(opt.data) # check file
|
402 |
-
opt.hyp = check_file(opt.hyp) if opt.hyp else '' #check file
|
403 |
-
|
404 |
print(opt)
|
405 |
opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
|
406 |
device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size)
|
@@ -410,16 +397,10 @@ if __name__ == '__main__':
|
|
410 |
# Train
|
411 |
if not opt.evolve:
|
412 |
tb_writer = SummaryWriter(comment=opt.name)
|
413 |
-
|
414 |
-
#updates hyp defaults from hyp.yaml
|
415 |
-
if opt.hyp:
|
416 |
with open(opt.hyp) as f:
|
417 |
-
updated_hyp = yaml.load(f, Loader=yaml.FullLoader)
|
418 |
-
hyp.update(updated_hyp)
|
419 |
|
420 |
-
# Print focal loss if gamma > 0
|
421 |
-
if hyp['fl_gamma']:
|
422 |
-
print('Using FocalLoss(gamma=%g)' % hyp['fl_gamma'])
|
423 |
print(f'Beginning training with {hyp}\n\n')
|
424 |
print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
|
425 |
|
|
|
44 |
|
45 |
|
46 |
def train(hyp):
|
47 |
+
log_dir = tb_writer.log_dir # run directory
|
48 |
+
wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory
|
|
|
|
|
|
|
49 |
|
50 |
os.makedirs(wdir, exist_ok=True)
|
51 |
last = wdir + 'last.pt'
|
|
|
89 |
else:
|
90 |
pg0.append(v) # all else
|
91 |
|
92 |
+
if hyp['optimizer'] == 'adam': # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
|
93 |
+
optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum
|
94 |
else:
|
95 |
optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
|
96 |
|
|
|
145 |
|
146 |
scheduler.last_epoch = start_epoch - 1 # do not move
|
147 |
# https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
|
148 |
+
plot_lr_scheduler(optimizer, scheduler, epochs, save_dir=log_dir)
|
149 |
|
150 |
# Initialize distributed training
|
151 |
if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
|
|
|
174 |
model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
|
175 |
model.names = data_dict['names']
|
176 |
|
177 |
+
# Save run settings
|
178 |
+
with open(Path(log_dir) / 'hyp.yaml', 'w') as f:
|
179 |
yaml.dump(hyp, f, sort_keys=False)
|
180 |
+
with open(Path(log_dir) / 'opt.yaml', 'w') as f:
|
|
|
181 |
yaml.dump(vars(opt), f, sort_keys=False)
|
182 |
|
183 |
# Class frequency
|
|
|
185 |
c = torch.tensor(labels[:, 0]) # classes
|
186 |
# cf = torch.bincount(c.long(), minlength=nc) + 1.
|
187 |
# model._initialize_biases(cf.to(device))
|
|
|
|
|
188 |
plot_labels(labels, save_dir=log_dir)
|
|
|
189 |
if tb_writer:
|
190 |
tb_writer.add_histogram('classes', c, 0)
|
191 |
|
|
|
192 |
# Check anchors
|
193 |
if not opt.noautoanchor:
|
194 |
check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
|
|
|
276 |
|
277 |
# Plot
|
278 |
if ni < 3:
|
279 |
+
f = str(Path(log_dir) / ('train_batch%g.jpg' % ni)) # filename
|
280 |
result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
|
281 |
if tb_writer and result is not None:
|
282 |
tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
|
|
|
350 |
|
351 |
# Finish
|
352 |
if not opt.evolve:
|
353 |
+
plot_results(save_dir=log_dir) # save as results.png
|
354 |
print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
|
355 |
dist.destroy_process_group() if device.type != 'cpu' and torch.cuda.device_count() > 1 else None
|
356 |
torch.cuda.empty_cache()
|
|
|
360 |
if __name__ == '__main__':
|
361 |
check_git_status()
|
362 |
parser = argparse.ArgumentParser()
|
363 |
+
parser.add_argument('--cfg', type=str, default='models/yolov5s.yaml', help='model.yaml path')
|
364 |
+
parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
|
365 |
+
parser.add_argument('--hyp', type=str, default='', help='hyp.yaml path (optional)')
|
366 |
parser.add_argument('--epochs', type=int, default=300)
|
367 |
parser.add_argument('--batch-size', type=int, default=16)
|
368 |
+
parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
|
369 |
parser.add_argument('--rect', action='store_true', help='rectangular training')
|
370 |
+
parser.add_argument('--resume', nargs='?', const = 'get_last', default=False, help='resume from given path/to/last.pt, or most recent run if blank.')
|
371 |
parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
|
372 |
parser.add_argument('--notest', action='store_true', help='only test final epoch')
|
373 |
parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
|
|
|
379 |
parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
|
380 |
parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
|
381 |
parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
|
|
|
382 |
opt = parser.parse_args()
|
383 |
+
|
384 |
+
last = get_latest_run() if opt.resume == 'get_last' else opt.resume # resume from most recent run
|
|
|
385 |
if last and not opt.weights:
|
386 |
print(f'Resuming training from {last}')
|
387 |
opt.weights = last if opt.resume and not opt.weights else opt.weights
|
|
|
|
|
388 |
opt.cfg = check_file(opt.cfg) # check file
|
389 |
opt.data = check_file(opt.data) # check file
|
390 |
+
opt.hyp = check_file(opt.hyp) if opt.hyp else '' # check file
|
|
|
391 |
print(opt)
|
392 |
opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test)
|
393 |
device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size)
|
|
|
397 |
# Train
|
398 |
if not opt.evolve:
|
399 |
tb_writer = SummaryWriter(comment=opt.name)
|
400 |
+
if opt.hyp: # update hyps
|
|
|
|
|
401 |
with open(opt.hyp) as f:
|
402 |
+
hyp.update(yaml.load(f, Loader=yaml.FullLoader))
|
|
|
403 |
|
|
|
|
|
|
|
404 |
print(f'Beginning training with {hyp}\n\n')
|
405 |
print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
|
406 |
|