Feng Wang committed on
Commit
53d1ae9
·
1 Parent(s): 7020e34

fix(core): avoid overwrite best ckpt and add attr description (#1103)

Browse files
yolox/core/trainer.py CHANGED
@@ -46,6 +46,7 @@ class Trainer:
46
  self.local_rank = get_local_rank()
47
  self.device = "cuda:{}".format(self.local_rank)
48
  self.use_model_ema = exp.ema
 
49
 
50
  # data/dataloader related attr
51
  self.data_type = torch.float16 if args.fp16 else torch.float32
@@ -174,7 +175,7 @@ class Trainer:
174
  )
175
  # Tensorboard logger
176
  if self.rank == 0:
177
- self.tblogger = SummaryWriter(self.file_name)
178
 
179
  logger.info("Training start...")
180
  logger.info("\n{}".format(model))
@@ -269,6 +270,7 @@ class Trainer:
269
  # resume the model/optimizer state dict
270
  model.load_state_dict(ckpt["model"])
271
  self.optimizer.load_state_dict(ckpt["optimizer"])
 
272
  # resume the training states variables
273
  start_epoch = (
274
  self.args.start_epoch - 1
@@ -302,6 +304,9 @@ class Trainer:
302
  ap50_95, ap50, summary = self.exp.eval(
303
  evalmodel, self.evaluator, self.is_distributed
304
  )
 
 
 
305
  self.model.train()
306
  if self.rank == 0:
307
  self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1)
@@ -309,8 +314,9 @@ class Trainer:
309
  logger.info("\n" + summary)
310
  synchronize()
311
 
312
- self.save_ckpt("last_epoch", ap50_95 > self.best_ap)
313
- self.best_ap = max(self.best_ap, ap50_95)
 
314
 
315
  def save_ckpt(self, ckpt_name, update_best_ckpt=False):
316
  if self.rank == 0:
@@ -320,6 +326,7 @@ class Trainer:
320
  "start_epoch": self.epoch + 1,
321
  "model": save_model.state_dict(),
322
  "optimizer": self.optimizer.state_dict(),
 
323
  }
324
  save_checkpoint(
325
  ckpt_state,
 
46
  self.local_rank = get_local_rank()
47
  self.device = "cuda:{}".format(self.local_rank)
48
  self.use_model_ema = exp.ema
49
+ self.save_history_ckpt = exp.save_history_ckpt
50
 
51
  # data/dataloader related attr
52
  self.data_type = torch.float16 if args.fp16 else torch.float32
 
175
  )
176
  # Tensorboard logger
177
  if self.rank == 0:
178
+ self.tblogger = SummaryWriter(os.path.join(self.file_name, "tensorboard"))
179
 
180
  logger.info("Training start...")
181
  logger.info("\n{}".format(model))
 
270
  # resume the model/optimizer state dict
271
  model.load_state_dict(ckpt["model"])
272
  self.optimizer.load_state_dict(ckpt["optimizer"])
273
+ self.best_ap = ckpt.pop("best_ap", 0)
274
  # resume the training states variables
275
  start_epoch = (
276
  self.args.start_epoch - 1
 
304
  ap50_95, ap50, summary = self.exp.eval(
305
  evalmodel, self.evaluator, self.is_distributed
306
  )
307
+ update_best_ckpt = ap50_95 > self.best_ap
308
+ self.best_ap = max(self.best_ap, ap50_95)
309
+
310
  self.model.train()
311
  if self.rank == 0:
312
  self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1)
 
314
  logger.info("\n" + summary)
315
  synchronize()
316
 
317
+ self.save_ckpt("last_epoch", update_best_ckpt)
318
+ if self.save_history_ckpt:
319
+ self.save_ckpt(f"epoch_{self.epoch + 1}")
320
 
321
  def save_ckpt(self, ckpt_name, update_best_ckpt=False):
322
  if self.rank == 0:
 
326
  "start_epoch": self.epoch + 1,
327
  "model": save_model.state_dict(),
328
  "optimizer": self.optimizer.state_dict(),
329
+ "best_ap": self.best_ap,
330
  }
331
  save_checkpoint(
332
  ckpt_state,
yolox/data/data_augment.py CHANGED
@@ -39,9 +39,7 @@ def get_aug_params(value, center=0):
39
  else:
40
  raise ValueError(
41
  "Affine params should be either a sequence containing two values\
42
- or single float values. Got {}".format(
43
- value
44
- )
45
  )
46
 
47
 
 
39
  else:
40
  raise ValueError(
41
  "Affine params should be either a sequence containing two values\
42
+ or single float values. Got {}".format(value)
 
 
43
  )
44
 
45
 
yolox/exp/yolox_base.py CHANGED
@@ -17,57 +17,94 @@ class Exp(BaseExp):
17
  super().__init__()
18
 
19
  # ---------------- model config ---------------- #
 
20
  self.num_classes = 80
 
21
  self.depth = 1.00
 
22
  self.width = 1.00
23
- self.act = 'silu'
 
24
 
25
  # ---------------- dataloader config ---------------- #
26
  # set worker to 4 for shorter dataloader init time
 
27
  self.data_num_workers = 4
28
  self.input_size = (640, 640) # (height, width)
29
- # Actual multiscale ranges: [640-5*32, 640+5*32].
30
- # To disable multiscale training, set the
31
- # self.multiscale_range to 0.
32
  self.multiscale_range = 5
33
  # You can uncomment this line to specify a multiscale range
34
  # self.random_size = (14, 26)
 
35
  self.data_dir = None
 
36
  self.train_ann = "instances_train2017.json"
 
37
  self.val_ann = "instances_val2017.json"
 
38
  self.test_ann = "instances_test2017.json"
39
 
40
  # --------------- transform config ----------------- #
 
41
  self.mosaic_prob = 1.0
 
42
  self.mixup_prob = 1.0
 
43
  self.hsv_prob = 1.0
 
44
  self.flip_prob = 0.5
 
45
  self.degrees = 10.0
 
46
  self.translate = 0.1
47
  self.mosaic_scale = (0.1, 2)
 
 
48
  self.mixup_scale = (0.5, 1.5)
 
49
  self.shear = 2.0
50
- self.enable_mixup = True
51
 
52
  # -------------- training config --------------------- #
 
53
  self.warmup_epochs = 5
 
54
  self.max_epoch = 300
 
55
  self.warmup_lr = 0
 
 
56
  self.basic_lr_per_img = 0.01 / 64.0
 
57
  self.scheduler = "yoloxwarmcos"
 
58
  self.no_aug_epochs = 15
59
- self.min_lr_ratio = 0.05
60
  self.ema = True
61
 
 
62
  self.weight_decay = 5e-4
 
63
  self.momentum = 0.9
 
 
64
  self.print_interval = 10
 
 
65
  self.eval_interval = 10
 
 
 
 
66
  self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
67
 
68
  # ----------------- testing config ------------------ #
 
69
  self.test_size = (640, 640)
 
 
70
  self.test_conf = 0.01
 
71
  self.nmsthre = 0.65
72
 
73
  def get_model(self):
 
17
  super().__init__()
18
 
19
  # ---------------- model config ---------------- #
20
+ # detect classes number of model
21
  self.num_classes = 80
22
+ # factor of model depth
23
  self.depth = 1.00
24
+ # factor of model width
25
  self.width = 1.00
26
+ # activation name. For example, if using "relu", then "silu" will be replaced with "relu".
27
+ self.act = "silu"
28
 
29
  # ---------------- dataloader config ---------------- #
30
  # set worker to 4 for shorter dataloader init time
31
+ # If your training process uses too much memory, reduce this value.
32
  self.data_num_workers = 4
33
  self.input_size = (640, 640) # (height, width)
34
+ # Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32].
35
+ # To disable multiscale training, set the value to 0.
 
36
  self.multiscale_range = 5
37
  # You can uncomment this line to specify a multiscale range
38
  # self.random_size = (14, 26)
39
+ # dir of dataset images, if data_dir is None, this project will use `datasets` dir
40
  self.data_dir = None
41
+ # name of annotation file for training
42
  self.train_ann = "instances_train2017.json"
43
+ # name of annotation file for evaluation
44
  self.val_ann = "instances_val2017.json"
45
+ # name of annotation file for testing
46
  self.test_ann = "instances_test2017.json"
47
 
48
  # --------------- transform config ----------------- #
49
+ # prob of applying mosaic aug
50
  self.mosaic_prob = 1.0
51
+ # prob of applying mixup aug
52
  self.mixup_prob = 1.0
53
+ # prob of applying hsv aug
54
  self.hsv_prob = 1.0
55
+ # prob of applying flip aug
56
  self.flip_prob = 0.5
57
+ # rotation angle range, for example, if set to 2, the true range is (-2, 2)
58
  self.degrees = 10.0
59
+ # translate range, for example, if set to 0.1, the true range is (-0.1, 0.1)
60
  self.translate = 0.1
61
  self.mosaic_scale = (0.1, 2)
62
+ # apply mixup aug or not
63
+ self.enable_mixup = True
64
  self.mixup_scale = (0.5, 1.5)
65
+ # shear angle range, for example, if set to 2, the true range is (-2, 2)
66
  self.shear = 2.0
 
67
 
68
  # -------------- training config --------------------- #
69
+ # epoch number used for warmup
70
  self.warmup_epochs = 5
71
+ # max training epoch
72
  self.max_epoch = 300
73
+ # minimum learning rate during warmup
74
  self.warmup_lr = 0
75
+ self.min_lr_ratio = 0.05
76
+ # learning rate for one image. During training, lr will be multiplied by the batch size.
77
  self.basic_lr_per_img = 0.01 / 64.0
78
+ # name of LRScheduler
79
  self.scheduler = "yoloxwarmcos"
80
+ # number of last epochs in which augmentation like mosaic is turned off
81
  self.no_aug_epochs = 15
82
+ # apply EMA during training
83
  self.ema = True
84
 
85
+ # weight decay of optimizer
86
  self.weight_decay = 5e-4
87
+ # momentum of optimizer
88
  self.momentum = 0.9
89
+ # log period in iter, for example,
90
+ # if set to 1, user could see log every iteration.
91
  self.print_interval = 10
92
+ # eval period in epoch, for example,
93
+ # if set to 1, model will be evaluated after every epoch.
94
  self.eval_interval = 10
95
+ # save history checkpoint or not.
96
+ # If set to False, yolox will only save latest and best ckpt.
97
+ self.save_history_ckpt = True
98
+ # name of experiment
99
  self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
100
 
101
  # ----------------- testing config ------------------ #
102
+ # output image size during evaluation/test
103
  self.test_size = (640, 640)
104
+ # confidence threshold during evaluation/test,
105
+ # boxes whose scores are less than test_conf will be filtered
106
  self.test_conf = 0.01
107
+ # nms threshold
108
  self.nmsthre = 0.65
109
 
110
  def get_model(self):