glenn-jocher committed on
Commit
597ed4c
·
unverified ·
1 Parent(s): 1aa2b67

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +17 -12
train.py CHANGED
@@ -79,7 +79,6 @@ def train(hyp):
79
  # Create model
80
  model = Model(opt.cfg).to(device)
81
  assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (opt.data, nc, opt.cfg, model.md['nc'])
82
-
83
 
84
  # Image sizes
85
  gs = int(max(model.stride)) # grid size (max stride)
@@ -133,7 +132,13 @@ def train(hyp):
133
  with open(results_file, 'w') as file:
134
  file.write(ckpt['training_results']) # write results.txt
135
 
 
136
  start_epoch = ckpt['epoch'] + 1
 
 
 
 
 
137
  del ckpt
138
 
139
  # Mixed precision training https://github.com/NVIDIA/apex
@@ -147,6 +152,15 @@ def train(hyp):
147
  # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
148
  # plot_lr_scheduler(optimizer, scheduler, epochs)
149
 
 
 
 
 
 
 
 
 
 
150
  # Trainloader
151
  dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
152
  hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect)
@@ -155,7 +169,7 @@ def train(hyp):
155
 
156
  # Testloader
157
  testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, opt,
158
- hyp=hyp, augment=False, cache=opt.cache_images, rect=True)[0]
159
 
160
  # Model parameters
161
  hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset
@@ -164,15 +178,6 @@ def train(hyp):
164
  model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou)
165
  model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
166
  model.names = data_dict['names']
167
-
168
- # Initialize distributed training
169
- if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
170
- dist.init_process_group(backend='nccl', # distributed backend
171
- init_method='tcp://127.0.0.1:9999', # init method
172
- world_size=1, # number of nodes
173
- rank=0) # node rank
174
- model = torch.nn.parallel.DistributedDataParallel(model)
175
- # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
176
 
177
  # Class frequency
178
  labels = np.concatenate(dataset.labels, 0)
@@ -373,7 +378,7 @@ if __name__ == '__main__':
373
  parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%')
374
  parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
375
  opt = parser.parse_args()
376
- opt.weights = last if opt.resume else opt.weights
377
  opt.cfg = check_file(opt.cfg) # check file
378
  opt.data = check_file(opt.data) # check file
379
  print(opt)
 
79
  # Create model
80
  model = Model(opt.cfg).to(device)
81
  assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (opt.data, nc, opt.cfg, model.md['nc'])
 
82
 
83
  # Image sizes
84
  gs = int(max(model.stride)) # grid size (max stride)
 
132
  with open(results_file, 'w') as file:
133
  file.write(ckpt['training_results']) # write results.txt
134
 
135
+ # epochs
136
  start_epoch = ckpt['epoch'] + 1
137
+ if epochs < start_epoch:
138
+ print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
139
+ (opt.weights, ckpt['epoch'], epochs))
140
+ epochs += ckpt['epoch'] # finetune additional epochs
141
+
142
  del ckpt
143
 
144
  # Mixed precision training https://github.com/NVIDIA/apex
 
152
  # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
153
  # plot_lr_scheduler(optimizer, scheduler, epochs)
154
 
155
+ # Initialize distributed training
156
+ if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
157
+ dist.init_process_group(backend='nccl', # distributed backend
158
+ init_method='tcp://127.0.0.1:9999', # init method
159
+ world_size=1, # number of nodes
160
+ rank=0) # node rank
161
+ model = torch.nn.parallel.DistributedDataParallel(model)
162
+ # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
163
+
164
  # Trainloader
165
  dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
166
  hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect)
 
169
 
170
  # Testloader
171
  testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, opt,
172
+ hyp=hyp, augment=False, cache=opt.cache_images, rect=True)[0]
173
 
174
  # Model parameters
175
  hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset
 
178
  model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou)
179
  model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
180
  model.names = data_dict['names']
 
 
 
 
 
 
 
 
 
181
 
182
  # Class frequency
183
  labels = np.concatenate(dataset.labels, 0)
 
378
  parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%')
379
  parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
380
  opt = parser.parse_args()
381
+ opt.weights = last if opt.resume and not opt.weights else opt.weights
382
  opt.cfg = check_file(opt.cfg) # check file
383
  opt.data = check_file(opt.data) # check file
384
  print(opt)