DDP `WORLD_SIZE`-safe dataloader workers (#5631)
Browse files
* WORLD_SIZE-safe workers
* Update with DDP comment
- train.py +2 -2
- utils/datasets.py +2 -1
train.py
CHANGED
@@ -266,7 +266,7 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
|
|
266 |
stopper = EarlyStopping(patience=opt.patience)
|
267 |
compute_loss = ComputeLoss(model) # init loss class
|
268 |
LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
|
269 |
-
f'Using {train_loader.num_workers} dataloader workers\n'
|
270 |
f"Logging results to {colorstr('bold', save_dir)}\n"
|
271 |
f'Starting training for {epochs} epochs...')
|
272 |
for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------
|
@@ -460,7 +460,7 @@ def parse_opt(known=False):
|
|
460 |
parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')
|
461 |
parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
|
462 |
parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
|
463 |
-
parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
|
464 |
parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name')
|
465 |
parser.add_argument('--name', default='exp', help='save to project/name')
|
466 |
parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
|
|
|
266 |
stopper = EarlyStopping(patience=opt.patience)
|
267 |
compute_loss = ComputeLoss(model) # init loss class
|
268 |
LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
|
269 |
+
f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n'
|
270 |
f"Logging results to {colorstr('bold', save_dir)}\n"
|
271 |
f'Starting training for {epochs} epochs...')
|
272 |
for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------
|
|
|
460 |
parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')
|
461 |
parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
|
462 |
parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
|
463 |
+
parser.add_argument('--workers', type=int, default=8, help='max dataloader workers (per RANK in DDP mode)')
|
464 |
parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name')
|
465 |
parser.add_argument('--name', default='exp', help='save to project/name')
|
466 |
parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
|
utils/datasets.py
CHANGED
@@ -34,6 +34,7 @@ from utils.torch_utils import torch_distributed_zero_first
|
|
34 |
HELP_URL = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data'
|
35 |
IMG_FORMATS = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng', 'webp', 'mpo'] # acceptable image suffixes
|
36 |
VID_FORMATS = ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv'] # acceptable video suffixes
|
|
|
37 |
NUM_THREADS = min(8, os.cpu_count()) # number of multiprocessing threads
|
38 |
|
39 |
# Get orientation exif tag
|
@@ -107,7 +108,7 @@ def create_dataloader(path, imgsz, batch_size, stride, single_cls=False, hyp=Non
|
|
107 |
prefix=prefix)
|
108 |
|
109 |
batch_size = min(batch_size, len(dataset))
|
110 |
-
nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, workers]) # number of workers
|
111 |
sampler = torch.utils.data.distributed.DistributedSampler(dataset) if rank != -1 else None
|
112 |
loader = torch.utils.data.DataLoader if image_weights else InfiniteDataLoader
|
113 |
# Use torch.utils.data.DataLoader() if dataset.properties will update during training else InfiniteDataLoader()
|
|
|
34 |
HELP_URL = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data'
|
35 |
IMG_FORMATS = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng', 'webp', 'mpo'] # acceptable image suffixes
|
36 |
VID_FORMATS = ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv'] # acceptable video suffixes
|
37 |
+
WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) # DDP
|
38 |
NUM_THREADS = min(8, os.cpu_count()) # number of multiprocessing threads
|
39 |
|
40 |
# Get orientation exif tag
|
|
|
108 |
prefix=prefix)
|
109 |
|
110 |
batch_size = min(batch_size, len(dataset))
|
111 |
+
nw = min([os.cpu_count() // WORLD_SIZE, batch_size if batch_size > 1 else 0, workers]) # number of workers
|
112 |
sampler = torch.utils.data.distributed.DistributedSampler(dataset) if rank != -1 else None
|
113 |
loader = torch.utils.data.DataLoader if image_weights else InfiniteDataLoader
|
114 |
# Use torch.utils.data.DataLoader() if dataset.properties will update during training else InfiniteDataLoader()
|