Tensor initialization on device improvements (#6959)
* Update common.py speed improvements
Eliminate .to() ops where possible for reduced data transfer overhead. Primarily affects warmup and PyTorch Hub inference.
* Updates
* Updates
* Update detect.py
* Update val.py
- models/common.py +1 -1
- val.py +3 -3
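For context, here is a minimal sketch (not part of the diff) of the pattern these changes apply: allocate tensors directly on the target device via the `device=` keyword of the factory functions, instead of building them on the CPU and moving them with `.to()`. The `device` variable and shapes below are illustrative.

```python
import torch

# Assumed target device for the sketch; falls back to CPU so the snippet stays runnable
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Before: allocate on the CPU, then copy host-to-device with .to()
im_old = torch.zeros(1, 3, 640, 640).to(device)

# After: allocate once, directly on the target device (no intermediate CPU tensor)
im_new = torch.zeros(1, 3, 640, 640, device=device)

# The same keyword is accepted by the other factory calls touched in this PR
iouv = torch.linspace(0.5, 0.95, 10, device=device)
scale = torch.tensor((640, 640, 640, 640), device=device)
```

The saving per call is small (one host allocation plus one host-to-device copy), which is why the description notes that warmup and PyTorch Hub inference, the paths where these constructors sit, are the parts primarily affected.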
models/common.py

@@ -466,7 +466,7 @@ class DetectMultiBackend(nn.Module):
         # Warmup model by running inference once
         if self.pt or self.jit or self.onnx or self.engine:  # warmup types
             if isinstance(self.device, torch.device) and self.device.type != 'cpu':  # only warmup GPU models
-                im = torch.zeros(*imgsz).to(self.device).type(torch.half if self.fp16 else torch.float)  # input
+                im = torch.zeros(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device)  # input
                 self.forward(im)  # warmup

     @staticmethod
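The updated line folds both the device placement and the dtype into the constructor, so the previous `.to()` and `.type()` calls disappear. A small sketch of the same construction outside the class, with assumed values for `fp16`, `device`, and `imgsz` (the real ones come from DetectMultiBackend):

```python
import torch

fp16 = True                                                               # assumed model precision
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')   # assumed device
imgsz = (1, 3, 640, 640)                                                  # assumed warmup input shape

# One allocation with the final dtype and placement; nothing to convert or move afterwards
im = torch.zeros(*imgsz, dtype=torch.half if fp16 else torch.float, device=device)
# self.forward(im) would then run a single inference to warm the model up
```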
val.py

@@ -87,7 +87,7 @@ def process_batch(detections, labels, iouv):
             matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
             # matches = matches[matches[:, 2].argsort()[::-1]]
             matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
-        matches = torch.Tensor(matches).to(iouv.device)
+        matches = torch.from_numpy(matches).to(iouv.device)
         correct[matches[:, 1].long()] = matches[:, 2:3] >= iouv
     return correct

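A sketch of the difference between the two constructors used here; the array is a stand-in for the `[label, detection, iou]` matches built in `process_batch()`, and the values are illustrative.

```python
import numpy as np
import torch

matches = np.array([[0, 1, 0.87], [1, 0, 0.64]])  # float64 ndarray, as produced by .cpu().numpy()

a = torch.Tensor(matches)      # old: copies the data and casts to the default torch.float32
b = torch.from_numpy(matches)  # new: zero-copy view that keeps the NumPy dtype (float64)

print(a.dtype, b.dtype)        # torch.float32 torch.float64
# Either way the result is then moved with .to(iouv.device); from_numpy() simply
# avoids the extra host-side copy before that transfer.
```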
@@ -155,7 +155,7 @@ def run(data,
     cuda = device.type != 'cpu'
     is_coco = isinstance(data.get('val'), str) and data['val'].endswith('coco/val2017.txt')  # COCO dataset
     nc = 1 if single_cls else int(data['nc'])  # number of classes
-    iouv = torch.linspace(0.5, 0.95, 10).to(device)  # iou vector for mAP@0.5:0.95
+    iouv = torch.linspace(0.5, 0.95, 10, device=device)  # iou vector for mAP@0.5:0.95
     niou = iouv.numel()

     # Dataloader
@@ -196,7 +196,7 @@ def run(data,
             loss += compute_loss([x.float() for x in train_out], targets)[1]  # box, obj, cls

         # NMS
-        targets[:, 2:] *= torch.Tensor([width, height, width, height]).to(device)  # to pixels
+        targets[:, 2:] *= torch.tensor((width, height, width, height), device=device)  # to pixels
         lb = [targets[targets[:, 0] == i, 1:] for i in range(nb)] if save_hybrid else []  # for autolabelling
         t3 = time_sync()
         out = non_max_suppression(out, conf_thres, iou_thres, labels=lb, multi_label=True, agnostic=single_cls)
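Both run() changes follow the same pattern: the IoU threshold vector and the pixel scale vector are now created directly on `device`. A sketch with stand-in values; the `width`/`height` and the example `targets` row are assumptions, not taken from the diff.

```python
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # assumed device
width, height = 640, 640                                                 # assumed letterboxed image size

iouv = torch.linspace(0.5, 0.95, 10, device=device)  # IoU thresholds for mAP@0.5:0.95
niou = iouv.numel()                                   # 10 thresholds

# targets rows are [image_index, class, x, y, w, h] with xywh normalized to 0-1;
# multiplying by an on-device scale vector converts them to pixel coordinates
targets = torch.tensor([[0.0, 0.0, 0.5, 0.5, 0.2, 0.3]], device=device)  # illustrative row
targets[:, 2:] *= torch.tensor((width, height, width, height), device=device)  # to pixels
```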