Commit
·
520f5de
1
Parent(s):
f310ca3
Label caching foundational re-write #306
Browse files- utils/datasets.py +51 -54
utils/datasets.py
CHANGED
@@ -26,6 +26,11 @@ for orientation in ExifTags.TAGS.keys():
|
|
26 |
break
|
27 |
|
28 |
|
|
|
|
|
|
|
|
|
|
|
29 |
def exif_size(img):
|
30 |
# Returns exif-corrected PIL size
|
31 |
s = img.size # (width, height)
|
@@ -280,7 +285,7 @@ class LoadImagesAndLabels(Dataset): # for training/testing
|
|
280 |
def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
|
281 |
cache_images=False, single_cls=False, stride=32, pad=0.0):
|
282 |
try:
|
283 |
-
f = []
|
284 |
for p in path if isinstance(path, list) else [path]:
|
285 |
p = str(Path(p)) # os-agnostic
|
286 |
parent = str(Path(p).parent) + os.sep
|
@@ -292,7 +297,6 @@ class LoadImagesAndLabels(Dataset): # for training/testing
|
|
292 |
f += glob.iglob(p + os.sep + '*.*')
|
293 |
else:
|
294 |
raise Exception('%s does not exist' % p)
|
295 |
-
path = p # *.npy dir
|
296 |
self.img_files = [x.replace('/', os.sep) for x in f if os.path.splitext(x)[-1].lower() in img_formats]
|
297 |
except Exception as e:
|
298 |
raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url))
|
@@ -314,20 +318,22 @@ class LoadImagesAndLabels(Dataset): # for training/testing
|
|
314 |
self.stride = stride
|
315 |
|
316 |
# Define labels
|
317 |
-
self.label_files = [x.replace('images', 'labels').replace(os.path.splitext(x)[-1], '.txt')
|
318 |
-
|
319 |
-
|
320 |
-
#
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
np.savetxt(sp, s, fmt='%g') # overwrites existing (if any)
|
329 |
|
330 |
-
|
|
|
|
|
|
|
331 |
|
332 |
# Rectangular Training https://github.com/ultralytics/yolov3/issues/232
|
333 |
if self.rect:
|
@@ -353,33 +359,11 @@ class LoadImagesAndLabels(Dataset): # for training/testing
|
|
353 |
self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride
|
354 |
|
355 |
# Cache labels
|
356 |
-
self.imgs = [None] * n
|
357 |
-
self.labels = [np.zeros((0, 5), dtype=np.float32)] * n
|
358 |
create_datasubset, extract_bounding_boxes, labels_loaded = False, False, False
|
359 |
nm, nf, ne, ns, nd = 0, 0, 0, 0, 0 # number missing, found, empty, datasubset, duplicate
|
360 |
-
np_labels_path = str(Path(self.label_files[0]).parent) + '.npy' # saved labels in *.npy file
|
361 |
-
if os.path.isfile(np_labels_path):
|
362 |
-
s = np_labels_path # print string
|
363 |
-
x = np.load(np_labels_path, allow_pickle=True)
|
364 |
-
if len(x) == n:
|
365 |
-
self.labels = x
|
366 |
-
labels_loaded = True
|
367 |
-
else:
|
368 |
-
s = path.replace('images', 'labels')
|
369 |
-
|
370 |
pbar = tqdm(self.label_files)
|
371 |
for i, file in enumerate(pbar):
|
372 |
-
|
373 |
-
l = self.labels[i]
|
374 |
-
# np.savetxt(file, l, '%g') # save *.txt from *.npy file
|
375 |
-
else:
|
376 |
-
try:
|
377 |
-
with open(file, 'r') as f:
|
378 |
-
l = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)
|
379 |
-
except:
|
380 |
-
nm += 1 # print('missing labels for image %s' % self.img_files[i]) # file missing
|
381 |
-
continue
|
382 |
-
|
383 |
if l.shape[0]:
|
384 |
assert l.shape[1] == 5, '> 5 label columns: %s' % file
|
385 |
assert (l >= 0).all(), 'negative labels: %s' % file
|
@@ -425,15 +409,13 @@ class LoadImagesAndLabels(Dataset): # for training/testing
|
|
425 |
ne += 1 # print('empty labels for image %s' % self.img_files[i]) # file empty
|
426 |
# os.system("rm '%s' '%s'" % (self.img_files[i], self.label_files[i])) # remove
|
427 |
|
428 |
-
pbar.desc = '
|
429 |
-
|
430 |
-
assert nf > 0
|
431 |
-
if not labels_loaded and n > 1000:
|
432 |
-
print('Saving labels to %s for faster future loading' % np_labels_path)
|
433 |
-
np.save(np_labels_path, self.labels) # save for next time
|
434 |
|
435 |
# Cache images into memory for faster training (WARNING: large datasets may exceed system RAM)
|
436 |
-
|
|
|
437 |
gb = 0 # Gigabytes of cached images
|
438 |
pbar = tqdm(range(len(self.img_files)), desc='Caching images')
|
439 |
self.img_hw0, self.img_hw = [None] * n, [None] * n
|
@@ -442,15 +424,30 @@ class LoadImagesAndLabels(Dataset): # for training/testing
|
|
442 |
gb += self.imgs[i].nbytes
|
443 |
pbar.desc = 'Caching images (%.1fGB)' % (gb / 1E9)
|
444 |
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
454 |
|
455 |
def __len__(self):
    """Return the number of entries in ``self.img_files``."""
    image_count = len(self.img_files)
    return image_count
|
|
|
26 |
break
|
27 |
|
28 |
|
29 |
+
def get_hash(files):
    """Return a single hash value for a list of files.

    The value is the sum of the on-disk sizes (in bytes) of the given paths,
    so it changes whenever a file is added, removed or resized.

    Args:
        files: iterable of file paths. Paths that do not refer to an existing
            file are skipped instead of raising, because callers pass label
            paths derived from image paths and those label files may be absent.

    Returns:
        int: total size in bytes of the existing files (0 for an empty input).
    """
    return sum(os.path.getsize(f) for f in files if os.path.isfile(f))
|
32 |
+
|
33 |
+
|
34 |
def exif_size(img):
|
35 |
# Returns exif-corrected PIL size
|
36 |
s = img.size # (width, height)
|
|
|
285 |
def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
|
286 |
cache_images=False, single_cls=False, stride=32, pad=0.0):
|
287 |
try:
|
288 |
+
f = [] # image files
|
289 |
for p in path if isinstance(path, list) else [path]:
|
290 |
p = str(Path(p)) # os-agnostic
|
291 |
parent = str(Path(p).parent) + os.sep
|
|
|
297 |
f += glob.iglob(p + os.sep + '*.*')
|
298 |
else:
|
299 |
raise Exception('%s does not exist' % p)
|
|
|
300 |
self.img_files = [x.replace('/', os.sep) for x in f if os.path.splitext(x)[-1].lower() in img_formats]
|
301 |
except Exception as e:
|
302 |
raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url))
|
|
|
318 |
self.stride = stride
|
319 |
|
320 |
# Define labels
|
321 |
+
self.label_files = [x.replace('images', 'labels').replace(os.path.splitext(x)[-1], '.txt') for x in
|
322 |
+
self.img_files]
|
323 |
+
|
324 |
+
# Check cache
|
325 |
+
cache_path = str(Path(self.label_files[0]).parent) + '.cache' # cached labels
|
326 |
+
if os.path.isfile(cache_path):
|
327 |
+
cache = torch.load(cache_path) # load
|
328 |
+
if cache['hash'] != get_hash(self.label_files + self.img_files): # dataset changed
|
329 |
+
cache = self.cache_labels(cache_path) # re-cache
|
330 |
+
else:
|
331 |
+
cache = self.cache_labels(cache_path) # cache
|
|
|
332 |
|
333 |
+
# Get labels
|
334 |
+
labels, shapes = zip(*[cache[x] for x in self.img_files])
|
335 |
+
self.shapes = np.array(shapes, dtype=np.float64)
|
336 |
+
self.labels = list(labels)
|
337 |
|
338 |
# Rectangular Training https://github.com/ultralytics/yolov3/issues/232
|
339 |
if self.rect:
|
|
|
359 |
self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride
|
360 |
|
361 |
# Cache labels
|
|
|
|
|
362 |
create_datasubset, extract_bounding_boxes, labels_loaded = False, False, False
|
363 |
nm, nf, ne, ns, nd = 0, 0, 0, 0, 0 # number missing, found, empty, datasubset, duplicate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
pbar = tqdm(self.label_files)
|
365 |
for i, file in enumerate(pbar):
|
366 |
+
l = self.labels[i] # label
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
if l.shape[0]:
|
368 |
assert l.shape[1] == 5, '> 5 label columns: %s' % file
|
369 |
assert (l >= 0).all(), 'negative labels: %s' % file
|
|
|
409 |
ne += 1 # print('empty labels for image %s' % self.img_files[i]) # file empty
|
410 |
# os.system("rm '%s' '%s'" % (self.img_files[i], self.label_files[i])) # remove
|
411 |
|
412 |
+
pbar.desc = 'Scanning labels %s (%g found, %g missing, %g empty, %g duplicate, for %g images)' % (
|
413 |
+
cache_path, nf, nm, ne, nd, n)
|
414 |
+
assert nf > 0, 'No labels found in %s. See %s' % (os.path.dirname(file) + os.sep, help_url)
|
|
|
|
|
|
|
415 |
|
416 |
# Cache images into memory for faster training (WARNING: large datasets may exceed system RAM)
|
417 |
+
self.imgs = [None] * n
|
418 |
+
if cache_images:
|
419 |
gb = 0 # Gigabytes of cached images
|
420 |
pbar = tqdm(range(len(self.img_files)), desc='Caching images')
|
421 |
self.img_hw0, self.img_hw = [None] * n, [None] * n
|
|
|
424 |
gb += self.imgs[i].nbytes
|
425 |
pbar.desc = 'Caching images (%.1fGB)' % (gb / 1E9)
|
426 |
|
427 |
+
def cache_labels(self, path='labels.cache'):
    """Scan all image/label pairs, verify the images and cache the results.

    For every pair from ``self.img_files`` / ``self.label_files`` the image is
    opened and verified with PIL, its size is read via ``exif_size`` and the
    label file (if it exists) is parsed into a float32 array. A missing or
    empty label file yields a ``(0, 5)`` float32 array.

    Args:
        path: destination file for the serialized cache.

    Returns:
        dict: maps each image path to ``[labels, shape]``, or to ``None`` when
        processing that pair raised (a warning is printed). The extra
        ``'hash'`` key stores ``get_hash`` of all label and image paths.
    """
    cache = {}
    pairs = zip(self.img_files, self.label_files)
    progress = tqdm(pairs, desc='Scanning images', total=len(self.img_files))
    for img_path, label_path in progress:
        try:
            im = Image.open(img_path)
            im.verify()  # PIL integrity check
            size = exif_size(im)  # image size

            labels = []
            if os.path.isfile(label_path):
                with open(label_path, 'r') as fh:
                    rows = [line.split() for line in fh.read().splitlines()]
                labels = np.array(rows, dtype=np.float32)
            if len(labels) == 0:
                # no label file, or an empty one
                labels = np.zeros((0, 5), dtype=np.float32)

            cache[img_path] = [labels, size]
        except Exception as err:
            # keep an entry for the image so the failure is recorded in the cache
            cache[img_path] = None
            print('WARNING: %s: %s' % (img_path, err))

    cache['hash'] = get_hash(self.label_files + self.img_files)
    torch.save(cache, path)  # save for next time
    return cache
|
451 |
|
452 |
def __len__(self):
    """Return the number of entries in ``self.img_files``."""
    image_count = len(self.img_files)
    return image_count
|