Autofix duplicate label handling (#5210)
Browse files
* Autofix duplicate labels
This PR changes duplicate-label handling: instead of reporting an error and ignoring the image-label pair, it now reports a warning and automatically fixes the image-label pair by removing the duplicate label rows.
This should resolve a common issue for users and allow everyone to get started and train a model faster and more easily than before.
* sign fix
* Cleanup
* Increment cache version
* all to any fix
- utils/datasets.py +12 -8
utils/datasets.py
CHANGED
@@ -375,7 +375,7 @@ def img2label_paths(img_paths):
|
|
375 |
|
376 |
class LoadImagesAndLabels(Dataset):
|
377 |
# YOLOv5 train_loader/val_loader, loads images and labels for training and validation
|
378 |
-
cache_version = 0.
|
379 |
|
380 |
def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
|
381 |
cache_images=False, single_cls=False, stride=32, pad=0.0, prefix=''):
|
@@ -897,7 +897,7 @@ def verify_image_label(args):
|
|
897 |
f.seek(-2, 2)
|
898 |
if f.read() != b'\xff\xd9': # corrupt JPEG
|
899 |
Image.open(im_file).save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
|
900 |
-
msg = f'{prefix}WARNING: corrupt JPEG restored and saved
|
901 |
|
902 |
# verify labels
|
903 |
if os.path.isfile(lb_file):
|
@@ -909,11 +909,15 @@ def verify_image_label(args):
|
|
909 |
segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in l] # (cls, xy1...)
|
910 |
l = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh)
|
911 |
l = np.array(l, dtype=np.float32)
|
912 |
-
|
913 |
-
|
914 |
-
assert
|
915 |
-
assert (l
|
916 |
-
assert
|
|
|
|
|
|
|
|
|
917 |
else:
|
918 |
ne = 1 # label empty
|
919 |
l = np.zeros((0, 5), dtype=np.float32)
|
@@ -923,7 +927,7 @@ def verify_image_label(args):
|
|
923 |
return im_file, l, shape, segments, nm, nf, ne, nc, msg
|
924 |
except Exception as e:
|
925 |
nc = 1
|
926 |
-
msg = f'{prefix}WARNING:
|
927 |
return [None, None, None, None, nm, nf, ne, nc, msg]
|
928 |
|
929 |
|
|
|
375 |
|
376 |
class LoadImagesAndLabels(Dataset):
|
377 |
# YOLOv5 train_loader/val_loader, loads images and labels for training and validation
|
378 |
+
cache_version = 0.6 # dataset labels *.cache version
|
379 |
|
380 |
def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
|
381 |
cache_images=False, single_cls=False, stride=32, pad=0.0, prefix=''):
|
|
|
897 |
f.seek(-2, 2)
|
898 |
if f.read() != b'\xff\xd9': # corrupt JPEG
|
899 |
Image.open(im_file).save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
|
900 |
+
msg = f'{prefix}WARNING: {im_file}: corrupt JPEG restored and saved'
|
901 |
|
902 |
# verify labels
|
903 |
if os.path.isfile(lb_file):
|
|
|
909 |
segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in l] # (cls, xy1...)
|
910 |
l = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh)
|
911 |
l = np.array(l, dtype=np.float32)
|
912 |
+
nl = len(l)
|
913 |
+
if nl:
|
914 |
+
assert l.shape[1] == 5, f'labels require 5 columns, {l.shape[1]} columns detected'
|
915 |
+
assert (l >= 0).all(), f'negative label values {l[l < 0]}'
|
916 |
+
assert (l[:, 1:] <= 1).all(), f'non-normalized or out of bounds coordinates {l[:, 1:][l[:, 1:] > 1]}'
|
917 |
+
l = np.unique(l, axis=0) # remove duplicate rows
|
918 |
+
if len(l) < nl:
|
919 |
+
segments = np.unique(segments, axis=0)
|
920 |
+
msg = f'{prefix}WARNING: {im_file}: {nl - len(l)} duplicate labels removed'
|
921 |
else:
|
922 |
ne = 1 # label empty
|
923 |
l = np.zeros((0, 5), dtype=np.float32)
|
|
|
927 |
return im_file, l, shape, segments, nm, nf, ne, nc, msg
|
928 |
except Exception as e:
|
929 |
nc = 1
|
930 |
+
msg = f'{prefix}WARNING: {im_file}: ignoring corrupt image/label: {e}'
|
931 |
return [None, None, None, None, nm, nf, ne, nc, msg]
|
932 |
|
933 |
|