Spaces:
Runtime error
Runtime error
File size: 2,122 Bytes
e086001 e0cedf5 e086001 e0cedf5 e086001 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import imagehash
import os
from collections import deque
from PIL import Image
from tqdm import tqdm
def find_similar_images(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Scan ``base_dir`` and flag images that look like recently seen ones.

    Files are visited in sorted filename order. Each one is hashed with
    ``hashfunc`` and classified as a duplicate when either:

    * its hash is exactly equal to a hash already recorded in ``hash_dict``, or
    * its hash differs by at most ``threshold`` from any of the last
      ``queue_len`` hashes that were accepted as unique (the ``-`` operator
      on ImageHash objects yields the Hamming distance).

    Args:
        base_dir: Directory whose entries are all opened as images.
        hash_size: Passed through to ``hashfunc`` as ``hash_size``.
        hashfunc: Callable ``(image, hash_size=...) -> hash``; the result
            must be hashable and support ``-`` against another hash.
        queue_len: Number of most recent unique hashes kept for the
            near-duplicate comparison.
        threshold: Maximum hash difference still treated as a duplicate.

    Returns:
        A ``(hash_dict, duplicates)`` tuple. ``hash_dict`` maps every distinct
        hash to the first filename that produced it (this includes files that
        were subsequently flagged as near-duplicates). ``duplicates`` is the
        list of filenames flagged as duplicates, in visiting order.
    """
    snapshots_files = sorted(os.listdir(base_dir))
    hash_dict = {}
    # Bounded window: only the most recent unique frames are compared against.
    hash_queue = deque([], maxlen=queue_len)
    duplicates = []

    print("---" * 5, "Finding similar files", "---" * 5)
    with tqdm(snapshots_files) as t:
        for file in t:
            # Use a context manager so the underlying file handle is released
            # as soon as the hash is computed; otherwise one descriptor per
            # image stays open until garbage collection.
            with Image.open(os.path.join(base_dir, file)) as read_file:
                comp_hash = hashfunc(read_file, hash_size=hash_size)

            if comp_hash in hash_dict:
                # Exact hash collision with an earlier file.
                duplicate = True
            else:
                hash_dict[comp_hash] = file
                # Compare with hash queue to find out potential duplicates
                duplicate = any(
                    img_hash - comp_hash <= threshold for img_hash in hash_queue
                )
                if not duplicate:
                    hash_queue.append(comp_hash)

            if duplicate:
                duplicates.append(file)
                t.set_postfix_str(f"Duplicate files: {len(duplicates)}")
    return hash_dict, duplicates
def remove_duplicates(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Delete every file in ``base_dir`` that ``find_similar_images`` flags.

    All parameters are forwarded unchanged to ``find_similar_images``; the
    hash dictionary it returns is ignored and only the duplicate filenames
    are acted upon. Progress messages are printed to stdout. Returns ``None``.
    """
    _, flagged = find_similar_images(
        base_dir,
        hash_size=hash_size,
        hashfunc=hashfunc,
        queue_len=queue_len,
        threshold=threshold,
    )

    if flagged:
        print("Removing duplicates...")
        for name in flagged:
            target = os.path.join(base_dir, name)
            if not os.path.exists(target):
                # Nothing to delete for this entry; report it and move on.
                print("Filepath: ", target, "does not exists.")
                continue
            os.remove(target)
        print("All duplicates removed!")
    else:
        print("No duplicates found!")

    # Visual separator printed regardless of the outcome above.
    print("***" * 10, "\n")
if __name__ == "__main__":
    # Script entry point: de-duplicate the "sample_1" directory using the
    # default hashing parameters.
    remove_duplicates(base_dir="sample_1")
|