Rudra Rahul Chothe committed on
Commit
3cfebcb
·
verified ·
1 Parent(s): e6ab28d

Update src/preprocessing.py

Browse files
Files changed (1) hide show
  1. src/preprocessing.py +44 -62
src/preprocessing.py CHANGED
@@ -1,62 +1,44 @@
1
- import os
2
- import pickle
3
- from .feature_extractor import FeatureExtractor
4
- import time
5
- from tqdm import tqdm
6
-
7
def precompute_embeddings(image_dir='data/images', output_path='data/embeddings.pkl'):
    """Extract an embedding for every image in ``image_dir`` and pickle them.

    Args:
        image_dir: Directory scanned (non-recursively) for ``.png``, ``.jpg``
            and ``.jpeg`` files; the extension match is case-insensitive.
        output_path: Pickle file that receives a dict with the keys
            ``'embeddings'`` and ``'image_paths'``.

    Returns:
        A ``(embeddings, image_paths)`` tuple of parallel lists covering the
        images whose extraction succeeded. Failures are printed and skipped.
    """
    # Make sure the destination folder exists before the slow extraction
    # starts, so a missing folder cannot discard the results at save time.
    # A bare filename has no parent directory to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Initialize the feature extractor
    extractor = FeatureExtractor()

    embeddings = []
    image_paths = []

    # os.listdir() returns entries in arbitrary order; sort so the saved
    # file is reproducible between runs.
    valid_images = sorted(
        f for f in os.listdir(image_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    total_images = len(valid_images)

    print(f"\nFound {total_images} images to process")

    # Estimate time (assuming ~1 second per image for EfficientNetB0)
    estimated_time = total_images * 1  # 1 second per image
    print(f"Estimated time: {estimated_time//60} minutes and {estimated_time%60} seconds\n")

    # Use tqdm for progress bar
    start_time = time.time()
    for idx, filename in enumerate(tqdm(valid_images, desc="Processing images")):
        # No second extension check here: valid_images is already filtered
        # case-insensitively, and a case-sensitive re-check would silently
        # skip files such as "A.JPG".
        img_path = os.path.join(image_dir, filename)

        # Show current image being processed
        print(f"\rProcessing image {idx+1}/{total_images}: {filename}", end="")

        try:
            embedding = extractor.extract_features(img_path)
        except Exception as e:
            # Best effort: report the failing image and keep going.
            print(f"\nError processing {filename}: {e}")
            continue

        embeddings.append(embedding)
        image_paths.append(img_path)

        # Calculate and show remaining time
        elapsed_time = time.time() - start_time
        avg_time_per_image = elapsed_time / (idx + 1)
        remaining_images = total_images - (idx + 1)
        estimated_remaining_time = remaining_images * avg_time_per_image

        print(f" | Remaining time: {estimated_remaining_time//60:.0f}m {estimated_remaining_time%60:.0f}s")

    # Save embeddings and paths
    with open(output_path, 'wb') as f:
        pickle.dump({'embeddings': embeddings, 'image_paths': image_paths}, f)

    total_time = time.time() - start_time
    print("\nProcessing complete!")
    print(f"Total time taken: {total_time//60:.0f} minutes and {total_time%60:.0f} seconds")
    print(f"Successfully processed {len(embeddings)}/{total_images} images")
    print(f"Embeddings saved to {output_path}")

    return embeddings, image_paths


# Script entry point: build the embeddings file with the default paths.
if __name__ == "__main__":
    precompute_embeddings()
 
1
+ import os
2
+ import pickle
3
+ from .feature_extractor import FeatureExtractor
4
+ import time
5
+ from tqdm import tqdm
6
+
7
def precompute_embeddings(image_dir=None, output_path=None):
    """Extract an embedding for every image in a directory and pickle them.

    Args:
        image_dir: Directory scanned (non-recursively) for ``.png``, ``.jpg``
            and ``.jpeg`` files; the extension match is case-insensitive.
            Defaults to ``<base>/data/images``, where ``<base>`` is the
            parent of the directory that contains this module.
        output_path: Pickle file that receives a dict with the keys
            ``'embeddings'`` and ``'image_paths'``. Defaults to
            ``<base>/data/embeddings.pkl``.

    Returns:
        A ``(embeddings, image_paths)`` tuple of parallel lists covering the
        images whose extraction succeeded. Failures are printed and skipped.
    """
    # Anchor the defaults to this file instead of the working directory so
    # the paths resolve the same way wherever the process is started
    # (e.g. on Hugging Face Spaces).
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if image_dir is None:
        image_dir = os.path.join(base_dir, 'data', 'images')
    if output_path is None:
        output_path = os.path.join(base_dir, 'data', 'embeddings.pkl')

    # Create both locations up front so listing and saving cannot fail on a
    # fresh checkout. A bare filename has no parent directory to create.
    os.makedirs(image_dir, exist_ok=True)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    extractor = FeatureExtractor()
    embeddings = []
    image_paths = []

    # os.listdir() returns entries in arbitrary order; sort so the saved
    # file is reproducible between runs.
    valid_images = sorted(
        f for f in os.listdir(image_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    total_images = len(valid_images)

    print(f"\nFound {total_images} images to process")

    for filename in tqdm(valid_images, desc="Processing images"):
        img_path = os.path.join(image_dir, filename)
        try:
            embedding = extractor.extract_features(img_path)
        except Exception as e:
            # Best effort: report the failing image and keep going.
            print(f"\nError processing {filename}: {e}")
        else:
            embeddings.append(embedding)
            image_paths.append(img_path)

    with open(output_path, 'wb') as f:
        pickle.dump({'embeddings': embeddings, 'image_paths': image_paths}, f)

    print("\nProcessing complete!")
    print(f"Successfully processed {len(embeddings)}/{total_images} images")

    return embeddings, image_paths