import functools
import logging
import os
import random  # NOTE: currently unused; kept so the module's namespace is unchanged
import shutil
from datetime import datetime

import cv2

# Set up logging configuration
log_file = "sample_images.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# File extensions (compared case-insensitively) that are treated as images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')


@functools.lru_cache(maxsize=1)
def _load_face_cascade():
    """Load the frontal-face Haar cascade once and reuse it for every image."""
    return cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )


def detect_faces(image_path):
    """Return True if at least one frontal face is detected in the image.

    The image is read in grayscale and passed to the Haar cascade detector.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True when the detector reports one or more faces; False when it
        reports none or when OpenCV cannot read the file.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # imread signals an unreadable/unsupported file by returning None.
        return False

    faces = _load_face_cascade().detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )
    return len(faces) > 0


def _link_or_copy(src, dst):
    """Hard-link src to dst, copying instead if linking is not possible.

    Returns:
        True if dst was created, False if it already existed and was skipped.
    """
    try:
        os.link(src, dst)
    except FileExistsError:
        # A previous run already produced this file; do not abort the batch.
        logging.warning("Skipping %s: %s already exists", src, dst)
        return False
    except OSError as err:
        # E.g. source and destination live on different filesystems.
        logging.warning(
            "Hard link failed for %s (%s); copying instead", src, err
        )
        shutil.copy2(src, dst)
    return True


def sample_images(input_folder, output_folder, sample_rate=0.2):
    """Mirror every image containing a face from input_folder to output_folder.

    The directory tree under input_folder is recreated under output_folder,
    and each image file in which detect_faces finds a face is hard-linked
    (or copied, if linking fails) to the matching location. Progress is
    printed after each directory and a summary is written to the log.

    Args:
        input_folder: Root directory to scan recursively.
        output_folder: Root directory that receives the selected images.
        sample_rate: Accepted for backward compatibility. NOTE: it is not
            used; every image with a detected face is selected.
    """
    os.makedirs(output_folder, exist_ok=True)

    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    for root, dirs, files in os.walk(input_folder):
        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # The total counts every file seen, not only images.
        total_files += len(files)

        # Select the image files in this directory that contain a face.
        selected = [
            name
            for name in files
            if name.lower().endswith(_IMAGE_EXTENSIONS)
            and detect_faces(os.path.join(root, name))
        ]
        sampled_files += len(selected)

        for name in selected:
            input_file_path = os.path.join(root, name)
            output_file_path = os.path.join(output_subfolder, name)
            if _link_or_copy(input_file_path, output_file_path):
                logging.info(
                    "Sampled and copied %s to %s",
                    input_file_path,
                    output_file_path,
                )

        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    end_time = datetime.now()
    total_time = end_time - start_time
    logging.info(f"Total time taken: {total_time}")
    logging.info(f"Sampled {sampled_files} out of {total_files} files.")


if __name__ == "__main__":
    input_folder = "EvalSet"
    output_folder = "resampledEvalSet"
    sample_images(input_folder, output_folder)