FaceNet / data_extractor.py
mdirshad09's picture
Upload 8 files
2519bba
import os
import glob
import json
import pandas as pd
class DataExtractor:
    """Join image metadata stored in JSON files with image files on disk.

    Metadata is read from every ``*.json`` file directly inside
    ``json_folder_path``; image files are discovered by recursively
    searching ``image_root_directory`` for ``*.jpg`` files.
    """

    # Fixed column layouts so that the frames keep their schema (and can be
    # merged) even when no rows were found.
    _JSON_COLUMNS = ['Class', 'id', 'Image_URL', 'Title', 'Page_URL']
    _IMAGE_COLUMNS = ['Class', 'id', 'Image_Path']

    def __init__(self, json_folder_path, image_root_directory):
        """Store the metadata folder and the root folder of the images.

        Args:
            json_folder_path: Directory containing the ``*.json`` files.
            image_root_directory: Directory searched recursively for
                ``*.jpg`` files.
        """
        self.json_folder_path = json_folder_path
        self.image_root_directory = image_root_directory

    def extract_json_data(self) -> "pd.DataFrame":
        """Flatten the per-image entries of all JSON files into a DataFrame.

        A JSON file contributes rows only if it has both a ``'query'`` and
        an ``'images'`` key; each element of ``'images'`` becomes one row
        with the file's ``'query'`` value as ``Class``.

        Returns:
            DataFrame with columns ``Class``, ``id``, ``Image_URL``,
            ``Title`` and ``Page_URL`` (empty, but with these columns, when
            nothing was found).

        Raises:
            KeyError: If an image entry lacks ``'Id'``, ``'url'``,
                ``'title'`` or ``'page_url'``.
        """
        rows = []
        # Sorted so the row order does not depend on the file system.
        for filename in sorted(os.listdir(self.json_folder_path)):
            if not filename.endswith(".json"):
                continue
            json_path = os.path.join(self.json_folder_path, filename)
            # Binary mode lets json.load detect the UTF-8/16/32 encoding
            # itself instead of relying on the platform's default codec.
            with open(json_path, 'rb') as json_file:
                data = json.load(json_file)
            if 'query' not in data or 'images' not in data:
                continue
            query = data['query']
            rows.extend(
                {
                    'Class': query,
                    'id': image_data['Id'],
                    'Image_URL': image_data['url'],
                    'Title': image_data['title'],
                    'Page_URL': image_data['page_url'],
                }
                for image_data in data['images']
            )
        return pd.DataFrame(rows, columns=self._JSON_COLUMNS)

    def extract_image_paths(self) -> "pd.DataFrame":
        """List every ``*.jpg`` file below the image root directory.

        ``Class`` is the name of the directory that directly contains the
        file and ``id`` is the file name without its extension.

        Returns:
            DataFrame with columns ``Class``, ``id`` and ``Image_Path``
            (empty, but with these columns, when no image was found).
        """
        pattern = os.path.join(self.image_root_directory, '**', '*.jpg')
        # Sorted so the row order does not depend on the file system.
        rows = [
            {
                'Class': os.path.basename(os.path.dirname(image_file)),
                'id': os.path.splitext(os.path.basename(image_file))[0],
                'Image_Path': image_file,
            }
            for image_file in sorted(glob.glob(pattern, recursive=True))
        ]
        return pd.DataFrame(rows, columns=self._IMAGE_COLUMNS)

    def concat_data(self) -> "tuple[pd.DataFrame, pd.Series]":
        """Inner-join the JSON metadata with the image files on ``id``.

        Because both inputs carry a ``Class`` column and the join key is
        only ``id``, the result contains ``Class_x`` (from the JSON data)
        and ``Class_y`` (from the directory name).

        Returns:
            A tuple ``(combined_data, paths)`` where ``combined_data`` is
            the merged DataFrame and ``paths`` is its ``Image_Path`` column.
        """
        json_data = self.extract_json_data()
        image_data = self.extract_image_paths()
        # File-name ids are always strings while JSON ids may be numbers;
        # compare both as strings so the merge neither raises on mismatched
        # dtypes nor silently misses matches.
        json_data = json_data.assign(id=json_data['id'].astype(str))
        image_data = image_data.assign(id=image_data['id'].astype(str))
        combined_data = pd.merge(json_data, image_data, on=['id'], how='inner')
        paths = combined_data['Image_Path']
        print(paths)
        return combined_data, paths