import os
import glob
import json

import pandas as pd


class DataExtractor:
    """Join image metadata stored in JSON files with image files on disk.

    The JSON side is read from the ``*.json`` files directly inside
    ``json_folder_path``; the image side is every ``*.jpg`` found
    recursively under ``image_root_directory``. The two are matched on
    ``id``: the JSON ``Id`` field versus the image file name without its
    extension.
    """

    # Fixed schemas so that empty inputs still yield frames carrying the
    # columns ``concat_data`` merges on and reads from.
    _JSON_COLUMNS = ['Class', 'id', 'Image_URL', 'Title', 'Page_URL']
    _IMAGE_COLUMNS = ['Class', 'id', 'Image_Path']

    def __init__(self, json_folder_path, image_root_directory):
        """Store the two input locations; no I/O happens here.

        Args:
            json_folder_path: Directory containing the ``*.json`` files.
            image_root_directory: Root directory searched recursively for
                ``*.jpg`` files.
        """
        self.json_folder_path = json_folder_path
        self.image_root_directory = image_root_directory

    def extract_json_data(self):
        """Return one row per image entry found in the JSON files.

        Only files whose name ends with ``.json`` are read, and only
        documents that are JSON objects containing both a ``query`` and an
        ``images`` key contribute rows; anything else is skipped.

        Returns:
            A DataFrame with columns ``Class`` (the document's ``query``),
            ``id``, ``Image_URL``, ``Title`` and ``Page_URL``. The columns
            are present even when no rows were found.

        Raises:
            KeyError: If an image entry lacks ``Id``, ``url``, ``title`` or
                ``page_url``.
            json.JSONDecodeError: If a ``.json`` file is not valid JSON.
        """
        records = []
        # os.listdir() order is arbitrary; sort for reproducible row order.
        for filename in sorted(os.listdir(self.json_folder_path)):
            if not filename.endswith(".json"):
                continue
            json_path = os.path.join(self.json_folder_path, filename)
            # JSON is UTF-8 by specification; do not rely on the locale.
            with open(json_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
            if not isinstance(data, dict):
                continue
            if 'query' not in data or 'images' not in data:
                continue
            query = data['query']
            records.extend(
                {
                    'Class': query,
                    'id': image_data['Id'],
                    'Image_URL': image_data['url'],
                    'Title': image_data['title'],
                    'Page_URL': image_data['page_url'],
                }
                for image_data in data['images']
            )
        return pd.DataFrame(records, columns=self._JSON_COLUMNS)

    def extract_image_paths(self):
        """Return one row per ``*.jpg`` file under the image root.

        Returns:
            A DataFrame with columns ``Class`` (name of the directory that
            directly contains the file), ``id`` (file name without its
            extension) and ``Image_Path`` (the path as returned by
            ``glob``). The columns are present even when no file matched.
        """
        pattern = os.path.join(self.image_root_directory, '**', '*.jpg')
        # glob order is arbitrary as well; sort for reproducible output.
        image_files = sorted(glob.glob(pattern, recursive=True))
        records = [
            {
                'Class': os.path.basename(os.path.dirname(image_file)),
                'id': os.path.splitext(os.path.basename(image_file))[0],
                'Image_Path': image_file,
            }
            for image_file in image_files
        ]
        return pd.DataFrame(records, columns=self._IMAGE_COLUMNS)

    def concat_data(self):
        """Inner-join JSON metadata with image paths on ``id``.

        Both inputs carry a ``Class`` column that is not part of the join
        key, so pandas' default suffixes apply: the result has ``Class_x``
        (from the JSON ``query``) and ``Class_y`` (from the image folder).

        Returns:
            A tuple ``(combined_data, paths)`` where ``combined_data`` is
            the merged DataFrame and ``paths`` is its ``Image_Path`` column.
            ``paths`` is also printed to stdout.
        """
        json_data = self.extract_json_data()
        image_data = self.extract_image_paths()
        # File-name ids are always strings while JSON ids may be numbers;
        # merging int64 against string keys raises in pandas, so compare
        # both sides as text. ``assign`` leaves the extracted frames intact.
        json_data = json_data.assign(id=json_data['id'].astype(str))
        image_data = image_data.assign(id=image_data['id'].astype(str))
        combined_data = pd.merge(json_data, image_data, on=['id'], how='inner')
        paths = combined_data['Image_Path']
        print(paths)
        return combined_data, paths