Spaces:
Runtime error
Runtime error
import os | |
import glob | |
import json | |
import pandas as pd | |
class DataExtractor:
    """Build tabular views of scraped image metadata and the image files on disk.

    Metadata is read from ``*.json`` files in ``json_folder_path``; image
    files are discovered as ``*.jpg`` anywhere under ``image_root_directory``.
    The two views are joined on the image id.
    """

    # Explicit column lists so that empty inputs still produce frames with the
    # expected schema (a column-less frame would make the merge raise KeyError).
    _JSON_COLUMNS = ['Class', 'id', 'Image_URL', 'Title', 'Page_URL']
    _IMAGE_COLUMNS = ['Class', 'id', 'Image_Path']

    def __init__(self, json_folder_path, image_root_directory):
        """Store the metadata folder and the root folder of the image tree.

        Args:
            json_folder_path: Directory whose ``*.json`` files are read.
            image_root_directory: Directory searched recursively for
                ``*.jpg`` files.
        """
        self.json_folder_path = json_folder_path
        self.image_root_directory = image_root_directory

    def extract_json_data(self) -> pd.DataFrame:
        """Return one row per image entry found in the JSON metadata files.

        Only files whose name ends in ``.json`` are read, and only documents
        containing both a ``'query'`` and an ``'images'`` key contribute rows.

        Returns:
            DataFrame with columns ``Class`` (the document's query), ``id``,
            ``Image_URL``, ``Title`` and ``Page_URL``. The columns are present
            even when no rows were found.

        Raises:
            KeyError: If an image entry lacks ``'Id'``, ``'url'``,
                ``'title'`` or ``'page_url'``.
        """
        rows = []
        for filename in os.listdir(self.json_folder_path):
            if not filename.endswith(".json"):
                continue
            json_path = os.path.join(self.json_folder_path, filename)
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(json_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
            if 'query' not in data or 'images' not in data:
                continue
            query = data['query']
            for image_data in data['images']:
                rows.append({
                    'Class': query,
                    'id': image_data['Id'],
                    'Image_URL': image_data['url'],
                    'Title': image_data['title'],
                    'Page_URL': image_data['page_url'],
                })
        return pd.DataFrame(rows, columns=self._JSON_COLUMNS)

    def extract_image_paths(self) -> pd.DataFrame:
        """Return one row per ``*.jpg`` file under the image root directory.

        Returns:
            DataFrame with columns ``Class`` (name of the file's parent
            directory), ``id`` (file name without extension) and
            ``Image_Path``. The columns are present even when no files match.
        """
        pattern = os.path.join(self.image_root_directory, '**', '*.jpg')
        rows = [
            {
                'Class': os.path.basename(os.path.dirname(image_file)),
                'id': os.path.splitext(os.path.basename(image_file))[0],
                'Image_Path': image_file,
            }
            for image_file in glob.glob(pattern, recursive=True)
        ]
        return pd.DataFrame(rows, columns=self._IMAGE_COLUMNS)

    def concat_data(self) -> "tuple[pd.DataFrame, pd.Series]":
        """Inner-join the JSON metadata with the on-disk image paths on ``id``.

        Both inputs carry a ``Class`` column and the join key is ``id`` only,
        so the result holds them as ``Class_x`` (from the JSON metadata) and
        ``Class_y`` (from the directory name).

        Returns:
            A ``(combined_data, paths)`` tuple, where ``paths`` is the
            ``Image_Path`` column of ``combined_data``. The paths are also
            printed to stdout.
        """
        json_data = self.extract_json_data()
        image_data = self.extract_image_paths()
        # File-name stems are always strings while JSON ids may be numeric;
        # pandas refuses to merge int and object keys, so compare both as text.
        json_data['id'] = json_data['id'].astype(str)
        image_data['id'] = image_data['id'].astype(str)
        combined_data = pd.merge(json_data, image_data, on=['id'], how='inner')
        paths = combined_data['Image_Path']
        print(paths)
        return combined_data, paths