# File size: 2,036 Bytes
# 2519bba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import glob
import json
import pandas as pd

class DataExtractor:
    """Pair image metadata stored in JSON files with image files on disk.

    The JSON folder is scanned (non-recursively) for ``*.json`` files and the
    image root directory is scanned recursively for ``*.jpg`` files. The two
    result sets are joined on the ``id`` column, where an image's id is its
    file name without the extension.
    """

    # Fixed column layouts, so that empty results still carry the columns
    # (notably 'id') that concat_data() needs for the merge.
    _JSON_COLUMNS = ['Class', 'id', 'Image_URL', 'Title', 'Page_URL']
    _IMAGE_COLUMNS = ['Class', 'id', 'Image_Path']

    def __init__(self, json_folder_path, image_root_directory):
        """Store the two locations to scan.

        Args:
            json_folder_path: Directory holding the ``*.json`` metadata files.
            image_root_directory: Root directory searched recursively for
                ``*.jpg`` images.
        """
        self.json_folder_path = json_folder_path
        self.image_root_directory = image_root_directory

    def extract_json_data(self) -> pd.DataFrame:
        """Read every ``*.json`` file in the JSON folder into one table.

        Files whose top-level object lacks a ``'query'`` or ``'images'`` key
        are skipped. Each entry of ``'images'`` becomes one row.

        Returns:
            DataFrame with columns ``Class`` (the file's ``query`` value),
            ``id``, ``Image_URL``, ``Title`` and ``Page_URL``. The columns are
            present even when no rows were found.

        Raises:
            KeyError: If an image entry lacks ``'Id'``, ``'url'``, ``'title'``
                or ``'page_url'``.
            json.JSONDecodeError: If a ``*.json`` file is not valid JSON.
        """
        rows = []
        # os.listdir() returns entries in arbitrary order; sort so the row
        # order is reproducible between runs and platforms.
        for filename in sorted(os.listdir(self.json_folder_path)):
            if not filename.endswith(".json"):
                continue
            json_path = os.path.join(self.json_folder_path, filename)
            # JSON text is UTF-8; do not depend on the locale's default codec.
            with open(json_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
            if 'query' not in data or 'images' not in data:
                continue
            query = data['query']
            rows.extend(
                {
                    'Class': query,
                    'id': image_data['Id'],
                    'Image_URL': image_data['url'],
                    'Title': image_data['title'],
                    'Page_URL': image_data['page_url'],
                }
                for image_data in data['images']
            )
        return pd.DataFrame(rows, columns=self._JSON_COLUMNS)

    def extract_image_paths(self) -> pd.DataFrame:
        """List every ``*.jpg`` file found under the image root directory.

        Returns:
            DataFrame with columns ``Class`` (name of the directory that
            directly contains the file), ``id`` (file name without extension)
            and ``Image_Path`` (path as returned by ``glob``). The columns are
            present even when no images were found.
        """
        pattern = os.path.join(self.image_root_directory, '**', '*.jpg')
        rows = [
            {
                'Class': os.path.basename(os.path.dirname(image_file)),
                'id': os.path.splitext(os.path.basename(image_file))[0],
                'Image_Path': image_file,
            }
            # glob order is arbitrary as well; sort for reproducible output.
            for image_file in sorted(glob.glob(pattern, recursive=True))
        ]
        return pd.DataFrame(rows, columns=self._IMAGE_COLUMNS)

    def concat_data(self):
        """Inner-join the JSON metadata with the image files on ``id``.

        Because the join key is ``id`` only, the ``Class`` column of each
        side is kept separately as ``Class_x`` (from JSON) and ``Class_y``
        (from the directory name).

        Returns:
            A ``(combined_data, paths)`` tuple: the merged DataFrame and its
            ``Image_Path`` column as a Series. Both are empty when nothing
            matches.
        """
        json_data = self.extract_json_data()
        image_data = self.extract_image_paths()

        # Ids derived from file names are always strings, whereas JSON ids may
        # be numbers; compare them as text so the merge neither raises on
        # mismatched dtypes nor silently fails to match.
        json_data['id'] = json_data['id'].astype(str)

        combined_data = pd.merge(json_data, image_data, on=['id'], how='inner')
        paths = combined_data['Image_Path']
        print(paths)
        return combined_data, paths