"""Build a chat-style image Q&A dataset from Met Museum metadata and save
train/test/val splits as Hugging Face Arrow datasets."""

import os
import shutil
from io import BytesIO

import pandas as pd
import requests
from datasets import Dataset
from PIL import Image

def modify_dataframe_and_extract_data(df):
    """Turn each row's Question/Answer columns into a chat-style message list."""
    data_list = []
    for _, row in df.iterrows():
        messages = []
        for i in range(1, 5):
            user_question = row[f'Question{i}']
            user_answer = row[f'Answer{i}']
            # Skip missing/empty questions; pandas stores missing cells as NaN, which is truthy.
            if pd.notna(user_question) and user_question:
                message_content = [{'index': None, 'text': user_question, 'type': 'text'}]
                # Attach the image placeholder to the first user turn only.
                if i == 1:
                    message_content.append({'index': 0, 'text': None, 'type': 'image'})
                messages.append({'content': message_content, 'role': 'user'})
                if pd.notna(user_answer) and user_answer:
                    messages.append({'content': [{'index': None, 'text': user_answer, 'type': 'text'}], 'role': 'assistant'})
        image = Image.open(row['imagePath'])
        data_list.append({'messages': messages, 'images': [image]})
    return {'messages': [data['messages'] for data in data_list], 'images': [data['images'] for data in data_list]}

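# For reference, a row with two filled-in Q&A pairs yields a `messages` list like
# the one below (the question/answer strings are illustrative, not from the dataset):
#
# [
#     {'content': [{'index': None, 'text': 'What is depicted?', 'type': 'text'},
#                  {'index': 0, 'text': None, 'type': 'image'}], 'role': 'user'},
#     {'content': [{'index': None, 'text': 'A bronze vase.', 'type': 'text'}], 'role': 'assistant'},
#     {'content': [{'index': None, 'text': 'Which period is it from?', 'type': 'text'}], 'role': 'user'},
#     {'content': [{'index': None, 'text': 'Late Hellenistic.', 'type': 'text'}], 'role': 'assistant'},
# ]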

def download_and_resize_images(df, image_dir, target_size=(250, 250)):
    """Download each row's primary image, resize it, and save it under image_dir."""
    image_paths = []
    for _, row in df.iterrows():
        image_url = row['primaryImageLink']
        object_id = row['objectID']
        if pd.notna(image_url) and image_url:
            # Name the file after the object ID so it can be joined back to its row
            filename = os.path.join(image_dir, f"{object_id}.jpg")
            # Download the image from the URL
            response = requests.get(image_url, timeout=30)
            if response.status_code == 200:
                # Open with PIL, resize, and convert to RGB so it saves cleanly as JPEG
                image = Image.open(BytesIO(response.content))
                image = image.resize(target_size).convert('RGB')
                image.save(filename)
                image_paths.append(filename)
            else:
                print(f"Failed to download image from {image_url}")
                image_paths.append(None)
        else:
            image_paths.append(None)
    return image_paths

def split_data_dict(data_dict, train_ratio=0.7, test_ratio=0.2, val_ratio=0.1):
    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is not exactly 1.0 in floating point.
    assert abs(train_ratio + test_ratio + val_ratio - 1.0) < 1e-9, "Ratios must sum up to 1.0"

    total_samples = len(data_dict['messages'])
    train_size = int(total_samples * train_ratio)
    test_size = int(total_samples * test_ratio)

    train_data_dict = {
        'messages': data_dict['messages'][:train_size],
        'images': data_dict['images'][:train_size]
    }
    test_data_dict = {
        'messages': data_dict['messages'][train_size:train_size + test_size],
        'images': data_dict['images'][train_size:train_size + test_size]
    }
    # Validation takes the remainder so the splits neither overlap nor drop rows.
    val_data_dict = {
        'messages': data_dict['messages'][train_size + test_size:],
        'images': data_dict['images'][train_size + test_size:]
    }

    return train_data_dict, test_data_dict, val_data_dict

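# Example: assuming the sampled CSV below holds 250 rows and the 0.6/0.2/0.2 ratios
# used in __main__, the splits come out to 150 train, 50 test, and 50 validation rows.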

def save_data_dict_as_arrow(data_dict, file_path):
    # Convert the dictionary to a Hugging Face Dataset object
    dataset = Dataset.from_dict(data_dict)

    # save_to_disk writes the dataset as a directory of Arrow files at file_path
    dataset.save_to_disk(file_path)

if __name__ == "__main__":
    # Example usage:

    # df = pd.read_csv("/data/data_set_metmuseum.csv")
    # df1 = df[['objectID', 'primaryImageLink', 'Question1', 'Answer1', 'Question2', 'Answer2', 'Question3', 'Answer3', 'Question4', 'Answer4']]
    # df2 = df1.sample(frac=1)
    # df3 = df2.head(250)

    # df4 = df3.copy()

    df4 = pd.read_csv("sampled_data250.csv")
    paths = ['input_dataset', os.path.join('input_dataset', 'images'), 'output_dataset']
    for path in paths:
        os.makedirs(path, exist_ok=True)

    # Call the function to download and resize images
    image_dir = 'input_dataset/images'
    # image_paths = download_and_resize_images(df4, image_dir)

    # Update the DataFrame with the resized image paths
    # new_df = df4.copy()  # Create a copy of the original DataFrame
    # df4['imagePath'] = image_paths  # Add a new column 'imagePath' containing the resized image paths
    # df4 = df4.drop(['primaryImageLink'], axis=1)

    # Call the function to modify the DataFrame and extract data
    data_dict = modify_dataframe_and_extract_data(df4)
    # split data_dict into train, test, valid
    train_data_dict, test_data_dict, val_data_dict = split_data_dict(data_dict, train_ratio=0.6, test_ratio=0.2, val_ratio=0.2)

    # save these as arrow dataset
    save_data_dict_as_arrow(train_data_dict, os.path.join('output_dataset', 'train.arrow'))
    save_data_dict_as_arrow(test_data_dict, os.path.join('output_dataset', 'test.arrow'))
    save_data_dict_as_arrow(val_data_dict, os.path.join('output_dataset', 'val.arrow'))

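    # If needed later, the three saved splits can be reloaded together as a single
    # DatasetDict (sketch; the directory names assume the save calls above):
    # from datasets import DatasetDict
    # splits = DatasetDict({
    #     name: Dataset.load_from_disk(os.path.join('output_dataset', f'{name}.arrow'))
    #     for name in ('train', 'test', 'val')
    # })
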
    # save the input and output directories as zip archives
    shutil.make_archive("input_dataset", "zip", "input_dataset")
    shutil.make_archive("output_dataset", "zip", "output_dataset")
    
    # read arrow from disk
    test_data = Dataset.load_from_disk("output_dataset/test.arrow")
    print(test_data)