File size: 6,583 Bytes
59b2a81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
'''
    This script prepares the Bridge V2 dataset: it copies the "images0" folder of
    every trajectory into numbered train / test folders.
'''
import os, sys, shutil



def _iter_trajectory_dirs(dataset_path):
    """Yield every trajectory folder below ``dataset_path`` in sorted order.

    The walk follows the layout
    ``<scene>/<task>/<order>/<time_clock>/raw/traj_group0/<traj>``.
    """
    for scene_name in sorted(os.listdir(dataset_path)):
        print("We are reading scene ", scene_name)
        scene_dir = os.path.join(dataset_path, scene_name)

        for task_name in sorted(os.listdir(scene_dir)):
            task_dir = os.path.join(scene_dir, task_name)

            for order_name in sorted(os.listdir(task_dir)):
                order_dir = os.path.join(task_dir, order_name)

                for time_clock in sorted(os.listdir(order_dir)):
                    if time_clock == "lmdb":
                        continue    # Skip lmdb folder

                    time_dir = os.path.join(order_dir, time_clock, "raw", "traj_group0")
                    if not os.path.exists(time_dir):
                        print("time_dir does not exist for ", time_dir)
                        continue

                    for traj_name in sorted(os.listdir(time_dir)):
                        traj_path = os.path.join(time_dir, traj_name)
                        if not os.path.isdir(traj_path):
                            print("traj_path does not exist for ", traj_path)
                            continue
                        yield traj_path


def _has_sequential_frames(folder):
    """Return True when the n entries of ``folder`` are exactly im_0.jpg .. im_{n-1}.jpg."""
    frame_count = len(os.listdir(folder))
    return all(
        os.path.exists(os.path.join(folder, 'im_' + str(check_idx) + '.jpg'))
        for check_idx in range(frame_count)
    )


def read_bridge_v2(dataset_path, train_store_path, test_store_path, test_dataset_lists, copyfile=True):
    """Collect the "images0" folder of every usable trajectory and optionally copy it.

    A trajectory is considered only when it contains "policy_out.pkl", "lang.txt"
    and an "images0" directory (only one camera angle is used).

    Args:
        dataset_path: Root folder of the raw dataset.
        train_store_path: Folder that receives the training samples.
        test_store_path: Folder that receives the test samples.
        test_dataset_lists: Iterable of image-folder paths, relative to
            ``dataset_path``, that belong to the test set.
        copyfile: When True (the usual case) each image folder is copied to
            ``<store>/<idx>`` together with "policy_out.pkl" and "lang.txt", and
            copies that fail the frame sanity check are removed again. When
            False nothing is copied and no sanity check is run.

    Returns:
        List of the source image-folder paths that were accepted. Entry ``i``
        corresponds to the destination folder named ``str(i)``.
    """
    # Build the lookup once; membership is tested for every trajectory.
    test_rel_paths = set(test_dataset_lists)

    start_idx = 0
    target_lists = []

    for traj_path in _iter_trajectory_dirs(dataset_path):

        # policy_out.pkl is copied along as well; just in case there is valuable information there
        policy_out_file_path = os.path.join(traj_path, "policy_out.pkl")
        if not os.path.exists(policy_out_file_path):
            continue

        # Check the lang txt file
        lang_txt_file_path = os.path.join(traj_path, "lang.txt")
        if not os.path.exists(lang_txt_file_path):
            continue

        # Only consider one camera angle
        img_folder_path = os.path.join(traj_path, "images0")
        if not os.path.lexists(img_folder_path):
            continue
        if not os.path.isdir(img_folder_path):
            print("img_folder_path does not exist for ", img_folder_path)
            continue

        if copyfile:
            # relpath stays correct even if dataset_path ends with a separator,
            # unlike slicing by len(dataset_path) + 1.
            rel_path = os.path.relpath(img_folder_path, dataset_path)
            print("img_folder_path[prefix_len:] is ", rel_path)

            # Store to the test set when listed there, otherwise to the training set
            store_root = test_store_path if rel_path in test_rel_paths else train_store_path
            target_dir = os.path.join(store_root, str(start_idx))

            # Now we can copy the folder to our destination (log the real destination)
            print("Copy " + str(img_folder_path) + " to " + str(target_dir))
            shutil.copytree(img_folder_path, target_dir)

            # Sanity check: frames must exist sequentially
            if not _has_sequential_frames(target_dir):
                # Remove the copy and go on without updating start_idx or the result list
                print("This file cannot pass the sanity check. We will remove it!")
                shutil.rmtree(target_dir)
                continue

            # Copy the other auxiliary files
            shutil.copy(policy_out_file_path, os.path.join(target_dir, "policy_out.pkl"))
            shutil.copy(lang_txt_file_path, os.path.join(target_dir, "lang.txt"))

        # Record only accepted folders so the list stays aligned with the indices
        target_lists.append(img_folder_path)
        start_idx += 1

    print("We have ", start_idx)

    # Return a list of file path
    return target_lists



if __name__ == "__main__":
    # Script entry point: rebuild the train / test folders from the raw dataset.
    dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v2"
    train_store_path = "../sanity_check/bridge_v2_raw"
    test_store_path = "../sanity_check/bridge_v2_test_raw"
    test_dataset_predefined_path = "test_path_v2.txt"


    # Start from empty output folders: any previous content is deleted first
    for store_path in (train_store_path, test_store_path):
        if os.path.exists(store_path):
            shutil.rmtree(store_path)
        os.makedirs(store_path)

    # Read Test dataset path (one relative path per line).
    # Strip only the newline so a last line without a trailing newline keeps
    # its final character, and make sure the file handle is closed.
    with open(test_dataset_predefined_path, "r") as read_file:
        test_dataset_lists = [line.rstrip("\n") for line in read_file if line.strip()]
    print("test_dataset_lists is ", test_dataset_lists)


    read_bridge_v2(dataset_path, train_store_path, test_store_path, test_dataset_lists)