# Source: repository file view, commit 26fd00c (file size 3,882 bytes; 133 lines).
from utils import get_tsv_data_from_jsons, create_imagenet_txt_files
import csv
from io import StringIO
from tqdm import tqdm
# with image conversion
# datasets = ['/data/mshukor/data/our_albef_data/json_pretrain/vg_albef.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/sbu.json',
# ]
# output_paths = ['/data/mshukor/data/ofa/pretrain_ours/vg_albef.tsv',
# '/data/mshukor/data/ofa/pretrain_ours/sbu.tsv',
# ]
# task_types = ['caption',
# 'caption']
# start_id = 566747
# for data, task_type, output_path in zip(datasets, task_types, output_paths):
# tsvs = get_tsv_data_from_jsons([data], start_id, [task_type])
# start_id = tsvs[-1][0] + 1
# print("save tsv to:", output_path)
# with open(output_path, 'w', newline='') as f_output:
# csv_output = csv.writer(f_output, delimiter='\t')
# for t in tqdm(tsvs):
# csv_output.writerow(t)
########################################################
# without image conversion
# datasets = ['/data/mshukor/data/our_albef_data/json_pretrain/coco_karp.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/vg_albef.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/sbu.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/cc3m.json']
# start_id = 0
# task_types = ['caption',
# 'caption',
# 'caption',
# 'caption']
# tsvs = get_tsv_data_from_jsons(datasets, start_id, task_types, convert_images=False)
# output_path = '/data/mshukor/data/ofa/pretrain_ours/vision_language_4m.tsv'
# with open(output_path, 'w', newline='') as f_output:
# csv_output = csv.writer(f_output, delimiter='\t')
# for t in tqdm(tsvs):
# csv_output.writerow(t)
########################################################
# datasets = [
# '/data/mshukor/data/our_albef_data/json_pretrain/coco_karp.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/vg_albef.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/sbu.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/cc3m.json',
# ['/data/mshukor/data/refcoco/refcoco+/refs(unc).p', '/data/mshukor/data/refcoco/refcoco+/instances.json'],
# '/data/mshukor/data/our_albef_data/data/vqa_train.json',
# ]
# start_id = 0
# task_types = ['caption',
# 'caption',
# 'caption',
# 'caption',
# 'visual_grounding',
# 'qa',]
# tsvs = get_tsv_data_from_jsons(datasets, start_id, task_types, convert_images=False)
# output_path = '/data/mshukor/data/ofa/pretrain_ours/vision_language_mini.tsv'
# with open(output_path, 'w', newline='') as f_output:
# csv_output = csv.writer(f_output, delimiter='\t')
# for t in tqdm(tsvs):
# csv_output.writerow(t)
#### imagenet
path_data = '/data/mshukor/data/imagenet/val'
output_path = '/data/mshukor/data/ofa/pretrain_ours/imagenet_val.txt'
create_imagenet_txt_files(path_data, output_path)
####### object detection
from preprocess.utils import get_tsv_data_from_jsons
datasets = [
['coco', '/data/mshukor/data/coco/annotations/instances_train2014.json'],
['vg', '/data/mshukor/data/visual_genome/annotations/objects.json', '/data/mshukor/data/visual_genome/images'],
]
start_id = 0
task_types = ['detection',
'detection',]
tsvs = get_tsv_data_from_jsons(datasets, start_id, task_types, convert_images=False)
output_path = '/data/mshukor/data/ofa/pretrain_ours/detection_mini.tsv'
with open(output_path, 'w', newline='') as f_output:
csv_output = csv.writer(f_output, delimiter='\t')
for t in tqdm(tsvs):
csv_output.writerow(t) |