# File: UnIVAL/preprocess/.ipynb_checkpoints/create_tsv_files-checkpoint.py
# Repository page metadata: mshukor, "init" (26fd00c), 3.88 kB
from utils import get_tsv_data_from_jsons, create_imagenet_txt_files
import csv
from io import StringIO
from tqdm import tqdm
# with image conversion
# datasets = ['/data/mshukor/data/our_albef_data/json_pretrain/vg_albef.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/sbu.json',
# ]
# output_paths = ['/data/mshukor/data/ofa/pretrain_ours/vg_albef.tsv',
# '/data/mshukor/data/ofa/pretrain_ours/sbu.tsv',
# ]
# task_types = ['caption',
# 'caption']
# start_id = 566747
# for data, task_type, output_path in zip(datasets, task_types, output_paths):
# tsvs = get_tsv_data_from_jsons([data], start_id, [task_type])
# start_id = tsvs[-1][0] + 1
# print("save tsv to:", output_path)
# with open(output_path, 'w', newline='') as f_output:
# csv_output = csv.writer(f_output, delimiter='\t')
# for t in tqdm(tsvs):
# csv_output.writerow(t)
########################################################
# without image conversion
# datasets = ['/data/mshukor/data/our_albef_data/json_pretrain/coco_karp.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/vg_albef.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/sbu.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/cc3m.json']
# start_id = 0
# task_types = ['caption',
# 'caption',
# 'caption',
# 'caption']
# tsvs = get_tsv_data_from_jsons(datasets, start_id, task_types, convert_images=False)
# output_path = '/data/mshukor/data/ofa/pretrain_ours/vision_language_4m.tsv'
# with open(output_path, 'w', newline='') as f_output:
# csv_output = csv.writer(f_output, delimiter='\t')
# for t in tqdm(tsvs):
# csv_output.writerow(t)
########################################################
# datasets = [
# '/data/mshukor/data/our_albef_data/json_pretrain/coco_karp.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/vg_albef.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/sbu.json',
# '/data/mshukor/data/our_albef_data/json_pretrain/cc3m.json',
# ['/data/mshukor/data/refcoco/refcoco+/refs(unc).p', '/data/mshukor/data/refcoco/refcoco+/instances.json'],
# '/data/mshukor/data/our_albef_data/data/vqa_train.json',
# ]
# start_id = 0
# task_types = ['caption',
# 'caption',
# 'caption',
# 'caption',
# 'visual_grounding',
# 'qa',]
# tsvs = get_tsv_data_from_jsons(datasets, start_id, task_types, convert_images=False)
# output_path = '/data/mshukor/data/ofa/pretrain_ours/vision_language_mini.tsv'
# with open(output_path, 'w', newline='') as f_output:
# csv_output = csv.writer(f_output, delimiter='\t')
# for t in tqdm(tsvs):
# csv_output.writerow(t)
#### imagenet
# Hand the ImageNet validation directory and the destination .txt path to
# create_imagenet_txt_files (imported from utils; what it writes is defined
# there, not in this script).
path_data, output_path = (
    '/data/mshukor/data/imagenet/val',
    '/data/mshukor/data/ofa/pretrain_ours/imagenet_val.txt',
)
create_imagenet_txt_files(path_data, output_path)
####### object detection
# Build the rows for the detection TSV from the COCO and Visual Genome
# annotation files, then dump them tab-separated to `output_path`.
#
# NOTE(review): get_tsv_data_from_jsons is already imported from `utils` at
# the top of the file; this statement rebinds the same name from
# `preprocess.utils`, which needs the `preprocess` package to be importable.
# Confirm which module path is intended.
from preprocess.utils import get_tsv_data_from_jsons

# One entry per source: a dataset tag followed by its path(s). The exact
# layout expected for each entry is defined by get_tsv_data_from_jsons.
datasets = [
    ['coco', '/data/mshukor/data/coco/annotations/instances_train2014.json'],
    [
        'vg',
        '/data/mshukor/data/visual_genome/annotations/objects.json',
        '/data/mshukor/data/visual_genome/images',
    ],
]
start_id = 0
# Every source listed above is processed with the same task type.
task_types = ['detection'] * len(datasets)

tsvs = get_tsv_data_from_jsons(datasets, start_id, task_types, convert_images=False)

output_path = '/data/mshukor/data/ofa/pretrain_ours/detection_mini.tsv'
# newline='' leaves line-ending handling to the csv writer.
with open(output_path, 'w', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    # tqdm wraps the row iterable so progress is reported while rows are written.
    tsv_writer.writerows(tqdm(tsvs))