import json
import os
from collections import defaultdict

# COCO caption annotations with Karpathy-style train/val/test/restval splits.
with open("mscoco_dataset/new_annotations/dataset_coco.json") as f:
    original_json = json.load(f)

subsets = ['train', 'val', 'test']
savepath = "mscoco_dataset/new_annotations"

if not os.path.exists(savepath):
    os.makedirs(savepath)

savename = {
    'train': "captions_train113k.json",
    'val': "captions_val5k.json",
    'test': "captions_test5k.json",
}

# COCO-style "images" and "annotations" lists, accumulated per split.
imagefields = defaultdict(list)
annotationsfields = defaultdict(list)

for imagecaps in original_json['images']:
    filepath = imagecaps['filepath']
    filename = imagecaps['filename']
    # COCO filenames end in the zero-padded image id,
    # e.g. "COCO_train2014_000000123456.jpg".
    image_id = int(filename.split(".")[0].split('_')[-1])
    split = imagecaps['split']
    # Fold the "restval" images into the training split.
    if split == 'restval':
        split = 'train'
    imagefields[split].append({
        "file_name": filename,
        "file_path": filepath,
        "id": image_id
    })
    for sen in imagecaps['sentences']:
        annotationsfields[split].append({
            "image_id": image_id,
            "id": sen["sentid"],
            "caption": sen["raw"],
        })

for subset in subsets:
    data = {
        "images": imagefields[subset],
        "annotations": annotationsfields[subset]
    }
    with open(os.path.join(savepath, savename[subset]), "w") as f:
        json.dump(data, f)
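
# Optional sanity check (a minimal sketch, not required by the conversion itself):
# reload each file that was just written and print its image/caption counts, so the
# 113k/5k/5k split sizes implied by the filenames can be verified at a glance.
for subset in subsets:
    with open(os.path.join(savepath, savename[subset])) as f:
        written = json.load(f)
    print(f"{subset}: {len(written['images'])} images, "
          f"{len(written['annotations'])} captions")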