"""Split a combined COCO caption file into per-subset annotation files.

Reads ``dataset_coco.json``, groups its images and captions by the ``split``
field of each image record (``restval`` is merged into ``train``), and writes
one JSON file per subset listed in ``subsets`` into ``savepath``.

Run as a script to perform the conversion; importing the module only defines
the helpers and the configuration names below.
"""
import json
import os
from collections import defaultdict

# Combined caption file that is read as input.
dataset_file = "mscoco_dataset/new_annotations/dataset_coco.json"

# Subsets that get an output file; records with any other split are grouped
# but never written.
subsets = ['train', 'val', 'test']

# Directory the per-subset files are written to.
savepath = "mscoco_dataset/new_annotations"

# Output file name for each subset.
savename = {
    'train': "captions_train113k.json",
    'val': "captions_val5k.json",
    'test': "captions_test5k.json",
}


def split_annotations(dataset):
    """Group image and caption records of *dataset* by subset.

    Args:
        dataset: Mapping with an ``'images'`` list. Each entry must provide
            ``'filepath'``, ``'filename'``, ``'split'`` and ``'sentences'``;
            each sentence must provide ``'sentid'`` and ``'raw'``.

    Returns:
        A pair ``(imagefields, annotationsfields)`` of ``defaultdict(list)``
        keyed by subset name. Subsets without records yield an empty list on
        lookup.

    Raises:
        KeyError: If a required key is missing from a record.
        ValueError: If the numeric part of a file name cannot be parsed.
    """
    imagefields = defaultdict(list)
    annotationsfields = defaultdict(list)

    for imagecaps in dataset['images']:
        filename = imagecaps['filename']
        # The image id is the integer after the last underscore of the part
        # of the file name that precedes the first dot.
        image_id = int(filename.split(".")[0].split('_')[-1])

        split = imagecaps['split']
        if split == 'restval':
            # 'restval' records are counted as training data.
            split = 'train'

        imagefields[split].append({
            "file_name": filename,
            "file_path": imagecaps['filepath'],
            "id": image_id,
        })
        annotationsfields[split].extend(
            {
                "image_id": image_id,
                "id": sen["sentid"],
                "caption": sen["raw"],
            }
            for sen in imagecaps['sentences']
        )

    return imagefields, annotationsfields


def main(source=dataset_file, out_dir=savepath):
    """Convert *source* into one annotation file per subset inside *out_dir*.

    Args:
        source: Path of the combined caption JSON file to read.
        out_dir: Directory for the output files; created if it is missing.
    """
    # Context managers guarantee the handles are closed (and output flushed)
    # even if parsing or serialisation raises.
    with open(source, encoding="utf-8") as infile:
        original_json = json.load(infile)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)

    imagefields, annotationsfields = split_annotations(original_json)

    for subset in subsets:
        data = {
            "images": imagefields[subset],
            "annotations": annotationsfields[subset],
        }
        target = os.path.join(out_dir, savename[subset])
        with open(target, "w", encoding="utf-8") as outfile:
            json.dump(data, outfile)


if __name__ == "__main__":
    main()