captions = [] urls = [] with open('Train_GCC-training.tsv') as fp: for cnt, line in enumerate(fp): s = line.split('\t') captions.append(s[0].split(' ')) urls.append(s[1][:-1]) valids = set([]) with open('train_valid.txt') as fp: for cnt, line in enumerate(fp): valids.add(line[:-1]) import json with open('train.json', 'w') as outfile: for cnt, (cap, url) in enumerate(zip(captions, urls)): im = "{:08d}.jpg".format(cnt) if (im in valids): d = {'image':"train_image.zip@/{}".format(im), 'caption':cap} json.dump(d, outfile) outfile.write('\n') import json with open('train_frcnn.json', 'w') as outfile: for cnt, (cap, url) in enumerate(zip(captions, urls)): im = "{:08d}.jpg".format(cnt) if (im in valids): d = {'image':"train_image.zip@/{}".format(im), 'caption':cap, 'frcnn':"train_frcnn.zip@/{:08d}.json".format(cnt)} json.dump(d, outfile) outfile.write('\n')