import json
import os
import random
from collections import Counter, defaultdict
from glob import glob

import pandas as pd
import pyarrow as pa
from tqdm import tqdm

from .glossary import normalize_word

# Soft score assigned to an answer given by 0, 1, 2 or 3 annotators; any other
# count falls through to 1.0 in get_score.
_SCORE_BY_COUNT = {0: 0.0, 1: 0.3, 2: 0.6, 3: 0.9}

# A majority answer must occur at least this many times across train+val to
# receive a label.
_MIN_ANSWER_COUNT = 9

# Number of trailing rows of the val table written to ``vqav2_rest_val.arrow``.
_REST_VAL_SIZE = 1000

_COLUMNS = [
    "image",
    "questions",
    "answers",
    "answer_labels",
    "answer_scores",
    "image_id",
    "question_id",
    "split",
]


def get_score(occurences):
    """Map the number of annotators who gave an answer to a soft score.

    Counts 0, 1, 2 and 3 map to 0.0, 0.3, 0.6 and 0.9; every other value
    maps to 1.0.
    """
    return _SCORE_BY_COUNT.get(occurences, 1.0)


def _image_id_from_path(path):
    """Return the integer that follows the last ``_`` in the file's stem.

    For example ``.../COCO_val2014_000000000042.jpg`` yields ``42``.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    return int(stem.rsplit("_", 1)[-1])


def _load_json_field(path, field):
    """Load the JSON file at *path* and return its top-level *field*."""
    with open(path, "r") as fp:
        return json.load(fp)[field]


def _write_arrow_file(table, path):
    """Write *table* to *path* in the Arrow IPC file format."""
    with pa.OSFile(path, "wb") as sink:
        with pa.RecordBatchFileWriter(sink, table.schema) as writer:
            writer.write_table(table)


def path2rest(path, split, annotations, label2ans):
    """Build one output row for the image stored at *path*.

    Args:
        path: Path of the image file; the image id is parsed from its name.
        split: Split name. Splits whose name contains ``"test"`` have no
            answers, so the three answer fields are returned as empty lists.
        annotations: ``annotations[split][image_id]`` maps each question id
            to ``[question]`` (test splits) or
            ``[question, {"labels": [...], "scores": [...]}]``.
        label2ans: Sequence mapping a label index to its answer string.

    Returns:
        ``[binary, questions, answers, answer_labels, answer_scores,
        image_id, question_ids, split]``.
    """
    iid = _image_id_from_path(path)

    with open(path, "rb") as fp:
        binary = fp.read()

    per_question = annotations[split][iid]
    qids = list(per_question.keys())
    qas = list(per_question.values())
    questions = [qa[0] for qa in qas]

    if "test" in split:
        answers, answer_labels, answer_scores = [], [], []
    else:
        targets = [qa[1] for qa in qas]
        answer_labels = [target["labels"] for target in targets]
        answer_scores = [target["scores"] for target in targets]
        answers = [
            [label2ans[label] for label in labels] for labels in answer_labels
        ]

    return [binary, questions, answers, answer_labels, answer_scores, iid, qids, split]


def make_arrow(root, dataset_root):
    """Convert the VQAv2 JSON files and images under *root* into Arrow files.

    Writes ``vqav2_{train,val,test,test-dev}.arrow`` into *dataset_root*
    (created if missing), then re-reads the val file and splits it into
    ``vqav2_trainable_val.arrow`` (all rows but the last 1000) and
    ``vqav2_rest_val.arrow`` (the last 1000 rows).

    Args:
        root: Directory holding the ``v2_*`` question/annotation JSON files
            and the ``train2014``, ``val2014`` and ``test2015`` image
            directories.
        dataset_root: Output directory for the Arrow files.
    """
    # Question files are loaded before annotation files, as in the original
    # script; tuples keep the split order explicit.
    questions_by_split = [
        (
            "train",
            _load_json_field(
                f"{root}/v2_OpenEnded_mscoco_train2014_questions.json", "questions"
            ),
        ),
        (
            "val",
            _load_json_field(
                f"{root}/v2_OpenEnded_mscoco_val2014_questions.json", "questions"
            ),
        ),
        (
            "test",
            _load_json_field(
                f"{root}/v2_OpenEnded_mscoco_test2015_questions.json", "questions"
            ),
        ),
        (
            "test-dev",
            _load_json_field(
                f"{root}/v2_OpenEnded_mscoco_test-dev2015_questions.json",
                "questions",
            ),
        ),
    ]
    labelled_splits = [
        (
            "train",
            _load_json_field(
                f"{root}/v2_mscoco_train2014_annotations.json", "annotations"
            ),
        ),
        (
            "val",
            _load_json_field(
                f"{root}/v2_mscoco_val2014_annotations.json", "annotations"
            ),
        ),
    ]

    # annotations[split][image_id][question_id] starts as [question_text].
    annotations = {}
    for split, questions in questions_by_split:
        per_image = defaultdict(dict)
        for q in tqdm(questions):
            per_image[q["image_id"]][q["question_id"]] = [q["question"]]
        annotations[split] = per_image

    # Answer vocabulary: normalized majority answers that are frequent enough.
    all_major_answers = []
    for _, annots in labelled_splits:
        for q in tqdm(annots):
            all_major_answers.append(q["multiple_choice_answer"])
    all_major_answers = [normalize_word(word) for word in tqdm(all_major_answers)]

    counter = {
        k: v
        for k, v in Counter(all_major_answers).items()
        if v >= _MIN_ANSWER_COUNT
    }
    ans2label = {k: i for i, k in enumerate(counter.keys())}
    label2ans = list(counter.keys())

    # Attach {"labels", "scores"} to every annotated train/val question.
    # NOTE(review): the vocabulary keys went through normalize_word, but the
    # per-annotator answers below are looked up raw. Left unchanged because
    # normalizing here would alter the produced labels; confirm this is
    # intended.
    for split, annots in labelled_splits:
        per_image = annotations[split]
        for q in tqdm(annots):
            answer_count = Counter(answer["answer"] for answer in q["answers"])

            labels = []
            scores = []
            for answer, count in answer_count.items():
                if answer not in ans2label:
                    continue
                labels.append(ans2label[answer])
                scores.append(get_score(count))

            per_image[q["image_id"]][q["question_id"]].append(
                {"labels": labels, "scores": scores}
            )

    # Drop questions with no in-vocabulary answer, then images left with no
    # questions. The length guard also drops a question that never received
    # an annotation entry instead of failing on the missing element.
    for split in ["train", "val"]:
        filtered_annot = {}
        for image_id, image_questions in annotations[split].items():
            kept = {
                qid: qa
                for qid, qa in image_questions.items()
                if len(qa) > 1 and qa[1]["labels"]
            }
            if kept:
                filtered_annot[image_id] = kept
        annotations[split] = filtered_annot

    # Both test splits read their images from the same directory.
    image_dirs = {
        "train": "train2014",
        "val": "val2014",
        "test": "test2015",
        "test-dev": "test2015",
    }

    os.makedirs(dataset_root, exist_ok=True)

    for split in ["train", "val", "test", "test-dev"]:
        annot = annotations[split]
        split_name = image_dirs[split]

        paths = list(glob(f"{root}/{split_name}/*.jpg"))
        random.shuffle(paths)  # unseeded: row order differs between runs
        annot_paths = [path for path in paths if _image_id_from_path(path) in annot]

        if len(paths) == len(annot_paths):
            print("all images have caption annotations")
        else:
            print("not all images have caption annotations")
        print(len(paths), len(annot_paths), len(annot))

        # Every image of the split is read into memory before writing.
        bs = [
            path2rest(path, split, annotations, label2ans)
            for path in tqdm(annot_paths)
        ]
        dataframe = pd.DataFrame(bs, columns=_COLUMNS)
        table = pa.Table.from_pandas(dataframe)
        _write_arrow_file(table, f"{dataset_root}/vqav2_{split}.arrow")

    # Split the val file. The memory map stays open until both derived files
    # are written and is then closed by the context manager.
    with pa.memory_map(f"{dataset_root}/vqav2_val.arrow", "r") as source:
        pdtable = pa.ipc.RecordBatchFileReader(source).read_all().to_pandas()

        trainable_val = pa.Table.from_pandas(pdtable[:-_REST_VAL_SIZE])
        rest_val = pa.Table.from_pandas(pdtable[-_REST_VAL_SIZE:])

        _write_arrow_file(trainable_val, f"{dataset_root}/vqav2_trainable_val.arrow")
        _write_arrow_file(rest_val, f"{dataset_root}/vqav2_rest_val.arrow")