# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=line-too-long
r"""Implements the OKVQA dataset for TFDS.
Download the required files from https://okvqa.allenai.org/download.html:
mkdir -p /tmp/tfds
cd /tmp/tfds/
wget http://images.cocodataset.org/zips/train2014.zip
wget http://images.cocodataset.org/zips/val2014.zip
wget https://okvqa.allenai.org/static/data/mscoco_train2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/mscoco_val2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_train2014_questions.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_val2014_questions.json.zip
unzip val2014.zip
unzip train2014.zip
unzip OpenEnded_mscoco_train2014_questions.json.zip
unzip OpenEnded_mscoco_val2014_questions.json.zip
unzip mscoco_train2014_annotations.json.zip
unzip mscoco_val2014_annotations.json.zip
Then, run conversion locally (make sure to install tensorflow-datasets for the
`tfds` util):
cd big_vision/datasets
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=okvqa
Example to load:
import tensorflow_datasets as tfds
dataset = tfds.load('okvqa', split='val', data_dir='/tmp/tfds')
"""
import json
import os
from typing import Any
import numpy as np
import tensorflow_datasets as tfds
_DESCRIPTION = """
OKVQA addresses the task of VQA with outside knowledge.
This version of the dataset contains:
- Questions + Answers from OKVQA.
- Images from COCO.
"""
_CITATION = """
@InProceedings{okvqa,
author = {Kenneth Marino and Mohammad Rastegari and Ali Farhadi and Roozbeh Mottaghi},
title = {OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge},
booktitle = {Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2019},
}
"""
ANNOTATION_FILE = {
    'train': 'mscoco_train2014_annotations.json',
    'val': 'mscoco_val2014_annotations.json',
}

QUESTIONS_FILE = {
    'train': 'OpenEnded_mscoco_train2014_questions.json',
    'val': 'OpenEnded_mscoco_val2014_questions.json',
}

QUESTION_TYPES = {
    'one': 'Vehicles and Transportation',
    'two': 'Brands, Companies and Products',
    'three': 'Objects, Material and Clothing',
    'four': 'Sports and Recreation',
    'five': 'Cooking and Food',
    'six': 'Geography, History, Language and Culture',
    'seven': 'People and Everyday life',
    'eight': 'Plants and Animals',
    'nine': 'Science and Technology',
    'ten': 'Weather and Climate',
    'other': 'Other',
}
# When running locally (recommended), copy the files as above and use this path:
_OKVQA_PATH = '/media/scratch/okvqa'
class OkVqa(tfds.core.GeneratorBasedBuilder):
  """Imports COCO images with OKVQA questions and answers."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'Changed to array record format.'}

  MANUAL_DOWNLOAD_INSTRUCTIONS = """
  In manual_dir/ you should have a directory okvqa which contains the
  following files and directories:

  From the OKVQA dataset:
  - mscoco_train2014_annotations.json
  - mscoco_val2014_annotations.json
  - OpenEnded_mscoco_train2014_questions.json
  - OpenEnded_mscoco_val2014_questions.json
  - train2014.zip
  - val2014.zip
  """
  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    features = tfds.features.FeaturesDict({
        'image': tfds.features.Image(shape=(None, None, 3)),
        'image_id': tfds.features.Scalar(dtype=np.int64),
        'answer_type': tfds.features.Text(),
        'answers': tfds.features.Sequence(tfds.features.Text()),
        'answers_confidence': tfds.features.Tensor(shape=[10], dtype=np.bool_),
        'answers_raw': tfds.features.Sequence(tfds.features.Text()),
        'question_id': tfds.features.Scalar(dtype=np.int64),
        'question_type': tfds.features.Text(),
        'question_type_readable': tfds.features.Text(),
        'question': tfds.features.Text(),
    })
    return tfds.core.DatasetInfo(
        builder=self,
        features=features,
        description=_DESCRIPTION,
        supervised_keys=None,
        homepage='https://okvqa.allenai.org/',
        citation=_CITATION,
    )
  def _split_generators(self, dl_manager: tfds.download.DownloadManager) -> ...:
    """Call the function which defines the splits."""
    # data_dir = dl_manager.manual_dir
    data_dir = _OKVQA_PATH
    return {
        'train': self._generate_examples(data_dir, 'train'),
        'val': self._generate_examples(data_dir, 'val'),
    }
  def _generate_examples(self, data_dir: str, split: str) -> ...:
    annotations = get_okvqa_annotations(data_dir, split)
    for question_id, annotation in annotations.items():
      image_id = annotation['image_id']
      # Sanity check.
      if len(annotation['answers']) != 10:
        num_answers = len(annotation['answers'])
        raise ValueError(
            f'The number of answers for {image_id} is not 10 but {num_answers}')
      feature_dict = {
          'image': self.get_image_path(data_dir, split, image_id),
          'image_id': image_id,
          'answer_type': annotation['answer_type'],
          'answers': [a['answer'] for a in annotation['answers']],
          'answers_confidence': _get_answer_confidence(annotation['answers']),
          'answers_raw': [a['raw_answer'] for a in annotation['answers']],
          'question_id': annotation['question_id'],
          'question_type': annotation['question_type'],
          'question_type_readable': QUESTION_TYPES[annotation['question_type']],
          'question': annotation['question'],
      }
      yield f'{question_id}', feature_dict
  def get_image_path(self, data_dir: str, split: str, image_id: int) -> str:
    """Returns the COCO image path, e.g. .../val2014/COCO_val2014_000000000042.jpg."""
    subdir = {'train': 'train2014', 'val': 'val2014'}[split]
    return f'{data_dir}/{subdir}/COCO_{subdir}_{image_id:012d}.jpg'
def _get_answer_confidence(answers: list[dict[str, str]]) -> np.ndarray:
  """Get OKVQA answer confidences as bool."""
  confidences = []
  for a in answers:
    confidence = a['answer_confidence']
    if confidence == 'yes':
      confidences.append(True)
    elif confidence == 'no':
      confidences.append(False)
    else:
      raise ValueError(f'Unknown confidence: {confidence}')
  return np.array(confidences, dtype=bool)
def _read_json(
    data_dir: str, file: str, key: str
) -> dict[int, dict[str, Any]]:
  """Reads a JSON file and keys the entries under `key` by question_id."""
  with open(os.path.join(data_dir, file)) as f:
    data = json.load(f)
  questions = {d['question_id']: d for d in data[key]}
  return questions
def get_okvqa_annotations(
    data_dir: str, split: str
) -> dict[int, dict[str, Any]]:
  """Returns OKVQA annotations (questions and answers) keyed by question_id."""
  questions = _read_json(data_dir, QUESTIONS_FILE[split], 'questions')
  annotations = _read_json(data_dir, ANNOTATION_FILE[split], 'annotations')
  assert len(annotations) == len(questions)
  for question_id, question in questions.items():
    assert question['image_id'] == annotations[question_id]['image_id']
    assert question['question_id'] == annotations[question_id]['question_id']
    annotations[question_id]['question'] = question['question']
  return annotations
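# A minimal local sanity check (a sketch only, not part of the builder API; the
# recommended path is the `tfds build` command in the module docstring). It
# assumes the raw files were already copied to _OKVQA_PATH as described above.
#
# if __name__ == '__main__':
#   builder = OkVqa(data_dir='/tmp/tfds')
#   builder.download_and_prepare()
#   print(builder.info.splits)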