pranavSIT's picture
added pali inference
74e8f2f
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=line-too-long
r"""Generates GQA in a TFDS-ready structure, using Beam.
Instructions below are to generate the dataset with a *local* Beam pipeline.
It's advisable to run the Beam job on Google Cloud Dataflow, which would
significantly speed up generation; see
https://www.tensorflow.org/datasets/beam_datasets for more details. This would
involve uploading the locally downloaded data to a GCS bucket, and then
adding in the Beam pipeline options and your GCP/GCS bucket details
to the `tfds build` command below (as detailed in the link).
First, copy the data to local disk:
mkdir -p /tmp/data/gqa
wget -O /tmp/data/gqa/question1.2.zip https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip?download=true
unzip /tmp/data/gqa/question1.2.zip -d /tmp/data/gqa/question1.2
mv /tmp/data/gqa/question1.2/* /tmp/data/gqa/
wget -O /tmp/data/gqa/images.zip https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip?download=true
unzip /tmp/data/gqa/images.zip -d /tmp/data/gqa/
Then, run conversion (make sure to install tensorflow-datasets for the `tfds` util):
cd big_vision/datasets
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=gqa
Example to load:
import tensorflow_datasets as tfds
dataset = tfds.load('gqa', split='testdev_balanced', data_dir='/tmp/tfds')
Some statistics:
train_all: 14305356 examples
train_balanced: 943000 examples
val_all: 2011853 examples
val_balanced: 132062 examples
testdev_all: 172174 examples
testdev_balanced: 12578 examples
"""
import glob
import json
import os
import numpy as np
import tensorflow_datasets as tfds
# Short human-readable description passed to `tfds.core.DatasetInfo`.
_DESCRIPTION = """GQA: Visual Reasoning in the Real World."""
# pylint: disable=line-too-long
# BibTeX entry passed to `tfds.core.DatasetInfo` as the dataset citation.
# NOTE(review): the entry key reads `abs-2306-14610` while `volume`, `url`,
# `doi`, `eprint` and `biburl` all refer to arXiv 1902.09506 -- confirm which
# key is intended before relying on it.
_CITATION = """
@article{DBLP:journals/corr/abs-2306-14610,
author = {Drew Hudson and
Christopher Manning},
title = {GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
journal = {CVPR},
volume = {abs/1902.09506},
year = {2019},
url = {https://doi.org/10.48550/arXiv.1902.09506},
doi = {10.48550/arXiv.1902.09506},
eprinttype = {arXiv},
eprint = {1902.09506},
timestamp = {Tue, 25 Jun 2019 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-1902-09506},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
# pylint: enable=line-too-long
# Local directory the builder reads from: the `<split>_questions.json` files,
# the `train_all_questions/` folder and the `images/` folder are all resolved
# relative to this path.
_DATA_PATH = '/tmp/data/gqa/'
class GQA(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for GQA dataset.

  Every split is produced by an Apache Beam transform that reads the question
  JSON files and references the images stored under `_DATA_PATH`.
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'First release.'}

  def _info(self):
    """Returns the metadata (feature spec, homepage and citation)."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'example_id': tfds.features.Scalar(np.int64),
            'image/id': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'question': tfds.features.Text(),
            'answer': tfds.features.Text(),
            'full_answer': tfds.features.Text(),
            'is_balanced': tfds.features.Scalar(np.bool_),
        }),
        homepage='https://cs.stanford.edu/people/dorarad/gqa/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators.

    Args:
      dl_manager: Not used here; all inputs are read from `_DATA_PATH`.

    Returns:
      A dict mapping each split name to the result of
      `_generate_examples(split)`.
    """
    splits = [
        # 'debug',
        'train_all',
        'train_balanced',
        'testdev_all',
        'testdev_balanced',
        'val_all',
        'val_balanced',
        'challenge_all',
        'challenge_balanced',
    ]
    return {split: self._generate_examples(split) for split in splits}

  def _generate_examples(self, split: str):
    """Builds the Beam transform yielding (key, example) tuples for a split.

    Args:
      split: Name of the split. `train_all` is read from every `*.json` file
        inside `<_DATA_PATH>/train_all_questions/`; any other split is read
        from the single file `<_DATA_PATH>/<split>_questions.json`.

    Returns:
      The chained Beam transforms (`Create | FlatMap | Reshuffle | Map`) whose
      output elements are `(question_id, example)` tuples.
    """
    if split == 'train_all':
      pattern = os.path.join(_DATA_PATH, 'train_all_questions', '*.json')
      # `glob` returns files in filesystem order; sort so the pipeline input
      # is the same from run to run.
      json_files = sorted(glob.glob(pattern))
    else:
      json_files = [os.path.join(_DATA_PATH, f'{split}_questions.json')]

    def _prepare_data(json_path):
      """Loads one question file and returns its (question_id, data) pairs."""
      # JSON text is UTF-8; be explicit instead of relying on the locale.
      with open(json_path, encoding='utf-8') as f:
        annotations = json.load(f)
      return list(annotations.items())

    def _process_example(entry):
      """Converts one (question_id, data) pair into a (key, example) tuple."""
      question_id, question_data = entry
      image_id = question_data['imageId']
      example = {
          # The feature is declared as int64 while JSON object keys are
          # strings, so convert explicitly.
          'example_id': int(question_id),
          'image/id': image_id,
          # Path of the image file, handed to the `Image` feature.
          'image': os.path.join(_DATA_PATH, 'images', f'{image_id}.jpg'),
          'question': question_data['question'],
          # Entries without an answer / full answer get an empty string.
          'answer': question_data.get('answer', ''),
          'full_answer': question_data.get('fullAnswer', ''),
          'is_balanced': question_data['isBalanced'],
      }
      return question_id, example

    beam = tfds.core.lazy_imports.apache_beam
    return (
        beam.Create(json_files)
        | beam.FlatMap(_prepare_data)
        # Presumably redistributes the per-question elements across workers
        # after the file-level fan-out -- TODO confirm.
        | beam.Reshuffle()
        | beam.Map(_process_example)
    )