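# NOTE(review): the three lines below look like page-header residue from a web
# copy of this file (not Python); kept as comments so the module parses.
# pranavSIT's picture
# added pali inference
# 74e8f2f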
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=line-too-long
r"""Creates TFDS dataset for SciCap.
Preparing the data:
  1) mkdir /tmp/data/scicap && cd /tmp/data/scicap
  2) wget 'https://www.dropbox.com/s/t1sjqesl0pynaxo/scicap_data.zip?dl=0'
  3) unzip -UU 'scicap_data.zip?dl=0' && rm 'scicap_data.zip?dl=0'

Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):

  cd big_vision/datasets
  env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=scicap

Example to load:

  import tensorflow_datasets as tfds
  dataset = tfds.load('scicap', split='train', data_dir='/tmp/tfds')
"""
# pylint: enable=line-too-long
import enum
import functools
import json
import os
import tensorflow_datasets as tfds
# Description and BibTeX citation passed to `tfds.core.DatasetInfo`.
_DESCRIPTION = """SciCap dataset."""
_CITATION = """
@article{hsu2021scicap,
title={SciCap: Generating captions for scientific figures},
author={Hsu, Ting-Yao and Giles, C Lee and Huang, Ting-Hao'Kenneth'},
journal={arXiv preprint arXiv:2110.11624},
year={2021}
}
"""
# When running locally (recommended), copy files as above and use these:
# every image, caption and file-list path is joined onto this directory.
_SCICAP_DIR = "/tmp/data/scicap/scicap_data"
class ScicapSubset(enum.Enum):
  """Caption variants of SciCap that a builder config can select."""

  # Captions that consist of a single sentence.
  SINGLE_SENTENCE = "single_sentence"
  # First sentence of each caption.
  FIRST_SENTENCE = "first_sentence"
  # Captions with at most 100 tokens.
  LEQ_100_TOKENS = "leq_100_tokens"
# Splits produced by the builder; each name is also used as a directory name
# when locating images, captions and file lists.
_SPLITS_TO_GENERATE = ["train", "test", "val"]

# (subset, subfig) -> directory, relative to
# "List-of-Files-for-Each-Experiments", holding the per-split file lists.
_CONFIG_TO_IDS_PATH = {
    (ScicapSubset.SINGLE_SENTENCE, True):
        "Single-Sentence-Caption/Yes-Subfig",
    (ScicapSubset.SINGLE_SENTENCE, False):
        "Single-Sentence-Caption/No-Subfig",
    (ScicapSubset.FIRST_SENTENCE, True):
        "First-Sentence/Yes-Subfig",
    (ScicapSubset.FIRST_SENTENCE, False):
        "First-Sentence/No-Subfig",
    (ScicapSubset.LEQ_100_TOKENS, True):
        "Caption-No-More-Than-100-Tokens/Yes-Subfig",
    (ScicapSubset.LEQ_100_TOKENS, False):
        "Caption-No-More-Than-100-Tokens/No-Subfig",
}

# subfig flag -> image directory below `_SCICAP_DIR`.
_SUBFIG_TO_PATH = {
    True: "SciCap-Yes-Subfig-Img",
    False: "SciCap-No-Subfig-Img",
}
class ScicapConfig(tfds.core.BuilderConfig):
  """Configuration for SciCap caption length and subfigure inclusion."""

  def __init__(self, *, subset: ScicapSubset, subfig: bool, **kwargs):
    """Parameters specifying how the dataset will be processed.

    Args:
      subset: Subset of the SciCap data (see `ScicapSubset`).
      subfig: Whether or not figures with subfigures are included.
      **kwargs: Passed on to the constructor of `BuilderConfig`.
    """
    super().__init__(**kwargs)
    self.subset = subset
    self.subfig = subfig
@functools.cache
def _read_annotations(split: str, image_id: str):
  """Loads the caption annotations of a single figure.

  Reads `<_SCICAP_DIR>/SciCap-Caption-All/<split>/<image_id>.json`.

  Results are memoized by `functools.cache`, so repeated calls with the same
  arguments return the very same object; callers should treat it as read-only.

  Args:
    split: Name of the split directory to read from.
    image_id: Image file name without its ".png" extension.

  Returns:
    The parsed JSON content of the annotation file.

  Raises:
    OSError: If the annotation file cannot be opened.
    json.JSONDecodeError: If the file does not contain valid JSON.
  """
  fname = os.path.join(
      _SCICAP_DIR, "SciCap-Caption-All", split, image_id + ".json")
  # Decode as UTF-8 explicitly instead of relying on the platform locale.
  with open(fname, "r", encoding="utf-8") as fin:
    return json.load(fin)
class Scicap(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the SciCap dataset.

  Nothing is downloaded by this builder: images, captions and the per-config
  file lists are read from the local directory `_SCICAP_DIR`. There is one
  builder config for every combination of caption subset (`ScicapSubset`) and
  subfigure setting.
  """

  VERSION = tfds.core.Version("1.0.0")
  RELEASE_NOTES = {"1.0.0": "First release."}
  BUILDER_CONFIGS = [
      ScicapConfig(
          name="single_sentence_subfig_yes",
          description="Single sentence caption with subfigures allowed.",
          subset=ScicapSubset.SINGLE_SENTENCE,
          subfig=True,
      ),
      ScicapConfig(
          name="single_sentence_subfig_no",
          description="Single sentence caption with subfigures not allowed.",
          subset=ScicapSubset.SINGLE_SENTENCE,
          subfig=False,
      ),
      ScicapConfig(
          name="first_sentence_subfig_yes",
          description="First sentence of captions with subfigures allowed.",
          subset=ScicapSubset.FIRST_SENTENCE,
          subfig=True,
      ),
      ScicapConfig(
          name="first_sentence_subfig_no",
          description=("First sentence of captions with subfigures"
                       " not allowed."),
          subset=ScicapSubset.FIRST_SENTENCE,
          subfig=False,
      ),
      ScicapConfig(
          name="leq_100_tokens_subfig_yes",
          description="Captions with <= 100 tokens with subfigures allowed.",
          subset=ScicapSubset.LEQ_100_TOKENS,
          subfig=True,
      ),
      ScicapConfig(
          name="leq_100_tokens_subfig_no",
          description=("Captions with <= 100 tokens with subfigures"
                       " not allowed."),
          subset=ScicapSubset.LEQ_100_TOKENS,
          subfig=False,
      ),
  ]

  def _info(self):
    """Returns the dataset metadata (features, homepage and citation)."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "image/id": tfds.features.Text(),
            "image/filename": tfds.features.Text(),
            "image": tfds.features.Image(encoding_format="png"),
            "caption/originally_extracted": tfds.features.Text(),
            "caption/lowercase_and_token_and_remove_figure_index":
                tfds.features.Text(),
            "caption/normalized/basic_num": tfds.features.Text(),
            "caption/normalized/advanced_equation_bracket":
                tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage="https://github.com/tingyaohsu/SciCap",
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns a mapping from split name to its example generator.

    Args:
      dl_manager: Unused; all data is read from `_SCICAP_DIR`.
    """
    return {split: self._generate_examples(split)
            for split in _SPLITS_TO_GENERATE}

  def _generate_examples(self, split: str):
    """Yields (key, example) tuples for the given split.

    The image file names of the split are read from the `file_idx.json` list
    that belongs to the active builder config; the captions of each image are
    loaded with `_read_annotations`.

    Args:
      split: Name of the split to generate (see `_SPLITS_TO_GENERATE`).

    Yields:
      `(fname, example)` pairs keyed by the image file name. The "image"
      feature is given as the path of the PNG file.

    Raises:
      ValueError: If a listed file name does not end in ".png".
    """
    config = self.builder_config
    config_path = _CONFIG_TO_IDS_PATH[(config.subset, config.subfig)]
    image_path = os.path.join(
        _SCICAP_DIR, _SUBFIG_TO_PATH[config.subfig], split)
    id_list_fname = os.path.join(
        _SCICAP_DIR, "List-of-Files-for-Each-Experiments",
        config_path, split, "file_idx.json")
    # Decode as UTF-8 explicitly instead of relying on the platform locale.
    with open(id_list_fname, "r", encoding="utf-8") as fin:
      split_images = json.load(fin)

    for fname in split_images:
      # Raise instead of `assert`: assertions are stripped under `python -O`,
      # and the slice below would then silently cut a wrong suffix.
      if not fname.endswith(".png"):
        raise ValueError(
            f"Expected a '.png' file name in {id_list_fname}, got {fname!r}.")
      image_id = fname[:-len(".png")]
      annotations = _read_annotations(split, image_id)
      normalized = annotations["2-normalized"]
      yield fname, {
          "image/id": image_id,
          "image/filename": fname,
          "image": os.path.join(image_path, fname),
          # Read directly from this key, not from a nested "caption" entry.
          "caption/originally_extracted":
              annotations["0-originally-extracted"],
          "caption/lowercase_and_token_and_remove_figure_index":
              annotations["1-lowercase-and-token-and-remove-figure-index"][
                  "caption"],
          "caption/normalized/basic_num":
              normalized["2-1-basic-num"]["caption"],
          # NOTE(review): "euqation" is kept exactly as written because it is
          # a lookup key; presumably it mirrors the spelling in the annotation
          # files. Verify against the data before changing it.
          "caption/normalized/advanced_equation_bracket":
              normalized["2-2-advanced-euqation-bracket"]["caption"],
      }