|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
r"""Creates TFDS dataset for SciCap.
|
|
|
|
Preparing the data:
|
|
1) mkdir /tmp/data/scicap && cd /tmp/data/scicap
|
|
2) wget 'https://www.dropbox.com/s/t1sjqesl0pynaxo/scicap_data.zip?dl=0'
|
|
3) unzip -UU 'scicap_data.zip?dl=0' && rm 'scicap_data.zip?dl=0'
|
|
|
|
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
|
|
|
cd big_vision/datasets
|
|
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=scicap
|
|
|
|
Example to load:
|
|
|
|
import tensorflow_datasets as tfds
|
|
dataset = tfds.load('scicap', split='train', data_dir='/tmp/tfds')
|
|
"""
|
|
|
|
import enum
|
|
import functools
|
|
import json
|
|
import os
|
|
|
|
import tensorflow_datasets as tfds
|
|
|
|
|
|
# Short description passed to `tfds.core.DatasetInfo` as `description`.
_DESCRIPTION = """SciCap dataset."""

# BibTeX entry passed to `tfds.core.DatasetInfo` as `citation`.
_CITATION = """
@article{hsu2021scicap,
title={SciCap: Generating captions for scientific figures},
author={Hsu, Ting-Yao and Giles, C Lee and Huang, Ting-Hao'Kenneth'},
journal={arXiv preprint arXiv:2110.11624},
year={2021}
}
"""

# Root directory of the unzipped SciCap data. All caption, image and
# file-index paths used in this module are resolved relative to it.
_SCICAP_DIR = "/tmp/data/scicap/scicap_data"
|
|
|
|
|
|
class ScicapSubset(enum.Enum):
  """Caption-selection variants of SciCap that a builder config can target."""

  # Captions consisting of a single sentence.
  SINGLE_SENTENCE = "single_sentence"

  # Only the first sentence of each caption.
  FIRST_SENTENCE = "first_sentence"

  # Captions with at most 100 tokens.
  LEQ_100_TOKENS = "leq_100_tokens"
|
|
|
|
# Split names; each one is both a TFDS split and a directory name on disk.
_SPLITS_TO_GENERATE = ["train", "test", "val"]

# Maps (caption subset, subfigures allowed?) to the directory, relative to
# "List-of-Files-for-Each-Experiments", that holds the per-split file index
# of that configuration.
_CONFIG_TO_IDS_PATH = {
    (ScicapSubset.SINGLE_SENTENCE, True):
        "Single-Sentence-Caption/Yes-Subfig",
    (ScicapSubset.SINGLE_SENTENCE, False):
        "Single-Sentence-Caption/No-Subfig",
    (ScicapSubset.FIRST_SENTENCE, True):
        "First-Sentence/Yes-Subfig",
    (ScicapSubset.FIRST_SENTENCE, False):
        "First-Sentence/No-Subfig",
    (ScicapSubset.LEQ_100_TOKENS, True):
        "Caption-No-More-Than-100-Tokens/Yes-Subfig",
    (ScicapSubset.LEQ_100_TOKENS, False):
        "Caption-No-More-Than-100-Tokens/No-Subfig",
}

# Maps "subfigures allowed?" to the image directory under the data root.
_SUBFIG_TO_PATH = {
    True: "SciCap-Yes-Subfig-Img",
    False: "SciCap-No-Subfig-Img",
}
|
|
|
|
|
|
class ScicapConfig(tfds.core.BuilderConfig):
  """Configuration for SciCap caption length and subfigure inclusion."""

  def __init__(self, *, subset: ScicapSubset, subfig: bool, **kwargs):
    """Parameters specifying how the dataset will be processed.

    Args:
      subset: Subset of the SciCap data (a `ScicapSubset` member).
      subfig: Whether or not figures with subfigures are included.
      **kwargs: Passed on to the constructor of `BuilderConfig`.
    """
    super().__init__(**kwargs)
    # Stored as plain attributes; the builder reads them back through
    # `self.builder_config` to pick the file index and image directory.
    self.subset = subset
    self.subfig = subfig
|
|
|
|
|
|
# NOTE: `functools.cache` is unbounded, so every annotation that is read stays
# in memory for the lifetime of the process. Presumably this is meant to let
# several builder configs built in one process share the reads -- confirm
# before bounding or removing it.
@functools.cache
def _read_annotations(split: str, image_id: str):
  """Reads the caption annotations of a single figure.

  Args:
    split: Name of the split directory under "SciCap-Caption-All".
    image_id: Figure file name without its extension; ".json" is appended to
      locate the annotation file.

  Returns:
    The parsed content of the annotation JSON file.

  Raises:
    OSError: If the annotation file cannot be opened.
    json.JSONDecodeError: If the file does not contain valid JSON.
  """
  fname = os.path.join(
      _SCICAP_DIR, "SciCap-Caption-All", split, image_id + ".json")
  # Decode explicitly as UTF-8 so the result does not depend on the locale
  # of the machine running the conversion.
  with open(fname, "r", encoding="utf-8") as fin:
    return json.load(fin)
|
|
|
|
|
|
class Scicap(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the SciCap dataset.

  Reads an already unzipped local copy of SciCap under `_SCICAP_DIR` (nothing
  is downloaded) and offers one builder config per combination of caption
  subset and subfigure inclusion.
  """

  VERSION = tfds.core.Version("1.0.0")
  RELEASE_NOTES = {"1.0.0": "First release."}

  # NOTE: keep the order stable -- TFDS treats the first entry as the default
  # config when none is requested explicitly.
  BUILDER_CONFIGS = [
      ScicapConfig(
          name="single_sentence_subfig_yes",
          description="Single sentence caption with subfigures allowed.",
          subset=ScicapSubset.SINGLE_SENTENCE,
          subfig=True
      ),
      ScicapConfig(
          name="single_sentence_subfig_no",
          description="Single sentence caption with subfigures not allowed.",
          subset=ScicapSubset.SINGLE_SENTENCE,
          subfig=False
      ),
      ScicapConfig(
          name="first_sentence_subfig_yes",
          description="First sentence of captions with subfigures allowed.",
          subset=ScicapSubset.FIRST_SENTENCE,
          subfig=True
      ),
      ScicapConfig(
          name="first_sentence_subfig_no",
          description="First sentence of captions with subfigures not allowed.",
          subset=ScicapSubset.FIRST_SENTENCE,
          subfig=False
      ),
      ScicapConfig(
          name="leq_100_tokens_subfig_yes",
          description="Captions with <= 100 tokens with subfigures allowed.",
          subset=ScicapSubset.LEQ_100_TOKENS,
          subfig=True
      ),
      ScicapConfig(
          name="leq_100_tokens_subfig_no",
          description=("Captions with <= 100 tokens with subfigures"
                       " not allowed."),
          subset=ScicapSubset.LEQ_100_TOKENS,
          subfig=False
      ),
  ]

  def _info(self):
    """Returns the dataset metadata (features, homepage and citation)."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "image/id": tfds.features.Text(),
            "image/filename": tfds.features.Text(),
            "image": tfds.features.Image(encoding_format="png"),
            "caption/originally_extracted": tfds.features.Text(),
            "caption/lowercase_and_token_and_remove_figure_index":
                tfds.features.Text(),
            "caption/normalized/basic_num": tfds.features.Text(),
            "caption/normalized/advanced_equation_bracket":
                tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage="https://github.com/tingyaohsu/SciCap",
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns a dict mapping each split name to its example generator.

    Args:
      dl_manager: Unused; all files are read from the local `_SCICAP_DIR`.
    """
    del dl_manager  # Nothing to download, the data is read from disk.
    return {split: self._generate_examples(split)
            for split in _SPLITS_TO_GENERATE}

  def _generate_examples(self, split: str):
    """Yields (key, example) tuples for the given split.

    Args:
      split: Name of the split directory to read, one of
        `_SPLITS_TO_GENERATE`.

    Yields:
      Pairs of (image file name, feature dict). The "image" feature is the
      path of the PNG file on disk.

    Raises:
      ValueError: If an entry of the file index does not end in ".png".
    """
    config = self.builder_config
    config_path = _CONFIG_TO_IDS_PATH[(config.subset, config.subfig)]
    image_path = os.path.join(
        _SCICAP_DIR, _SUBFIG_TO_PATH[config.subfig], split)
    id_list_fname = os.path.join(
        _SCICAP_DIR, "List-of-Files-for-Each-Experiments",
        config_path, split, "file_idx.json")
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(id_list_fname, "r", encoding="utf-8") as fin:
      split_images = json.load(fin)

    for fname in split_images:
      # Raise instead of `assert`: asserts are stripped under `python -O` and
      # a malformed index entry must never slip through silently.
      if not fname.endswith(".png"):
        raise ValueError(
            f"Expected a .png file name in {id_list_fname}, got {fname!r}.")
      image_id = fname[:-len(".png")]
      annotations = _read_annotations(split, image_id)
      normalized = annotations["2-normalized"]
      yield fname, {
          "image/id": image_id,
          "image/filename": fname,
          "image": os.path.join(image_path, fname),
          "caption/originally_extracted":
              annotations["0-originally-extracted"],
          "caption/lowercase_and_token_and_remove_figure_index":
              annotations["1-lowercase-and-token-and-remove-figure-index"][
                  "caption"],
          "caption/normalized/basic_num":
              normalized["2-1-basic-num"]["caption"],
          # NOTE(review): "euqation" is kept verbatim; it presumably mirrors
          # the key spelling in the annotation files -- verify against the
          # data before "fixing" it.
          "caption/normalized/advanced_equation_bracket":
              normalized["2-2-advanced-euqation-bracket"]["caption"],
      }
|
|
|