r"""Generates XM3600 in a TFDS-ready structure.
|
|
|
|
First, download the captions from https://google.github.io/crossmodal-3600/ and the images from https://cocodataset.org/#download.
|
|
The coco Karpathy split is available at http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip:
|
|
mkdir -p /tmp/data/xm3600
|
|
wget https://google.github.io/crossmodal-3600/web-data/captions.zip -P /tmp/data/xm3600
|
|
unzip /tmp/data/xm3600/captions.zip -d /tmp/data/xm3600/
|
|
wget https://open-images-dataset.s3.amazonaws.com/crossmodal-3600/images.tgz ta-P /tmp/data/xm3600
|
|
mkdir /tmp/data/xm3600/images
|
|
tar -xzf /tmp/data/xm3600/images.tgz -C /tmp/data/xm3600/images
|
|
|
|
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
|
|
|
cd big_vision/datasets
|
|
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=xm3600
|
|
|
|
Example to load:
|
|
|
|
import tensorflow_datasets as tfds
|
|
dataset = tfds.load(
|
|
'xm3600', split='en',
|
|
data_dir='/tmp/tfds')
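
Each language code is its own split. A minimal sketch of iterating over the
decoded examples (feature names as defined in `_info` below):

for ex in dataset.take(2):
  print(ex['language'].numpy().decode(), ex['image/id'].numpy().decode())
  print([c.decode() for c in ex['captions'].numpy()])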
"""
|
|
|
|
import json
|
|
import os.path
|
|
|
|
import tensorflow_datasets as tfds
|
|
|
|
_DESCRIPTION = """
XM3600 images + captions in 36 languages (English incl.).
"""

_CITATION = """
@inproceedings{thapliyal-etal-2022-crossmodal,
    title = "Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset",
    author = "Thapliyal, Ashish V. and
      Pont Tuset, Jordi and
      Chen, Xi and
      Soricut, Radu",
    editor = "Goldberg, Yoav and
      Kozareva, Zornitsa and
      Zhang, Yue",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-main.45",
    doi = "10.18653/v1/2022.emnlp-main.45",
    pages = "715--729",
}
"""

_CAPTIONS_PATH = '/tmp/data/xm3600'
_IMAGES_PATH = '/tmp/data/xm3600/images'

XM3600_LANGUAGES = [
    'ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr',
    'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja', 'ko', 'mi', 'nl', 'no', 'pl',
    'pt', 'quz', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh'
]


class Xm3600(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for XM3600 dataset."""

  VERSION = tfds.core.Version('1.0.1')
  RELEASE_NOTES = {
      '1.0.0': 'First release.',
      '1.0.1': 'Add captions/tokenized feature to compute metrics (eg CIDEr).',
  }

  def _info(self):
    """Returns the metadata."""

    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'image/id': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'captions': tfds.features.Sequence(tfds.features.Text()),
            'captions/tokenized': tfds.features.Sequence(tfds.features.Text()),
            'language': tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://google.github.io/crossmodal-3600/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {lang: self._generate_examples(lang) for lang in XM3600_LANGUAGES}

  def _generate_examples(self, split: str):
    """Yields (key, example) tuples from dataset."""
    language = split

    annot_fname = os.path.join(_CAPTIONS_PATH, 'captions.jsonl')
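    # Each line of captions.jsonl is assumed to hold one JSON object with an
    # 'image/key' field plus one entry per language code, e.g. (illustrative
    # values only):
    #   {"image/key": "<hex image id>",
    #    "en": {"caption": [...], "caption/tokenized": [...]}, ...}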
    data = {}
    tok_data = {}
    with open(annot_fname, 'r') as f:
      for line in f:
        j = json.loads(line)
        image_id = f'{j["image/key"]}_{language}'
        captions = j[language]['caption']
        data[image_id] = captions
        tok_data[image_id] = j[language]['caption/tokenized']

    for image_id, captions in data.items():
      yield image_id, {
          'image/id': image_id,
          'image': os.path.join(_IMAGES_PATH, f'{image_id.split("_")[0]}.jpg'),
          'captions': captions,
          'captions/tokenized': tok_data[image_id],
          'language': language,
      }