r"""Generates COCO-35L in a TFDS-ready structure.

First, download the captions from https://google.github.io/crossmodal-3600/
and the images from https://cocodataset.org/#download.
The COCO Karpathy split is available at
http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip:

  mkdir -p /tmp/data/coco35l/images
  wget https://storage.googleapis.com/crossmodal-3600/coco_mt_train.jsonl.bz2 -P /tmp/data/coco35l
  wget https://storage.googleapis.com/crossmodal-3600/coco_mt_dev.jsonl.bz2 -P /tmp/data/coco35l
  bzip2 -dk /tmp/data/coco35l/coco_mt_train.jsonl.bz2 /tmp/data/coco35l/coco_mt_dev.jsonl.bz2
  wget http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip -P /tmp/data/coco35l
  unzip /tmp/data/coco35l/caption_datasets.zip -d /tmp/data/coco35l/
  wget http://images.cocodataset.org/zips/train2014.zip -P /tmp/data/coco35l/images
  wget http://images.cocodataset.org/zips/val2014.zip -P /tmp/data/coco35l/images
  unzip /tmp/data/coco35l/images/train2014.zip -d /tmp/data/coco35l/images/
  unzip /tmp/data/coco35l/images/val2014.zip -d /tmp/data/coco35l/images/
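
After these steps, the files used by this builder should be laid out as
follows (`dataset_coco.json` is the Karpathy split file extracted from
caption_datasets.zip):

  /tmp/data/coco35l/coco_mt_train.jsonl
  /tmp/data/coco35l/coco_mt_dev.jsonl
  /tmp/data/coco35l/dataset_coco.json
  /tmp/data/coco35l/images/train2014/
  /tmp/data/coco35l/images/val2014/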

Then, run the conversion locally (make sure to install tensorflow-datasets
for the `tfds` util):

  cd big_vision/datasets
  env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=coco35l
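
Alternatively, the dataset can be built programmatically (a sketch; assumes
this module is importable as `coco35l` from the directory above):

  import coco35l
  builder = coco35l.Coco35l(data_dir='/tmp/tfds')
  builder.download_and_prepare()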

Example to load (each language has a `train_<lang>` and a `dev_<lang>` split,
one pair per entry in LANGUAGES below):

  import tensorflow_datasets as tfds
  dataset = tfds.load('coco35l', split='dev_en', data_dir='/tmp/tfds')
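
To inspect a loaded example (a sketch; the feature keys are the ones defined
by this builder's `_info`):

  for ex in dataset.take(1):
    print(ex['image/id'].numpy().decode())  # e.g. '12345_en'
    print(ex['language'].numpy().decode())  # e.g. 'en'
    print(ex['captions'].numpy())           # array of byte-string captions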
"""

import json
import os.path

import tensorflow_datasets as tfds

_DESCRIPTION = """
COCO images and captions, with the English captions machine-translated into
34 additional languages (35 languages in total, English included).
"""

_CITATION = """
@inproceedings{thapliyal-etal-2022-crossmodal,
  title = "Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset",
  author = "Thapliyal, Ashish V. and Pont Tuset, Jordi and Chen, Xi and Soricut, Radu",
  editor = "Goldberg, Yoav and Kozareva, Zornitsa and Zhang, Yue",
  booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
  month = dec,
  year = "2022",
  address = "Abu Dhabi, United Arab Emirates",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2022.emnlp-main.45",
  doi = "10.18653/v1/2022.emnlp-main.45",
  pages = "715--729",
}
"""

# Paths matching the download commands in the module docstring.
_CAPTIONS_PATH = '/tmp/data/coco35l'
_IMAGES_PATH = '/tmp/data/coco35l/images'
_COCOCAPS_PATH = '/tmp/data/coco35l/dataset_coco.json'

LANGUAGES = [
    'ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr',
    'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja', 'ko', 'mi', 'nl', 'no', 'pl',
    'pt', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh',
]


class Coco35l(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the COCO-35L dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'First release.'}

  def _info(self):
    """Returns the metadata."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'image/id': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'captions': tfds.features.Sequence(tfds.features.Text()),
            'language': tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://google.github.io/crossmodal-3600/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    # One train and one dev split per language, e.g. 'train_en', 'dev_en'.
    splits = []
    for lang in LANGUAGES:
      splits.extend([f'train_{lang}', f'dev_{lang}'])
    return {split: self._generate_examples(split) for split in splits}

  def _generate_examples(self, split: str):
    """Yields (key, example) tuples from the dataset."""
    split, language = split.split('_')

    # Map each numeric COCO id to its image path, using the Karpathy split
    # file to learn whether the image lives under train2014 or val2014.
    id_to_path = {}
    with open(_COCOCAPS_PATH, 'r') as f:
      data = json.load(f)['images']
      for d in data:
        id_to_path[d['cocoid']] = os.path.join(
            _IMAGES_PATH, d['filepath'], d['filename']
        )

    annot_fname = os.path.join(_CAPTIONS_PATH, f'coco_mt_{split}.jsonl')
    data = {}
    with open(annot_fname, 'r') as f:
      for line in f:
        j = json.loads(line)
        image_id = f'{j["image_id"].split("_")[0]}_{language}'
        if image_id not in data:
          data[image_id] = []
        if language == 'en':
          # The English source caption is repeated on every translation line,
          # so keep it for a single, arbitrarily chosen target language only.
          if j['trg_lang'] == 'de':
            data[image_id].append(j['caption_tokenized'])
        else:
          if j['trg_lang'] == language:
            data[image_id].append(j['translation_tokenized'])

    for image_id, captions in data.items():
      yield image_id, {
          'image/id': image_id,
          'image': id_to_path[int(image_id.split('_')[0])],
          'captions': captions,
          'language': language,
      }