"""Generates xGQA in a TFDS-ready structure.
|
|
|
|
First, download the data:
|
|
mkdir -p /tmp/data/xgqa/annotations
|
|
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_bn.json -P /tmp/data/xgqa/annotations
|
|
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_de.json -P /tmp/data/xgqa/annotations
|
|
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_en.json -P /tmp/data/xgqa/annotations
|
|
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_id.json -P /tmp/data/xgqa/annotations
|
|
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_ko.json -P /tmp/data/xgqa/annotations
|
|
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_pt.json -P /tmp/data/xgqa/annotations
|
|
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_ru.json -P /tmp/data/xgqa/annotations
|
|
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_zh.json -P /tmp/data/xgqa/annotations
|
|
wget https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip -P /tmp/data/xgqa/
|
|
unzip /tmp/data/xgqa/images.zip -d /tmp/data/xgqa/
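
Note: the wget commands above fetch only the zero-shot annotation files used by
the `test_zs_*` splits. The few-shot splits additionally read per-language
few-shot annotation files (see _generate_examples below); they are assumed to
live under /tmp/data/xgqa/annotations/few_shot/<lang>/, matching the layout of
the iglue repository.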

Then, run conversion locally (make sure to install tensorflow-datasets for the
`tfds` util):

cd big_vision/datasets
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=xgqa

Example to load:

import tensorflow_datasets as tfds
dataset = tfds.load(
    'xgqa', split='test_zs_en',
    data_dir='/tmp/tfds')
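
Split names follow the pattern `test_zs_{lang}`, `test_fs_{lang}`,
`dev_fs_{lang}` and `train_fs{N}_{lang}` (see _split_generators below). As a
minimal sketch, assuming TensorFlow eager execution, one example can be
inspected with:

for ex in dataset.take(1):
  print(ex['question'].numpy().decode(), '->', ex['answer'].numpy().decode())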
"""

import json
import os

import tensorflow_datasets as tfds

_DESCRIPTION = """xGQA (uses GQA images)."""

_CITATION = (
    '@inproceedings{pfeiffer-etal-2022-xgqa,'
    'title = "x{GQA}: Cross-Lingual Visual Question Answering",'
    'author = "Pfeiffer, Jonas and'
    ' Geigle, Gregor and'
    ' Kamath, Aishwarya and'
    ' Steitz, Jan-Martin and'
    ' Roth, Stefan and'
    ' Vuli{\'c}, Ivan and'
    ' Gurevych, Iryna",'
    'booktitle = "Findings of the Association for Computational Linguistics: '
    'ACL 2022",'
    'month = may,'
    'year = "2022",'
    'address = "Dublin, Ireland",'
    'publisher = "Association for Computational Linguistics",'
    'url = "https://aclanthology.org/2022.findings-acl.196",'
    'doi = "10.18653/v1/2022.findings-acl.196",'
    'pages = "2497--2511",'
    '}'
)

_DATA_PATH = '/tmp/data/xgqa/'
_IMAGE_PATH = '/tmp/data/xgqa/images/'

LANGUAGES = frozenset(['bn', 'de', 'en', 'id', 'ko', 'pt', 'ru', 'zh'])


class XGQA(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for XGQA dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'First release.'}

  def _info(self):
    """Returns the metadata."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'example_id': tfds.features.Text(),
            'image/id': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'question': tfds.features.Text(),
            'answer': tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://github.com/adapter-hub/xGQA',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    d = dict()
    for l in LANGUAGES:
      d.update({
          f'test_zs_{l}': self._generate_examples('test', 'zero_shot', l),
          f'test_fs_{l}': self._generate_examples('test', 'few_shot', l),
          f'dev_fs_{l}': self._generate_examples('dev', 'few_shot', l),
          f'train_fs1_{l}': self._generate_examples('train_1', 'few_shot', l),
          f'train_fs5_{l}': self._generate_examples('train_5', 'few_shot', l),
          f'train_fs10_{l}': self._generate_examples('train_10', 'few_shot', l),
          f'train_fs20_{l}': self._generate_examples('train_20', 'few_shot', l),
          f'train_fs25_{l}': self._generate_examples('train_25', 'few_shot', l),
          f'train_fs48_{l}': self._generate_examples('train_48', 'few_shot', l),
      })
    return d

  def _generate_examples(self, split, num_shots, lang):
    """Yields (key, example) tuples."""
    # Few-shot annotations are stored per language and named after the split;
    # zero-shot annotations are the translated GQA testdev-balanced questions.
    if num_shots == 'few_shot':
      file_path = os.path.join(_DATA_PATH, 'annotations', 'few_shot', lang,
                               f'{split}.json')
    elif num_shots == 'zero_shot':
      file_path = os.path.join(_DATA_PATH, 'annotations', 'zero_shot',
                               f'testdev_balanced_questions_{lang}.json')
    else:
      raise ValueError(f'Unknown num_shots: {num_shots}')
    with open(file_path, 'r') as f:
      entries = json.load(f)

    # Annotations are keyed by GQA question id; suffix with the language to
    # keep example ids unique across languages.
    for question_id, question_data in entries.items():
      example_id = f'{question_id}_{lang}'
      yield example_id, {
          'example_id': example_id,
          'image/id': question_data['imageId'],
          'image': os.path.join(_IMAGE_PATH, f'{question_data["imageId"]}.jpg'),
          'question': question_data['question'],
          'answer': question_data['answer'],
      }