r"""Import VQAv2 into TFDS format. Uses coco-2014 images.
|
|
|
|
It's small data, so simple to run locally. First, download all the data:
|
|
|
|
mkdir /tmp/data/ ; cd /tmp/data
|
|
wget http://images.cocodataset.org/zips/{train2014,val2014,test2015}.zip
|
|
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_{Train,Val,Test}_mscoco.zip
|
|
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_{Train,Val}_mscoco.zip
|
|
unzip '*.zip'
|
|
|
|
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
|
|
|
cd big_vision/datasets
|
|
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=vqa
|
|
|
|
It runs at around 750 examples/sec, so takes around 25min for the 1.2M questions.
|
|
Each question is an example; images are repeated, a bit wasteful, but disk is cheap.
|
|
|
|
|
|
Example to load:
|
|
|
|
import tensorflow_datasets as tfds
|
|
dataset = tfds.load('vqa', split='train', data_dir='/tmp/tfds')
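
For a quick sanity check, one can then iterate over a few examples; this is
just a sketch, assuming the build above succeeded:

  for ex in dataset.take(1):
    print(ex['question_text'].numpy().decode(), ex['answers'].numpy())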
"""

import json
import os

import numpy as np
import tensorflow_datasets as tfds


_VQAV2_PATH = '/tmp/data'
_IMAGE_PATH = '/tmp/data'

_CITATION = (
    '@InProceedings{balanced_vqa_v2,\n'
    'author = {Yash Goyal and Tejas Khot and '
    'Douglas Summers{-}Stay and Dhruv Batra and Devi Parikh},\n'
    'title = {Making the {V} in {VQA} Matter: Elevating the Role of Image '
    'Understanding in {V}isual {Q}uestion {A}nswering},\n'
    'booktitle = {Computer Vision and Pattern Recognition (CVPR)},\n'
    'year = {2017},}')


class Vqa(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the VQAv2 dataset."""

  VERSION = tfds.core.Version('3.0.0')
  RELEASE_NOTES = {'3.0.0': 'Format as needed for PaliGemma'}

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the metadata."""
    return tfds.core.DatasetInfo(
        builder=self,
        description='The VQAv2 dataset.',
        features=tfds.features.FeaturesDict({
            'image/id': np.int32,
            'image/filename': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'question_id': np.int32,
            'question_type': tfds.features.Text(),
            'question_text': tfds.features.Text(),
            'answer_type': tfds.features.Text(),
            'answers': tfds.features.Sequence(tfds.features.Text()),
            'answer_confidences': tfds.features.Sequence(
                tfds.features.ClassLabel(names=['no', 'maybe', 'yes'])),
            'top_answer': tfds.features.Text(),
        }),
        homepage='https://visualqa.org/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {
        'train': self._generate_examples('train2014'),
        'validation': self._generate_examples('val2014'),
        'test': self._generate_examples('test2015'),
        # test-dev2015 has its own question file but shares test2015's images.
        'test-dev': self._generate_examples('test-dev2015', 'test2015'),
    }

  def _generate_examples(self, split, image_folder=None):
    """Yields (question_id, example) tuples for the given split."""
    image_folder = image_folder or split

    with open(os.path.join(
        _VQAV2_PATH, f'v2_OpenEnded_mscoco_{split}_questions.json')) as f:
      examples = json.load(f)['questions']

    # Answer annotations are only published for the train and val splits.
    if 'test' not in split:
      with open(os.path.join(
          _VQAV2_PATH, f'v2_mscoco_{split}_annotations.json')) as f:
        annots = {a['question_id']: a for a in json.load(f)['annotations']}

    for ex in examples:
      qid = ex['question_id']
      ex = {
          'image/id': ex['image_id'],
          'question_id': qid,
          'question_text': ex['question'],
      }
      fname = f'COCO_{image_folder}_{ex["image/id"]:012d}.jpg'
      ex['image/filename'] = fname
      if 'test' not in split:
        ex['image'] = os.path.join(_IMAGE_PATH, image_folder, fname)
        ann = annots[qid]
        ex['question_type'] = ann['question_type']
        ex['answer_type'] = ann['answer_type']
        ex['answers'] = [a['answer'] for a in ann['answers']]
        ex['answer_confidences'] = [a['answer_confidence']
                                    for a in ann['answers']]
        ex['top_answer'] = ann['multiple_choice_answer']
      else:
        # The test-dev questions cover only a subset of the test2015 images,
        # so skip any question whose image is not on disk.
        path = os.path.join(_IMAGE_PATH, image_folder, fname)
        if os.path.isfile(path):
          ex['image'] = path
        else:
          print(f'Skipping question {qid}: missing image {fname}')
          continue
        # No annotations for test splits; fill with empty placeholders.
        ex['question_type'] = ''
        ex['answer_type'] = ''
        ex['answers'] = []
        ex['answer_confidences'] = []
        ex['top_answer'] = ''
      yield qid, ex
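

# A minimal sketch for building the dataset programmatically instead of via
# the `tfds build` CLI; it assumes the /tmp download paths above already exist.
if __name__ == '__main__':
  Vqa(data_dir='/tmp/tfds').download_and_prepare()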