|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
r"""Implements ScienceQA train/val/test-set in TFDS structure.
|
|
|
|
First, download the ScienceQA dataset from its website: https://scienceqa.github.io/#download
|
|
- mkdir -p /tmp/data/ScienceQA_DATA
|
|
- From Google Drive: https://drive.google.com/corp/drive/folders/1w8imCXWYn2LxajmGeGH_g5DaL2rabHev
|
|
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
|
- cd big_vision/datasets
|
|
- env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=science_qa
|
|
|
|
Example to load:
|
|
|
|
import tensorflow_datasets as tfds
|
|
dataset = tfds.load(
|
|
'science_qa', split='train',
|
|
data_dir='/tmp/tfds')
|
|
|
|
"""
|
|
import json
|
|
import os
|
|
|
|
import numpy as np
|
|
import tensorflow_datasets as tfds
|
|
|
|
|
|
# Human-readable description passed to `tfds.core.DatasetInfo` in `_info`.
# NOTE(review): the text says "test-set", but the builder emits the 'train',
# 'test' and 'val' splits -- confirm whether the wording should be broadened.
_DESCRIPTION = """Sci QA test-set."""
|
|
|
|
|
|
# BibTeX entry passed to `tfds.core.DatasetInfo` as the dataset citation.
_CITATION = """
|
|
@inproceedings{lu2022learn,
|
|
title={Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering},
|
|
author={Lu, Pan and Mishra, Swaroop and Xia, Tony and Qiu, Liang and Chang, Kai-Wei and Zhu, Song-Chun and Tafjord, Oyvind and Clark, Peter and Ashwin Kalyan},
|
|
booktitle={The 36th Conference on Neural Information Processing Systems (NeurIPS)},
|
|
year={2022}
|
|
}
|
|
"""
|
|
|
|
|
|
|
|
# Root directory of the manually downloaded ScienceQA data. The builder reads
# `problems.json` from here and looks up images under `<split>/<problem id>/`.
_SCIQA_PATH = '/tmp/data/ScienceQA_DATA/'
|
|
|
|
|
|
# Letters used to label answer choices: choice `i` is rendered as
# `(_ALPHABETS[i])` and the answer index is mapped to `_ALPHABETS[answer]`.
_ALPHABETS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
|
|
|
|
|
class ScienceQA(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for ScienceQA dataset.

  Reads `problems.json` and the per-problem image folders from `_SCIQA_PATH`
  and emits the 'train', 'test' and 'val' splits. Only problems that come with
  an image are emitted; problems without one are skipped.
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'First release.'}

  def _info(self):
    """Returns the metadata."""

    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'question': tfds.features.Text(),
            'choices': tfds.features.Sequence(tfds.features.Text()),
            'answer': tfds.features.Scalar(np.int32),
            'hint': tfds.features.Text(),
            'task': tfds.features.Text(),
            'grade': tfds.features.Text(),
            'subject': tfds.features.Text(),
            'topic': tfds.features.Text(),
            'category': tfds.features.Text(),
            'skill': tfds.features.Text(),
            'lecture': tfds.features.Text(),
            'solution': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='png'),
            'indexed_choices': tfds.features.Text(),
            'indexed_answer': tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://github.com/lupantech/ScienceQA/tree/main',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {
        split: self._generate_examples(split)
        for split in ('train', 'test', 'val')
    }

  def _generate_examples(self, split):
    """Yields (key, example) tuples for the requested split.

    Args:
      split: Name of the split to emit; compared against the 'split' field of
        every entry in `problems.json`.

    Yields:
      `(int(problem_id), example)` pairs, where `example` matches the features
      declared in `_info`. Entries whose 'image' field is empty are skipped.
    """
    annot_fname = os.path.join(_SCIQA_PATH, 'problems.json')

    # Decode explicitly as UTF-8 so parsing does not depend on the locale's
    # default encoding.
    with open(annot_fname, 'r', encoding='utf-8') as f:
      data = json.load(f)

    for k, v in data.items():
      if v['split'] != split:
        continue

      image = v['image']
      if not image:
        # Text-only problem: no image to attach, so it is not emitted.
        continue

      choices = v['choices']
      answer = v['answer']

      yield int(k), {
          'question': v['question'],
          'choices': choices,
          'answer': answer,
          # An empty hint is replaced by a placeholder string.
          'hint': v['hint'] or 'N/A',
          'task': v['task'],
          'grade': v['grade'],
          'subject': v['subject'],
          'topic': v['topic'],
          'category': v['category'],
          'skill': v['skill'],
          'lecture': v['lecture'],
          'solution': v['solution'],
          # Images live under <root>/<split>/<problem id>/<file name>.
          'image': os.path.join(_SCIQA_PATH, split, k, image),
          # E.g. "(A) first choice, (B) second choice".
          'indexed_choices': ', '.join(
              f'({_ALPHABETS[i]}) {c}' for i, c in enumerate(choices)
          ),
          'indexed_answer': _ALPHABETS[int(answer)],
      }
|
|
|