# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=line-too-long
r"""Implements ScienceQA train/val/test-set in TFDS structure.
First, download the science QA dataset from their website https://scienceqa.github.io/#download
- mkdir -p /tmp/data/ScienceQA_DATA
- From Google Drive: https://drive.google.com/corp/drive/folders/1w8imCXWYn2LxajmGeGH_g5DaL2rabHev
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
- cd big_vision/datasets
- env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=science_qa
Example to load:
import tensorflow_datasets as tfds
dataset = tfds.load(
'science_qa', split='train',
data_dir='/tmp/tfds')
"""
import json
import os
import numpy as np
import tensorflow_datasets as tfds
_DESCRIPTION = """Sci QA test-set."""
# pylint: disable=line-too-long
_CITATION = """
@inproceedings{lu2022learn,
title={Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering},
author={Lu, Pan and Mishra, Swaroop and Xia, Tony and Qiu, Liang and Chang, Kai-Wei and Zhu, Song-Chun and Tafjord, Oyvind and Clark, Peter and Ashwin Kalyan},
booktitle={The 36th Conference on Neural Information Processing Systems (NeurIPS)},
year={2022}
}
"""
# pylint: enable=line-too-long
# When running locally (recommended), copy files as above and use these:
# Root directory holding problems.json and the per-split image subfolders.
_SCIQA_PATH = '/tmp/data/ScienceQA_DATA/'
# _IMAGE_COCO_PATH = '/tmp/data/val2014'
# Letters used to render multiple-choice options, e.g. "(A) ... (B) ...".
_ALPHABETS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
class ScienceQA(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the ScienceQA dataset.

  Converts the locally downloaded ScienceQA annotations (problems.json) into
  TFDS train/val/test splits. Only problems that reference an image are kept;
  see `_generate_examples` for details.
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'First release.'}

  def _info(self):
    """Returns the dataset metadata (feature spec, homepage, citation)."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'question': tfds.features.Text(),
            'choices': tfds.features.Sequence(tfds.features.Text()),
            'answer': tfds.features.Scalar(np.int32),
            'hint': tfds.features.Text(),
            'task': tfds.features.Text(),
            'grade': tfds.features.Text(),
            'subject': tfds.features.Text(),
            'topic': tfds.features.Text(),
            'category': tfds.features.Text(),
            'skill': tfds.features.Text(),
            'lecture': tfds.features.Text(),
            'solution': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='png'),
            # Choices rendered as one string, e.g. "(A) foo, (B) bar".
            'indexed_choices': tfds.features.Text(),
            # The answer as a letter indexing into `choices`, e.g. "B".
            'indexed_answer': tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://github.com/lupantech/ScienceQA/tree/main',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns a mapping of split name -> example generator."""
    return {
        split: self._generate_examples(split)
        for split in ('train', 'test', 'val')
    }

  def _generate_examples(self, split):
    """Yields (key, example) tuples for the requested split.

    Args:
      split: One of 'train', 'val', 'test'; matched against each problem's
        'split' field in problems.json.

    Yields:
      (int key, dict) pairs matching the FeaturesDict declared in `_info`.
    """
    annot_fname = os.path.join(_SCIQA_PATH, 'problems.json')
    with open(annot_fname, 'r') as f:
      data = json.load(f)
    for k, v in data.items():
      if v['split'] != split:  # e.g. "split":"train"
        continue
      image = v['image']
      # ScienceQA contains examples without an image as well. As this
      # conversion is for VQA tasks, we drop the examples without an image.
      # TODO: Include the examples without image, and update the
      # downstream pipeline to skip the examples without image, instead of
      # doing it at pre-processing.
      if not image:
        continue
      image = os.path.join(_SCIQA_PATH, split, k, image)
      choices = v['choices']
      answer = v['answer']
      # Align with the original github implementation: empty hint -> 'N/A'.
      hint = v['hint'] or 'N/A'
      indexed_choices = ', '.join(
          f'({_ALPHABETS[i]}) {c}' for i, c in enumerate(choices)
      )
      indexed_answer = _ALPHABETS[int(answer)]
      yield int(k), {
          'question': v['question'],
          'choices': choices,
          'answer': answer,
          'hint': hint,
          'task': v['task'],
          'grade': v['grade'],
          'subject': v['subject'],
          'topic': v['topic'],
          'category': v['category'],
          'skill': v['skill'],
          'lecture': v['lecture'],
          'solution': v['solution'],
          'image': image,
          'indexed_choices': indexed_choices,
          'indexed_answer': indexed_answer,
      }