|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
r"""Import widgetcap into TFDS format.
|
|
|
|
Widget Captioning all requires images from the RICO dataset:
|
|
mkdir -p /tmp/data/rico_images ; cd /tmp/data/rico_images
|
|
wget
|
|
https://storage.googleapis.com/crowdstf-rico-uiuc-4540/rico_dataset_v0.1/unique_uis.tar.gz
|
|
tar xvfz unique_uis.tar.gz
|
|
rm unique_uis.tar.gz
|
|
|
|
Widget Captioning:
|
|
mkdir - /tmp/data/widget_captioning ; cd /tmp/data/widget_captioning
|
|
git clone https://github.com/google-research-datasets/widget-caption.git
|
|
cp widget-caption/widget_captions.csv ./
|
|
cp widget-caption/split/*.txt ./
|
|
rm -rf widget-caption
|
|
|
|
Then, run conversion locally (make sure to install tensorflow-datasets for the
|
|
`tfds` util):
|
|
|
|
cd big_vision/datasets
|
|
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=widgetcap
|
|
|
|
Example to load:
|
|
|
|
import tensorflow_datasets as tfds
|
|
dataset_augmented = tfds.load('widgetcap', split='train',
|
|
data_dir='/tmp/tfds')
|
|
"""
|
|
import csv
|
|
import json
|
|
import os
|
|
|
|
import numpy as np
|
|
from PIL import Image
|
|
import tensorflow_datasets as tfds
|
|
|
|
_DATASET_DIR = '/tmp/data/widget_captioning'
|
|
|
|
_RICO_CANVAS_Y = 2560
|
|
_IMAGE_DIR = '/tmp/data/rico_images/combined'
|
|
|
|
_CITATION = (
|
|
'@inproceedings{Li2020WidgetCG,title={Widget Captioning: Generating Natural'
|
|
' Language Description for MobileUser Interface Elements},author={Y. Li and'
|
|
' Gang Li and Luheng He and Jingjie Zheng and Hong Li and Zhiwei'
|
|
' Guan},booktitle={Conference on Empirical Methods in Natural Language'
|
|
' Processing},year={2020},}'
|
|
)
|
|
|
|
|
|
class Widgetcap(tfds.core.GeneratorBasedBuilder):
|
|
"""DatasetBuilder for widgetcap dataset."""
|
|
|
|
VERSION = tfds.core.Version('1.0.0')
|
|
RELEASE_NOTES = {'1.0.0': 'Format as needed for PaliGemma'}
|
|
|
|
def _info(self) -> tfds.core.DatasetInfo:
|
|
"""Returns the metadata."""
|
|
|
|
return tfds.core.DatasetInfo(
|
|
builder=self,
|
|
description='The widgetcap dataset.',
|
|
features=tfds.features.FeaturesDict({
|
|
'image/id': tfds.features.Text(),
|
|
'image/filename': tfds.features.Text(),
|
|
'image': tfds.features.Image(encoding_format='jpeg'),
|
|
'texts': tfds.features.Sequence(tfds.features.Text()),
|
|
'bbox': tfds.features.BBoxFeature(),
|
|
'screen_id': tfds.features.Text(),
|
|
'node_id': tfds.features.Text(),
|
|
'height': np.int32,
|
|
'width': np.int32,
|
|
}),
|
|
homepage='https://github.com/google-research-datasets/widget-caption',
|
|
citation=_CITATION,
|
|
)
|
|
|
|
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
|
|
"""Returns SplitGenerators."""
|
|
return {
|
|
'train': self._generate_examples('train'),
|
|
'dev': self._generate_examples('dev'),
|
|
'test': self._generate_examples('test'),
|
|
}
|
|
|
|
def _generate_examples(self, split):
|
|
"""Yields (key, example) tuples from the dataset."""
|
|
split_screen_ids = set()
|
|
with open(os.path.join(_DATASET_DIR, split + '.txt')) as f:
|
|
for line in f:
|
|
split_screen_ids.add(line.strip())
|
|
|
|
with open(os.path.join(_DATASET_DIR, 'widget_captions.csv')) as f:
|
|
for row in csv.DictReader(f):
|
|
if row['screenId'] in split_screen_ids:
|
|
id_, example = self._get_example(
|
|
row['screenId'], row['nodeId'], row['captions']
|
|
)
|
|
yield id_, example
|
|
|
|
def _get_node_box(self, screen_id, node_id, height):
|
|
index_list = [int(i) for i in node_id.split('.')[1:]]
|
|
with open(os.path.join(_IMAGE_DIR, screen_id + '.json')) as f:
|
|
view = json.load(f)
|
|
curr_node = view['activity']['root']
|
|
for index in index_list:
|
|
curr_node = curr_node['children'][index]
|
|
normalized_bounds = map(
|
|
lambda x: x * height / _RICO_CANVAS_Y, curr_node['bounds']
|
|
)
|
|
return normalized_bounds
|
|
|
|
def _get_example(self, screen_id, node_id, captions):
|
|
image = Image.open(os.path.join(_IMAGE_DIR, screen_id + '.jpg'))
|
|
width, height = image.size
|
|
|
|
xmin, ymin, xmax, ymax = self._get_node_box(screen_id, node_id, height)
|
|
|
|
image_id = f'{screen_id}_{node_id}'
|
|
example = {
|
|
'image/id': image_id,
|
|
'image/filename': screen_id + '.jpg',
|
|
'image': os.path.join(_IMAGE_DIR, screen_id + '.jpg'),
|
|
'texts': captions.split('|'),
|
|
'bbox': tfds.features.BBox(
|
|
ymin=ymin / height,
|
|
xmin=xmin / width,
|
|
ymax=ymax / height,
|
|
xmax=xmax / width,
|
|
),
|
|
'screen_id': screen_id,
|
|
'node_id': node_id,
|
|
'height': height,
|
|
'width': width,
|
|
}
|
|
return image_id, example
|
|
|