ydshieh
/

flax-vision-encoder-decoder-vit-gpt2-coco-en

Model card Files Files and versions

xet

Community

ydshieh committed on Dec 19, 2021

Commit

03bb4e1

1 Parent(s): 3acf293

fix

Browse files

Files changed (1) hide show

coco_dataset/coco_dataset.py +42 -19

coco_dataset/coco_dataset.py CHANGED Viewed

@@ -4,7 +4,7 @@ import os
 import datasets
-class ImageCaptionBuilderConfig(datasets.BuilderConfig):
     def __init__(self, name, splits, **kwargs):
@@ -13,48 +13,67 @@ class ImageCaptionBuilderConfig(datasets.BuilderConfig):
         self.splits = splits
-# TODO: Add BibTeX citation
 # Find for instance the citation on arxiv or on the dataset repo/website
 _CITATION = """\
-@InProceedings{None,
-    title = {COCO dataset},
-    author={...},
-    year={...}
 }
 """
-# TODO: Add description of the dataset here
 # You can copy an official description
 _DESCRIPTION = """\
 """
-# TODO: Add a link to an official homepage for the dataset here
-_HOMEPAGE = ""
-# TODO: Add the licence for the dataset here if you can find it
 _LICENSE = ""
-# TODO: Add link to the official dataset URLs here
 # The HuggingFace datasets library doesn't host the datasets but only points to the original files
 # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
 _URLs = {}
-# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
-class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
-    """TODO: Short description of my dataset."""
     VERSION = datasets.Version("0.0.0")
-    BUILDER_CONFIG_CLASS = ImageCaptionBuilderConfig
     BUILDER_CONFIGS = [
-        ImageCaptionBuilderConfig(name='2017', splits=['train', 'valid', 'test']),
     ]
     DEFAULT_CONFIG_NAME = "2017"
     def _info(self):
-        # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
         feature_dict = {
             "image_id": datasets.Value("int64"),
@@ -88,10 +107,14 @@ class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
-        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
         # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name
         data_dir = self.config.data_dir
         splits = []
         for split in self.config.splits:

 import datasets
+class COCOBuilderConfig(datasets.BuilderConfig):
     def __init__(self, name, splits, **kwargs):
         self.splits = splits
+# Add BibTeX citation
 # Find for instance the citation on arxiv or on the dataset repo/website
 _CITATION = """\
+@article{DBLP:journals/corr/LinMBHPRDZ14,
+  author    = {Tsung{-}Yi Lin and
+               Michael Maire and
+               Serge J. Belongie and
+               Lubomir D. Bourdev and
+               Ross B. Girshick and
+               James Hays and
+               Pietro Perona and
+               Deva Ramanan and
+               Piotr Doll{'{a} }r and
+               C. Lawrence Zitnick},
+  title     = {Microsoft {COCO:} Common Objects in Context},
+  journal   = {CoRR},
+  volume    = {abs/1405.0312},
+  year      = {2014},
+  url       = {http://arxiv.org/abs/1405.0312},
+  archivePrefix = {arXiv},
+  eprint    = {1405.0312},
+  timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
+  biburl    = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
 }
 """
+# Add description of the dataset here
 # You can copy an official description
 _DESCRIPTION = """\
+COCO is a large-scale object detection, segmentation, and captioning dataset.
 """
+# Add a link to an official homepage for the dataset here
+_HOMEPAGE = "http://cocodataset.org/#home"
+# Add the licence for the dataset here if you can find it
 _LICENSE = ""
+# Add link to the official dataset URLs here
 # The HuggingFace datasets library doesn't host the datasets but only points to the original files
 # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
+# This script is supposed to work with a local (downloaded) copy of the COCO dataset.
 _URLs = {}
+# The name of the dataset class usually matches the script name, in CamelCase instead of snake_case
+class COCODataset(datasets.GeneratorBasedBuilder):
+    """An example dataset script to work with the local (downloaded) COCO dataset"""
     VERSION = datasets.Version("0.0.0")
+    BUILDER_CONFIG_CLASS = COCOBuilderConfig
     BUILDER_CONFIGS = [
+        COCOBuilderConfig(name='2017', splits=['train', 'valid', 'test']),
     ]
     DEFAULT_CONFIG_NAME = "2017"
     def _info(self):
+        # This method specifies the datasets.DatasetInfo object, which contains information and feature types for the dataset
         feature_dict = {
             "image_id": datasets.Value("int64"),
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
+        # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
         # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name
         data_dir = self.config.data_dir
+        if not data_dir:
+            raise ValueError(
+                "This script is supposed to work with local (downloaded) COCO dataset. The argument `data_dir` in `load_dataset()` is required."
+            )
         splits = []
         for split in self.config.splits: