ydshieh committed on
Commit
03bb4e1
·
1 Parent(s): 3acf293
Files changed (1) hide show
  1. coco_dataset/coco_dataset.py +42 -19
coco_dataset/coco_dataset.py CHANGED
@@ -4,7 +4,7 @@ import os
4
  import datasets
5
 
6
 
7
- class ImageCaptionBuilderConfig(datasets.BuilderConfig):
8
 
9
  def __init__(self, name, splits, **kwargs):
10
 
@@ -13,48 +13,67 @@ class ImageCaptionBuilderConfig(datasets.BuilderConfig):
13
  self.splits = splits
14
 
15
 
16
- # TODO: Add BibTeX citation
17
  # Find for instance the citation on arxiv or on the dataset repo/website
18
  _CITATION = """\
19
- @InProceedings{None,
20
- title = {COCO dataset},
21
- author={...},
22
- year={...}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  """
25
 
26
- # TODO: Add description of the dataset here
27
  # You can copy an official description
28
  _DESCRIPTION = """\
29
-
30
  """
31
 
32
- # TODO: Add a link to an official homepage for the dataset here
33
- _HOMEPAGE = ""
34
 
35
- # TODO: Add the licence for the dataset here if you can find it
36
  _LICENSE = ""
37
 
38
- # TODO: Add link to the official dataset URLs here
39
  # The HuggingFace datasets library doesn't host the datasets but only points to the original files
40
  # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
 
 
41
  _URLs = {}
42
 
43
 
44
- # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
45
- class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
46
- """TODO: Short description of my dataset."""
47
 
48
  VERSION = datasets.Version("0.0.0")
49
 
50
- BUILDER_CONFIG_CLASS = ImageCaptionBuilderConfig
51
  BUILDER_CONFIGS = [
52
- ImageCaptionBuilderConfig(name='2017', splits=['train', 'valid', 'test']),
53
  ]
54
  DEFAULT_CONFIG_NAME = "2017"
55
 
56
  def _info(self):
57
- # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
58
 
59
  feature_dict = {
60
  "image_id": datasets.Value("int64"),
@@ -88,10 +107,14 @@ class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
88
 
89
  def _split_generators(self, dl_manager):
90
  """Returns SplitGenerators."""
91
- # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
92
  # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name
93
 
94
  data_dir = self.config.data_dir
 
 
 
 
95
 
96
  splits = []
97
  for split in self.config.splits:
 
4
  import datasets
5
 
6
 
7
+ class COCOBuilderConfig(datasets.BuilderConfig):
8
 
9
  def __init__(self, name, splits, **kwargs):
10
 
 
13
  self.splits = splits
14
 
15
 
16
+ # Add BibTeX citation
17
  # Find for instance the citation on arxiv or on the dataset repo/website
18
  _CITATION = """\
19
+ @article{DBLP:journals/corr/LinMBHPRDZ14,
20
+ author = {Tsung{-}Yi Lin and
21
+ Michael Maire and
22
+ Serge J. Belongie and
23
+ Lubomir D. Bourdev and
24
+ Ross B. Girshick and
25
+ James Hays and
26
+ Pietro Perona and
27
+ Deva Ramanan and
28
+ Piotr Doll{\\'{a}}r and
29
+ C. Lawrence Zitnick},
30
+ title = {Microsoft {COCO:} Common Objects in Context},
31
+ journal = {CoRR},
32
+ volume = {abs/1405.0312},
33
+ year = {2014},
34
+ url = {http://arxiv.org/abs/1405.0312},
35
+ archivePrefix = {arXiv},
36
+ eprint = {1405.0312},
37
+ timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
38
+ biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
39
+ bibsource = {dblp computer science bibliography, https://dblp.org}
40
  }
41
  """
42
 
43
+ # Add description of the dataset here
44
  # You can copy an official description
45
  _DESCRIPTION = """\
46
+ COCO is a large-scale object detection, segmentation, and captioning dataset.
47
  """
48
 
49
+ # Add a link to an official homepage for the dataset here
50
+ _HOMEPAGE = "http://cocodataset.org/#home"
51
 
52
+ # Add the licence for the dataset here if you can find it
53
  _LICENSE = ""
54
 
55
+ # Add link to the official dataset URLs here
56
  # The HuggingFace datasets library doesn't host the datasets but only points to the original files
57
  # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
58
+
59
+ # This script is supposed to work with a local (downloaded) copy of the COCO dataset.
60
  _URLs = {}
61
 
62
 
63
+ # The name of the dataset class usually matches the script name, in CamelCase instead of snake_case
64
+ class COCODataset(datasets.GeneratorBasedBuilder):
65
+ """An example dataset script to work with the local (downloaded) COCO dataset"""
66
 
67
  VERSION = datasets.Version("0.0.0")
68
 
69
+ BUILDER_CONFIG_CLASS = COCOBuilderConfig
70
  BUILDER_CONFIGS = [
71
+ COCOBuilderConfig(name='2017', splits=['train', 'valid', 'test']),
72
  ]
73
  DEFAULT_CONFIG_NAME = "2017"
74
 
75
  def _info(self):
76
+ # This method specifies the datasets.DatasetInfo object, which contains information and feature types for the dataset
77
 
78
  feature_dict = {
79
  "image_id": datasets.Value("int64"),
 
107
 
108
  def _split_generators(self, dl_manager):
109
  """Returns SplitGenerators."""
110
+ # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
111
  # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name
112
 
113
  data_dir = self.config.data_dir
114
+ if not data_dir:
115
+ raise ValueError(
116
+ "This script is supposed to work with local (downloaded) COCO dataset. The argument `data_dir` in `load_dataset()` is required."
117
+ )
118
 
119
  splits = []
120
  for split in self.config.splits: