add CV 11

- README.template.md +2 -2
- generate_datasets.py +6 -0
- languages.ftl +8 -0
- test.py +1 -1
README.template.md
CHANGED

@@ -131,7 +131,7 @@ Additional fields include `accent`, `age`, `client_id`, `up_votes`, `down_votes`

 The speech material has been subdivided into portions for dev, train, test, validated, invalidated, reported and other.

-The validated data is data that has been validated with reviewers and
+The validated data is data that has been validated with reviewers and received upvotes that the data is of high quality.

 The invalidated data is data has been invalidated by reviewers
 and received downvotes indicating that the data is of low quality.

@@ -153,7 +153,7 @@ In addition, the majority of training sentences end in punctuation ( . or ? or !
 ```python
 from datasets import load_dataset

-ds = load_dataset("mozilla-foundation/{{
+ds = load_dataset("mozilla-foundation/{{DATASET_PATH}}", "en", use_auth_token=True)

 def prepare_dataset(batch):
     """Function to preprocess the dataset with the .map method"""
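For reference, once generate_datasets.py substitutes the new `{{DATASET_PATH}}` placeholder (see the script change below), the rendered snippet for this release should come out as follows. This is a sketch of the rendered output, assuming the substitution shown in generate_datasets.py:

```python
from datasets import load_dataset

# Rendered form of the templated snippet above: generate_datasets.py
# replaces {{DATASET_PATH}} with version["name"], i.e. "common_voice_11_0".
ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", use_auth_token=True)
```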
generate_datasets.py
CHANGED

@@ -54,6 +54,11 @@ VERSIONS = [
         "name": "common_voice_10_0",
         "release": "cv-corpus-10.0-2022-07-04",
     },
+    {
+        "semver": "11.0.0",
+        "name": "common_voice_11_0",
+        "release": "cv-corpus-11.0-2022-09-21",
+    },
 ]


@@ -102,6 +107,7 @@ def main():
     with open(f"README.template.md", "r") as fin:
         readme = fin.read()
     readme = readme.replace("{{NAME}}", release_stats["name"])
+    readme = readme.replace("{{DATASET_PATH}}", version["name"])

     locales = sorted(release_stats["locales"].keys())
     languages = [f"- {loc}" for loc in locales]
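To make the templating step concrete, here is a minimal sketch of what the added line does for the new VERSIONS entry. The inline `version` dict and the one-line template string are illustrative stand-ins for the script's actual loop over VERSIONS and the README file it reads:

```python
# Illustrative only: mirrors the substitution added in this commit.
version = {
    "semver": "11.0.0",
    "name": "common_voice_11_0",
    "release": "cv-corpus-11.0-2022-09-21",
}

readme = 'ds = load_dataset("mozilla-foundation/{{DATASET_PATH}}", "en", use_auth_token=True)'
readme = readme.replace("{{DATASET_PATH}}", version["name"])
print(readme)
# -> ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", use_auth_token=True)
```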
languages.ftl
CHANGED

@@ -49,6 +49,7 @@ gom = Goan Konkani
 ha = Hausa
 he = Hebrew
 hi = Hindi
+hil = Hiligaynon
 hr = Croatian
 hsb = Sorbian, Upper
 ht = Haitian

@@ -63,6 +64,7 @@ is = Icelandic
 it = Italian
 izh = Izhorian
 ja = Japanese
+jbo = Lojban
 ka = Georgian
 kaa = Karakalpak
 kab = Kabyle

@@ -71,6 +73,7 @@ ki = Kikuyu
 kk = Kazakh
 km = Khmer
 kmr = Kurmanji Kurdish
+kn = Kannada
 knn = Konkani (Devanagari)
 ko = Korean
 kpv = Komi-Zyrian

@@ -79,6 +82,8 @@ ky = Kyrgyz
 lb = Luxembourgish
 lg = Luganda
 lij = Ligurian
+ln = Lingala
+lo = Lao
 lt = Lithuanian
 lv = Latvian
 mai = Maithili

@@ -125,11 +130,13 @@ sah = Sakha
 sat = Santali (Ol Chiki)
 sc = Sardinian
 scn = Sicilian
+sdh = Southern Kurdish
 shi = Shilha
 si = Sinhala
 sk = Slovak
 skr = Saraiki
 sl = Slovenian
+snk = Soninke
 so = Somali
 sq = Albanian
 sr = Serbian

@@ -167,6 +174,7 @@ xh = Xhosa
 yi = Yiddish
 yo = Yoruba
 yue = Cantonese
+zgh = Tamazight
 zh-CN = Chinese (China)
 zh-HK = Chinese (Hong Kong)
 zh-TW = Chinese (Taiwan)
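The eight additions above are plain `code = Name` lines, so a quick sanity check is to parse the file as a key-value map and look up the new locales. This helper is a hypothetical sketch, not part of the repository, and assumes the file sticks to one `code = Name` pair per line:

```python
# Hypothetical check that the eight locales added in this commit are present.
def load_languages(path="languages.ftl"):
    names = {}
    with open(path, encoding="utf-8") as fin:
        for line in fin:
            if "=" in line:
                code, name = line.split("=", 1)
                names[code.strip()] = name.strip()
    return names

langs = load_languages()
for code in ("hil", "jbo", "kn", "ln", "lo", "sdh", "snk", "zgh"):
    print(code, "->", langs.get(code))
```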
test.py
CHANGED

@@ -1,5 +1,5 @@
 from datasets import load_dataset

-dataset = load_dataset("
+dataset = load_dataset("./common_voice_11_0", "et", split="test", use_auth_token=True)
 print(dataset)
 print(dataset[100])
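A note on running the updated smoke test: `use_auth_token=True` means a logged-in Hugging Face session is expected (e.g. via `huggingface-cli login`), since the Common Voice datasets are gated. A minimal extension of the script, assuming the usual Common Voice schema with `sentence` and `audio` columns (an assumption here, not something this diff shows):

```python
from datasets import load_dataset

# Same smoke test as test.py, plus a peek at one decoded sample.
# Assumes a logged-in HF session and the usual Common Voice columns
# ("sentence", "audio") -- assumptions, not taken from this diff.
dataset = load_dataset("./common_voice_11_0", "et", split="test", use_auth_token=True)
print(dataset)

sample = dataset[100]
print(sample["sentence"])
print(sample["audio"]["sampling_rate"], len(sample["audio"]["array"]))
```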