Initial Commit

Browse files

Files changed (7) hide show

.gitattributes +9 -19
README.md +225 -0
classifier.ckpt +3 -0
embedding_model.ckpt +3 -0
hyperparams.yaml +52 -0
label_encoder.txt +109 -0
normalizer.ckpt +3 -0

.gitattributes CHANGED Viewed

@@ -1,27 +1,17 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
 *.bin.* filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zstandard filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.bin.* filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,225 @@

+---
+language: multilingual
+thumbnail:
+tags:
+- audio-classification
+- speechbrain
+- embeddings
+- Language
+- Identification
+- pytorch
+- ECAPA-TDNN
+- TDNN
+- VoxLingua107
+license: "apache-2.0"
+datasets:
+- VoxLingua107
+metrics:
+- Accuracy
+widget:
+- label: English Sample
+  src: https://cdn-media.huggingface.co/speech_samples/LibriSpeech_61-70968-0000.flac
+---
+# VoxLingua107 ECAPA-TDNN Spoken Language Identification Model
+## Model description
+This is a spoken language recognition model trained on the VoxLingua107 dataset using SpeechBrain.
+The model uses the ECAPA-TDNN architecture that has previously been used for speaker recognition.
+The model can classify a speech utterance according to the language spoken.
+It covers 107 different languages (
+Abkhazian,
+Afrikaans,
+Amharic,
+Arabic,
+Assamese,
+Azerbaijani,
+Bashkir,
+Belarusian,
+Bulgarian,
+Bengali,
+Tibetan,
+Breton,
+Bosnian,
+Catalan,
+Cebuano,
+Czech,
+Welsh,
+Danish,
+German,
+Greek,
+English,
+Esperanto,
+Spanish,
+Estonian,
+Basque,
+Persian,
+Finnish,
+Faroese,
+French,
+Galician,
+Guarani,
+Gujarati,
+Manx,
+Hausa,
+Hawaiian,
+Hindi,
+Croatian,
+Haitian,
+Hungarian,
+Armenian,
+Interlingua,
+Indonesian,
+Icelandic,
+Italian,
+Hebrew,
+Japanese,
+Javanese,
+Georgian,
+Kazakh,
+Central Khmer,
+Kannada,
+Korean,
+Latin,
+Luxembourgish,
+Lingala,
+Lao,
+Lithuanian,
+Latvian,
+Malagasy,
+Maori,
+Macedonian,
+Malayalam,
+Mongolian,
+Marathi,
+Malay,
+Maltese,
+Burmese,
+Nepali,
+Dutch,
+Norwegian Nynorsk,
+Norwegian,
+Occitan,
+Panjabi,
+Polish,
+Pushto,
+Portuguese,
+Romanian,
+Russian,
+Sanskrit,
+Scots,
+Sindhi,
+Sinhala,
+Slovak,
+Slovenian,
+Shona,
+Somali,
+Albanian,
+Serbian,
+Sundanese,
+Swedish,
+Swahili,
+Tamil,
+Telugu,
+Tajik,
+Thai,
+Turkmen,
+Tagalog,
+Turkish,
+Tatar,
+Ukrainian,
+Urdu,
+Uzbek,
+Vietnamese,
+Waray,
+Yiddish,
+Yoruba,
+Mandarin Chinese).
+## Intended uses & limitations
+The model has two uses:
+  - use 'as is' for spoken language recognition
+  - use as an utterance-level feature (embedding) extractor, for creating a dedicated language ID model on your own data
+The model is trained on automatically collected YouTube data. For more
+information about the dataset, see [here](http://bark.phon.ioc.ee/voxlingua107/).
+#### How to use
+```python
+import torchaudio
+from speechbrain.pretrained import EncoderClassifier
+language_id = EncoderClassifier.from_hparams(source="TalTechNLP/voxlingua107-epaca-tdnn", savedir="tmp")
+# Download a Thai language sample from Omniglot and convert it to a suitable form
+signal = language_id.load_audio("https://omniglot.com/soundfiles/udhr/udhr_th.mp3")
+prediction =  language_id.classify_batch(signal)
+print(prediction)
+  (tensor([[0.3210, 0.3751, 0.3680, 0.3939, 0.4026, 0.3644, 0.3689, 0.3597, 0.3508,
+           0.3666, 0.3895, 0.3978, 0.3848, 0.3957, 0.3949, 0.3586, 0.4360, 0.3997,
+           0.4106, 0.3886, 0.4177, 0.3870, 0.3764, 0.3763, 0.3672, 0.4000, 0.4256,
+           0.4091, 0.3563, 0.3695, 0.3320, 0.3838, 0.3850, 0.3867, 0.3878, 0.3944,
+           0.3924, 0.4063, 0.3803, 0.3830, 0.2996, 0.4187, 0.3976, 0.3651, 0.3950,
+           0.3744, 0.4295, 0.3807, 0.3613, 0.4710, 0.3530, 0.4156, 0.3651, 0.3777,
+           0.3813, 0.6063, 0.3708, 0.3886, 0.3766, 0.4023, 0.3785, 0.3612, 0.4193,
+           0.3720, 0.4406, 0.3243, 0.3866, 0.3866, 0.4104, 0.4294, 0.4175, 0.3364,
+           0.3595, 0.3443, 0.3565, 0.3776, 0.3985, 0.3778, 0.2382, 0.4115, 0.4017,
+           0.4070, 0.3266, 0.3648, 0.3888, 0.3907, 0.3755, 0.3631, 0.4460, 0.3464,
+           0.3898, 0.3661, 0.3883, 0.3772, 0.9289, 0.3687, 0.4298, 0.4211, 0.3838,
+           0.3521, 0.3515, 0.3465, 0.4772, 0.4043, 0.3844, 0.3973, 0.4343]]), tensor([0.9289]), tensor([94]), ['th'])
+# The scores in the prediction[0] tensor can be interpreted as cosine scores between
+# the languages and the given utterance (i.e., the larger the better)
+# The identified language ISO code is given in prediction[3]
+print(prediction[3])
+  ['th']
+# Alternatively, use the utterance embedding extractor:
+emb =  language_id.encode_batch(signal)
+print(emb.shape)
+  torch.Size([1, 1, 256])
+```
+#### Limitations and bias
+Since the model is trained on VoxLingua107, it has many limitations and biases, some of which are:
+ - Its accuracy on smaller languages is probably quite limited
+ - Probably it works worse on female speech than male speech (because YouTube data includes much more male speech)
+ - Based on subjective experiments, it doesn't work well on speech with a foreign accent
+ - Probably it doesn't work well on children's speech and on persons with speech disorders
+## Training data
+The model is trained on [VoxLingua107](http://bark.phon.ioc.ee/voxlingua107/).
+VoxLingua107 is a speech dataset for training spoken language identification models.
+The dataset consists of short speech segments automatically extracted from YouTube videos and labeled according to the language of the video title and description, with some post-processing steps to filter out false positives.
+VoxLingua107 contains data for 107 languages. The total amount of speech in the training set is 6628 hours.
+The average amount of data per language is 62 hours. However, the real amount per language varies a lot. There is also a separate development set containing 1609 speech segments from 33 languages, validated by at least two volunteers to really contain the given language.
+## Training procedure
+We used [SpeechBrain](https://github.com/speechbrain/speechbrain) to train the model.
+Training recipe will be published soon.
+## Evaluation results
+Error rate: 7% on the development dataset
+### BibTeX entry and citation info
+```bibtex
+@inproceedings{valk2021slt,
+  title={{VoxLingua107}: a Dataset for Spoken Language Recognition},
+  author={J{\"o}rgen Valk and Tanel Alum{\"a}e},
+  booktitle={Proc. IEEE SLT Workshop},
+  year={2021},
+}
+```

classifier.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a70783704ef67dcccd675185f5fb96652b4d0f01b66f67e16281a2c0b1d62bc5
+size 110456

embedding_model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e947c296c59f36de13db8b4e5c120dd4d75c2d90e0b6aab3aa86d23c38fc2a8d
+size 84480206

hyperparams.yaml ADDED Viewed

	@@ -0,0 +1,52 @@

+pretrained_path: TalTechNLP/voxlingua107-epaca-tdnn
+# Feature parameters
+n_mels: 60
+left_frames: 0
+right_frames: 0
+deltas: false
+# Number of output classes (languages)
+out_n_neurons: 107
+# Functions
+compute_features: !new:speechbrain.lobes.features.Fbank
+  n_mels: 60
+  left_frames: 0
+  right_frames: 0
+  deltas: false
+embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
+  input_size: 60
+  channels: [1024, 1024, 1024, 1024, 3072]
+  kernel_sizes: [5, 3, 3, 3, 1]
+  dilations: [1, 2, 3, 4, 1]
+  attention_channels: 128
+  lin_neurons: 256
+classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
+  input_size: 256
+  out_neurons: !ref <out_n_neurons>
+mean_var_norm: !new:speechbrain.processing.features.InputNormalization
+  norm_type: sentence
+  std_norm: false
+modules:
+    compute_features: !ref <compute_features>
+    mean_var_norm: !ref <mean_var_norm>
+    embedding_model: !ref <embedding_model>
+    classifier: !ref <classifier>
+label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        embedding_model: !ref <embedding_model>
+        classifier: !ref <classifier>
+        label_encoder: !ref <label_encoder>
+    paths:
+        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        classifier: !ref <pretrained_path>/classifier.ckpt
+        label_encoder: !ref <pretrained_path>/label_encoder.txt

label_encoder.txt ADDED Viewed

	@@ -0,0 +1,109 @@

+'ab' => 0
+'af' => 1
+'am' => 2
+'ar' => 3
+'as' => 4
+'az' => 5
+'ba' => 6
+'be' => 7
+'bg' => 8
+'bn' => 9
+'bo' => 10
+'br' => 11
+'bs' => 12
+'ca' => 13
+'ceb' => 14
+'cs' => 15
+'cy' => 16
+'da' => 17
+'de' => 18
+'el' => 19
+'en' => 20
+'eo' => 21
+'es' => 22
+'et' => 23
+'eu' => 24
+'fa' => 25
+'fi' => 26
+'fo' => 27
+'fr' => 28
+'gl' => 29
+'gn' => 30
+'gu' => 31
+'gv' => 32
+'ha' => 33
+'haw' => 34
+'hi' => 35
+'hr' => 36
+'ht' => 37
+'hu' => 38
+'hy' => 39
+'ia' => 40
+'id' => 41
+'is' => 42
+'it' => 43
+'iw' => 44
+'ja' => 45
+'jw' => 46
+'ka' => 47
+'kk' => 48
+'km' => 49
+'kn' => 50
+'ko' => 51
+'la' => 52
+'lb' => 53
+'ln' => 54
+'lo' => 55
+'lt' => 56
+'lv' => 57
+'mg' => 58
+'mi' => 59
+'mk' => 60
+'ml' => 61
+'mn' => 62
+'mr' => 63
+'ms' => 64
+'mt' => 65
+'my' => 66
+'ne' => 67
+'nl' => 68
+'nn' => 69
+'no' => 70
+'oc' => 71
+'pa' => 72
+'pl' => 73
+'ps' => 74
+'pt' => 75
+'ro' => 76
+'ru' => 77
+'sa' => 78
+'sco' => 79
+'sd' => 80
+'si' => 81
+'sk' => 82
+'sl' => 83
+'sn' => 84
+'so' => 85
+'sq' => 86
+'sr' => 87
+'su' => 88
+'sv' => 89
+'sw' => 90
+'ta' => 91
+'te' => 92
+'tg' => 93
+'th' => 94
+'tk' => 95
+'tl' => 96
+'tr' => 97
+'tt' => 98
+'uk' => 99
+'ur' => 100
+'uz' => 101
+'vi' => 102
+'war' => 103
+'yi' => 104
+'yo' => 105
+'zh' => 106
+================
+'starting_index' => 0

normalizer.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99327453c38bd629b7479ea440b8efa59332d636555fa6738f1d3e360d6cad28
+size 1153