asadullah797 committed on
Commit
dcf4b2a
·
verified ·
1 Parent(s): 9b852eb

Push model using huggingface_hub.

Browse files
Files changed (3) hide show
  1. README.md +6 -44
  2. config.json +3 -98
  3. model.safetensors +1 -1
README.md CHANGED
@@ -4,50 +4,12 @@ pipeline_tag: audio-classification
4
  tags:
5
  - automatic-speech-recognition
6
  - emotion-recognition
 
 
7
  - speaker-identification
8
- language:
9
- - en
10
- base_model:
11
- - facebook/wav2vec2-base
12
  ---
13
 
14
- Multitask Speech Model with Wav2Vec2
15
-
16
- This repository contains a multitask learning pipeline built on top of Wav2Vec2,
17
- designed to jointly perform:
18
-
19
- Automatic Speech Recognition (ASR) (character-level CTC loss)
20
-
21
- Speaker Identification
22
-
23
- Emotion Recognition
24
-
25
- The system is trained on a combined training dataset with parallel annotations: speech transcriptions, speaker identification labels, and emotion recognition labels.
26
-
27
- 📌 Features
28
-
29
- Multitask model (Wav2Vec2MultiTasks) with shared Wav2Vec2 encoder and separate heads for:
30
-
31
- Speech Recognition (CTC)
32
-
33
- Speaker classification
34
-
35
- Emotion classification
36
-
37
- Custom data preprocessing:
38
-
39
- Cleans transcripts (removes punctuation & special characters)
40
-
41
- Converts numbers into words
42
-
43
- Builds a vocabulary and tokenizer
44
-
45
- Filters short/invalid audio
46
-
47
- Training, validation, and test splits with collators for CTC.
48
-
49
- Evaluation metrics:
50
-
51
- Character Error Rate (CER) for character recognition
52
-
53
- Accuracy for speaker and emotion classification
 
4
  tags:
5
  - automatic-speech-recognition
6
  - emotion-recognition
7
+ - model_hub_mixin
8
+ - pytorch_model_hub_mixin
9
  - speaker-identification
 
 
 
 
10
  ---
11
 
12
+ This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
13
+ - Code: https://huggingface.co/asadullah797/ssl-semi-multitask
14
+ - Paper: [More Information Needed]
15
+ - Docs: https://github.com/asadullah797/ssl_semi-multitask/blob/main/README.md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json CHANGED
@@ -1,101 +1,6 @@
1
  {
2
- "_name_or_path": "facebook/wav2vec2-base",
3
- "activation_dropout": 0.1,
4
- "adapter_kernel_size": 3,
5
- "adapter_stride": 2,
6
- "add_adapter": false,
7
- "apply_spec_augment": true,
8
- "architectures": [
9
- "Wav2Vec2ForMultiTask"
10
- ],
11
- "attention_dropout": 0.1,
12
- "bos_token_id": 1,
13
- "classifier_proj_size": 256,
14
- "conv_bias": false,
15
- "conv_dim": [
16
- 512,
17
- 512,
18
- 512,
19
- 512,
20
- 512,
21
- 512,
22
- 512
23
- ],
24
- "conv_kernel": [
25
- 10,
26
- 3,
27
- 3,
28
- 3,
29
- 3,
30
- 2,
31
- 2
32
- ],
33
- "conv_stride": [
34
- 5,
35
- 2,
36
- 2,
37
- 2,
38
- 2,
39
- 2,
40
- 2
41
- ],
42
- "ctc_loss_reduction": "mean",
43
- "ctc_zero_infinity": false,
44
- "d_model": 768,
45
- "decoder": {
46
- "add_cross_attention": false,
47
- "activation_dropout": 0.0,
48
- "activation_function": "gelu",
49
- "attention_dropout": 0.0,
50
- "bos_token_id": 1,
51
- "decoder_start_token_id": 2,
52
- "dropout": 0.1,
53
- "eos_token_id": 2,
54
- "hidden_size": 768,
55
- "initializer_range": 0.02,
56
- "layer_norm_eps": 1e-05,
57
- "max_position_embeddings": 512,
58
- "num_attention_heads": 12,
59
- "num_hidden_layers": 6,
60
- "pad_token_id": 0,
61
- "vocab_size": 32
62
- },
63
- "eos_token_id": 2,
64
- "feat_extract_activation": "gelu",
65
- "feat_extract_norm": "group",
66
- "feat_proj_dropout": 0.0,
67
- "feat_quantizer_dropout": 0.0,
68
- "final_dropout": 0.1,
69
- "gradient_checkpointing": true,
70
- "hidden_act": "gelu",
71
- "hidden_dropout": 0.1,
72
- "hidden_size": 768,
73
- "initializer_range": 0.02,
74
- "intermediate_size": 3072,
75
- "layer_norm_eps": 1e-05,
76
- "layerdrop": 0.1,
77
- "mask_channel_length": 10,
78
- "mask_channel_min_space": 1,
79
- "mask_channel_other": 0.0,
80
- "mask_channel_prob": 0.0,
81
- "mask_channel_selection": "static",
82
- "mask_feature_length": 10,
83
- "mask_feature_min_space": 1,
84
- "mask_feature_prob": 0.0,
85
- "mask_time_length": 10,
86
- "mask_time_min_space": 1,
87
- "mask_time_other": 0.0,
88
- "mask_time_prob": 0.05,
89
- "mask_time_selection": "static",
90
- "model_type": "wav2vec2",
91
- "num_attention_heads": 12,
92
  "num_emotions": 14,
93
- "num_hidden_layers": 12,
94
  "num_phonemes": 33,
95
- "num_speakers": 373,
96
- "pad_token_id": 0,
97
- "torch_dtype": "float32",
98
- "transformers_version": "4.44.2",
99
- "use_weighted_layer_sum": false,
100
- "vocab_size": 32
101
- }
 
1
  {
2
+ "base_model": "facebook/wav2vec2-base",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "num_emotions": 14,
 
4
  "num_phonemes": 33,
5
+ "num_speakers": 373
6
+ }
 
 
 
 
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0e55768c14c7f08f8271d7e8eae064c585a1659e910182435fb1c516a8a650f
3
  size 378804760
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:964ee7c75c809289269de6079d6557a62244d36358ca42c5f84c68d775b05155
3
  size 378804760