asadullah797
/

ssl-semi-multitask

@@ -4,50 +4,12 @@ pipeline_tag: audio-classification
 tags:
 - automatic-speech-recognition
 - emotion-recognition
 - speaker-identification
-language:
-- en
-base_model:
-- facebook/wav2vec2-base
 ---
-Multitask Speech Model with Wav2Vec2
-This repository contains a multitask learning pipeline built on top of Wav2Vec2, designed to jointly perform:
-Automatic Speech Recognition (ASR) (character-level CTC loss)
-Speaker Identification
-Emotion Recognition
-The system is trained on a combined training dataset with parallel speech transcriptions, speaker-identification labels, and emotion-recognition labels.
-📌 Features
-Multitask model (Wav2Vec2MultiTasks) with shared Wav2Vec2 encoder and separate heads for:
-Speech Recognition (CTC)
-Speaker classification
-Emotion classification
-Custom data preprocessing:
-Cleans transcripts (removes punctuation & special characters)
-Converts numbers into words
-Builds a vocabulary and tokenizer
-Filters short/invalid audio
-Training, validation, and test splits with collators for CTC.
-Evaluation metrics:
-Character Error Rate (CER) for character recognition
-Accuracy for speaker and emotion classification

 tags:
 - automatic-speech-recognition
 - emotion-recognition
+- model_hub_mixin
+- pytorch_model_hub_mixin
 - speaker-identification
 ---
+This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+- Code: https://huggingface.co/asadullah797/ssl-semi-multitask
+- Paper: [More Information Needed]
+- Docs: https://github.com/asadullah797/ssl_semi-multitask/blob/main/README.md

config.json CHANGED Viewed

@@ -1,101 +1,6 @@
 {
-  "_name_or_path": "facebook/wav2vec2-base",
-  "activation_dropout": 0.1,
-  "adapter_kernel_size": 3,
-  "adapter_stride": 2,
-  "add_adapter": false,
-  "apply_spec_augment": true,
-  "architectures": [
-    "Wav2Vec2ForMultiTask"
-  ],
-  "attention_dropout": 0.1,
-  "bos_token_id": 1,
-  "classifier_proj_size": 256,
-  "conv_bias": false,
-  "conv_dim": [
-    512,
-    512,
-    512,
-    512,
-    512,
-    512,
-    512
-  ],
-  "conv_kernel": [
-    10,
-    3,
-    3,
-    3,
-    3,
-    2,
-    2
-  ],
-  "conv_stride": [
-    5,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2
-  ],
-  "ctc_loss_reduction": "mean",
-  "ctc_zero_infinity": false,
-  "d_model": 768,
-  "decoder": {
-    "add_cross_attention": false,
-    "activation_dropout": 0.0,
-    "activation_function": "gelu",
-    "attention_dropout": 0.0,
-    "bos_token_id": 1,
-    "decoder_start_token_id": 2,
-    "dropout": 0.1,
-    "eos_token_id": 2,
-    "hidden_size": 768,
-    "initializer_range": 0.02,
-    "layer_norm_eps": 1e-05,
-    "max_position_embeddings": 512,
-    "num_attention_heads": 12,
-    "num_hidden_layers": 6,
-    "pad_token_id": 0,
-    "vocab_size": 32
-  },
-  "eos_token_id": 2,
-  "feat_extract_activation": "gelu",
-  "feat_extract_norm": "group",
-  "feat_proj_dropout": 0.0,
-  "feat_quantizer_dropout": 0.0,
-  "final_dropout": 0.1,
-  "gradient_checkpointing": true,
-  "hidden_act": "gelu",
-  "hidden_dropout": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "layerdrop": 0.1,
-  "mask_channel_length": 10,
-  "mask_channel_min_space": 1,
-  "mask_channel_other": 0.0,
-  "mask_channel_prob": 0.0,
-  "mask_channel_selection": "static",
-  "mask_feature_length": 10,
-  "mask_feature_min_space": 1,
-  "mask_feature_prob": 0.0,
-  "mask_time_length": 10,
-  "mask_time_min_space": 1,
-  "mask_time_other": 0.0,
-  "mask_time_prob": 0.05,
-  "mask_time_selection": "static",
-  "model_type": "wav2vec2",
-  "num_attention_heads": 12,
   "num_emotions": 14,
-  "num_hidden_layers": 12,
   "num_phonemes": 33,
-  "num_speakers": 373,
-  "pad_token_id": 0,
-  "torch_dtype": "float32",
-  "transformers_version": "4.44.2",
-  "use_weighted_layer_sum": false,
-  "vocab_size": 32
-}

 {
+  "base_model": "facebook/wav2vec2-base",
   "num_emotions": 14,
   "num_phonemes": 33,
+  "num_speakers": 373
+}

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f0e55768c14c7f08f8271d7e8eae064c585a1659e910182435fb1c516a8a650f
 size 378804760

 version https://git-lfs.github.com/spec/v1
+oid sha256:964ee7c75c809289269de6079d6557a62244d36358ca42c5f84c68d775b05155
 size 378804760