# ############################################################################
# Model: ECAPA-TDNN (big) for speaker verification
# ############################################################################
# Feature parameters
n_mels: 80
# Pretrain folder (HuggingFace)
# pretrained_path: poonehmousavi/discrete_wavlm_spk_rec_ecapatdn
pretrained_path: benchmarks/DASB/VoiceCeleb1/speaker_ver/temp
# Output parameters
out_n_neurons: 1211
save_folder: tmp
### Configuration for discrete SSL model
# ssl_model_type: hubert, wavlm, wav2vec2
# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large
ssl_model_type: wavlm # hubert, wavlm or wav2vec2
ssl_hub: microsoft/wavlm-large
ssl_folder: !ref <save_folder>/ssl_checkpoint
kmeans_repo_id: speechbrain/SSL_Quantization
kmeans_cache_dir: !ref <save_folder>/kmeans_checkpoint
kmeans_dataset: LibriSpeech-100-360-500
freeze_ssl: True
freeze_feature_extractor: True
num_clusters: 1000
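# The k-means quantizers (one per selected SSL layer) are downloaded from
# <kmeans_repo_id>; each was fit on <kmeans_dataset> with <num_clusters> centroids.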
### Config for Tokenizer
# Layer numbers should be among the supported layers for discrete SSL models (a k-means model must be available for each selected layer)
# ssl_layer_num: [3, 7, 12, 23]
# deduplicate: [False, False, False, False]
# bpe_tokenizer_path: [null , null, null, null]
ssl_layer_num: [1, 3, 7, 12, 18, 23]
num_codebooks: 6
deduplicate: [False, False, False, False, False, False]
bpe_tokenizer_path: [null, null, null, null, null, null]
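# Note: ssl_layer_num, deduplicate and bpe_tokenizer_path are per-layer lists and are
# expected to have the same length; num_codebooks should match the number of selected layers.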
sample_rate: 16000
# Feature parameters
encoder_dim: 1024
# Modules
tokenizer_config:
    SSL_layers: !ref <ssl_layer_num>
    deduplicates: !ref <deduplicate>
    bpe_tokenizers: !ref <bpe_tokenizer_path>
ssl_model: !apply:speechbrain.utils.hparams.choice
    value: !ref <ssl_model_type>
    choices:
        wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>
        hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>
        wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>
codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL
    save_path: !ref <kmeans_cache_dir>
    ssl_model: !ref <ssl_model>
    kmeans_dataset: !ref <kmeans_dataset>
    kmeans_repo_id: !ref <kmeans_repo_id>
    num_clusters: !ref <num_clusters>
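# The codec converts the hidden states of the selected SSL layers into discrete
# token indices using the pretrained k-means quantizers.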
discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer
    num_codebooks: !ref <num_codebooks>
    vocab_size: !ref <num_clusters>
    emb_dim: !ref <encoder_dim>
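# Maps each discrete token to a learned <encoder_dim>-dimensional embedding,
# with a separate vocabulary of <num_clusters> entries per codebook.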
attention_mlp: !new:custom_model.AttentionMLP
    input_dim: !ref <encoder_dim>
    hidden_dim: !ref <encoder_dim>
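# Produces attention weights used to merge the per-codebook embeddings into a
# single <encoder_dim>-dimensional feature per frame.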
embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
    input_size: !ref <encoder_dim>
    channels: [1024, 1024, 1024, 1024, 3072]
    kernel_sizes: [5, 3, 3, 3, 1]
    dilations: [1, 2, 3, 4, 1]
    groups: [1, 1, 1, 1, 1]
    attention_channels: 128
    lin_neurons: 192
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
    input_size: 192
    out_neurons: !ref <out_n_neurons>
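# The classifier maps the 192-dimensional ECAPA-TDNN speaker embedding to the
# <out_n_neurons> training speakers (the VoxCeleb1 dev set).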
modules:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    attention_mlp: !ref <attention_mlp>
    codec: !ref <codec>
    discrete_embedding_layer: !ref <discrete_embedding_layer>
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>
        attention_mlp: !ref <attention_mlp>
        discrete_embedding_layer: !ref <discrete_embedding_layer>
        label_encoder: !ref <label_encoder>
    paths:
        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
        classifier: !ref <pretrained_path>/classifier.ckpt
        attention_mlp: !ref <pretrained_path>/attention_mlp.ckpt
        label_encoder: !ref <pretrained_path>/label_encoder.txt
        discrete_embedding_layer: !ref <pretrained_path>/discrete_embedding_layer.ckpt
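# Example usage (a rough sketch; assumes this file is saved as hyperparams.yaml next to
# the checkpoints listed above, and the filename is illustrative):
#
#   from hyperpyyaml import load_hyperpyyaml
#   with open("hyperparams.yaml") as fin:
#       hparams = load_hyperpyyaml(fin)
#   # Fetch the pretrained parameters and load them into the instantiated modules
#   hparams["pretrainer"].collect_files()
#   hparams["pretrainer"].load_collected()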