Create hyperparams.yaml
hyperparams.yaml  CHANGED  (+0, -49)
@@ -10,31 +10,6 @@

 save_folder: !ref librispeech-streaming-conformer-transducer

-# Training parameters
-# To make Transformers converge, the global batch size should be large enough.
-# The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor.
-# Empirically, we found that this value should be >= 128.
-# Please set your parameters accordingly.
-number_of_epochs: 50
-warmup_steps: 25000
-num_workers: 4
-batch_size_valid: 4
-lr: 0.0008
-weight_decay: 0.01
-number_of_ctc_epochs: 40
-ctc_weight: 0.3 # Multitask with CTC for the encoder (0.0 = disabled)
-ce_weight: 0.0 # Multitask with CE for the decoder (0.0 = disabled)
-max_grad_norm: 5.0
-loss_reduction: 'batchmean'
-precision: fp32 # bf16, fp16 or fp32
-
-# The batch size is used if and only if dynamic batching is set to False.
-# Validation and testing are done with fixed batches and not dynamic batching.
-batch_size: 8
-grad_accumulation_factor: 4
-sorting: ascending
-avg_checkpoints: 10 # Number of checkpoints to average for evaluation
-
 # Feature parameters
 sample_rate: 16000
 n_fft: 512
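The removed comments above spell out how the effective batch size is obtained: batch_size * n_gpus * grad_accumulation_factor, which should reach at least 128. A quick sanity check in Python with the values being removed here (n_gpus is an assumption, since the YAML does not set a GPU count):

    batch_size = 8                # from the removed YAML
    grad_accumulation_factor = 4  # from the removed YAML
    n_gpus = 4                    # hypothetical DDP world size, not part of the YAML

    global_batch_size = batch_size * n_gpus * grad_accumulation_factor
    print(global_batch_size)      # 128 -> meets the ">= 128" guideline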
@@ -44,22 +19,6 @@ win_length: 32
 # Streaming
 streaming: True # controls all Dynamic Chunk Training & chunk size & left context mechanisms

-# This setup works well for a 3090 24GB GPU; adapt it to your needs.
-# Adjust grad_accumulation_factor depending on the DDP node count (here 3),
-# or turn it off (but training speed will decrease).
-dynamic_batching: True
-max_batch_len: 250
-max_batch_len_val: 50 # we reduce it as the beam is much wider (VRAM)
-num_bucket: 200
-
-dynamic_batch_sampler:
-    max_batch_len: !ref <max_batch_len>
-    max_batch_len_val: !ref <max_batch_len_val>
-    num_buckets: !ref <num_bucket>
-    shuffle_ex: True # if true, batches are re-created at each epoch, shuffling examples
-    batch_ordering: random
-    max_batch_ex: 256
-
 # Model parameters
 # Transformer
 d_model: 512
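The removed block above configures duration-based dynamic batching: utterances are sorted into num_bucket length buckets, and each training batch is filled until it reaches max_batch_len (treated here as total seconds of audio per batch, which is how these recipes typically measure it), with max_batch_ex as a hard cap on the number of examples. A simplified, hedged sketch of the idea follows; it is not SpeechBrain's DynamicBatchSampler, only an illustration of what the knobs control:

    import random
    from collections import defaultdict

    def make_dynamic_batches(durations, max_batch_len=250, num_buckets=200, max_batch_ex=256):
        """Toy duration-based dynamic batching.
        durations: dict mapping utterance id -> duration in seconds."""
        bucket_width = max(durations.values()) / num_buckets
        buckets = defaultdict(list)
        for utt_id, dur in durations.items():
            buckets[int(dur // bucket_width)].append(utt_id)  # group similar lengths

        batches = []
        for bucket in buckets.values():
            batch, batch_len = [], 0.0
            for utt_id in bucket:
                dur = durations[utt_id]
                if batch and (batch_len + dur > max_batch_len or len(batch) >= max_batch_ex):
                    batches.append(batch)
                    batch, batch_len = [], 0.0
                batch.append(utt_id)
                batch_len += dur
            if batch:
                batches.append(batch)

        random.shuffle(batches)  # mirrors batch_ordering: random
        return batches

Bucketing keeps utterances of similar duration together (less padding waste), while the 250-second cap bounds peak memory, which is why the removed comment ties these values to a 24GB GPU.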
@@ -88,9 +47,6 @@ state_beam: 2.3
 expand_beam: 2.3
 lm_weight: 0.50

-epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
-    limit: !ref <number_of_epochs>
-
 normalize: !new:speechbrain.processing.features.InputNormalization
     norm_type: global
     update_until_epoch: 4
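The removed epoch_counter entry instantiates speechbrain.utils.epoch_loop.EpochCounter, which the training script iterates to drive the epoch loop and which is stored in checkpoints so interrupted runs resume at the right epoch. A minimal sketch, assuming that class and its limit argument:

    from speechbrain.utils.epoch_loop import EpochCounter

    # Equivalent of the removed YAML block: yields epoch indices up to `limit`.
    epoch_counter = EpochCounter(limit=50)  # limit: !ref <number_of_epochs>

    for epoch in epoch_counter:
        # one training epoch + validation + checkpoint saving would go here
        pass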
@@ -146,11 +102,6 @@ proj_dec: !new:speechbrain.nnet.linear.Linear
     n_neurons: !ref <joint_dim>
     bias: False

-# Uncomment for MTL with CTC
-ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
-    blank_index: !ref <blank_index>
-    reduction: !ref <loss_reduction>
-
 emb: !new:speechbrain.nnet.embedding.Embedding
     num_embeddings: !ref <output_neurons>
     consider_as_one_hot: True
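The ctc_cost entry removed here works together with ctc_weight, ce_weight, and number_of_ctc_epochs from the first hunk: during the first number_of_ctc_epochs epochs, an auxiliary CTC loss on the encoder (and optionally a CE loss on the decoder) is blended with the transducer loss. A hedged sketch of one common way to combine them; the scalar values are placeholders and this is not necessarily the recipe's exact formula:

    # Placeholder scalars stand in for loss tensors computed in the training step.
    transducer_loss, ctc_loss, ce_loss = 1.2, 2.5, 0.0  # hypothetical values
    epoch = 10

    ctc_weight = 0.3             # from the YAML (0.0 disables the CTC branch)
    ce_weight = 0.0              # from the YAML (0.0 disables the CE branch)
    number_of_ctc_epochs = 40

    if epoch <= number_of_ctc_epochs and (ctc_weight > 0.0 or ce_weight > 0.0):
        loss = (ctc_weight * ctc_loss
                + ce_weight * ce_loss
                + (1.0 - ctc_weight - ce_weight) * transducer_loss)
    else:
        loss = transducer_loss

    print(loss)  # 0.3 * 2.5 + 0.7 * 1.2 = 1.59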