ESPnet
multilingual
audio
universa
ftshijt committed on
Commit
21a6355
·
1 Parent(s): 7ce6253

Update model

Browse files
README.md ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - universa
6
+ language: multilingual
7
+ datasets:
8
+ - universa_unite
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 universa model
13
+
14
+ ### `espnet/arecho_base_v0`
15
+
16
+ This model was trained by ftshijt using the universa_unite recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 69996dc206e556ec48db77b6cc385ff1d32895b3
26
+ pip install -e .
27
+ cd egs2/universa_unite/uni_versa1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/arecho_base_v0
29
+ ```
30
+
31
+
32
+
33
+ ## universa config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ accum_grad: 2
39
+ adapter: lora
40
+ adapter_conf: {}
41
+ allow_multi_rates: false
42
+ allow_variable_data_keys: false
43
+ batch_bins: 1000000
44
+ batch_size: 16
45
+ batch_type: sorted
46
+ best_model_criterion:
47
+ - - train
48
+ - loss
49
+ - min
50
+ - - valid
51
+ - loss
52
+ - min
53
+ - - train
54
+ - acc
55
+ - max
56
+ - - valid
57
+ - acc
58
+ - max
59
+ bpemodel: null
60
+ category_sample_size: 10
61
+ chunk_default_fs: null
62
+ chunk_discard_short_samples: true
63
+ chunk_excluded_key_prefixes: []
64
+ chunk_length: 500
65
+ chunk_max_abs_length: null
66
+ chunk_shift_ratio: 0.5
67
+ cleaner: null
68
+ collect_stats: false
69
+ config: conf/train_aruniversa_wavlm.yaml
70
+ create_graph_in_tensorboard: false
71
+ cudnn_benchmark: false
72
+ cudnn_deterministic: false
73
+ cudnn_enabled: true
74
+ ddp_comm_hook: null
75
+ deepspeed_config: null
76
+ detect_anomaly: false
77
+ dist_backend: nccl
78
+ dist_init_method: env://
79
+ dist_launcher: null
80
+ dist_master_addr: null
81
+ dist_master_port: null
82
+ dist_rank: null
83
+ dist_world_size: null
84
+ distributed: false
85
+ drop_last_iter: false
86
+ dry_run: false
87
+ early_stopping_criterion:
88
+ - valid
89
+ - loss
90
+ - min
91
+ exclude_weight_decay: false
92
+ exclude_weight_decay_conf: {}
93
+ fold_length:
94
+ - 256000
95
+ freeze_param:
96
+ - frontend.upstream
97
+ frontend: s3prl
98
+ frontend_conf:
99
+ download_dir: ./hub
100
+ frontend_conf:
101
+ upstream: wavlm_large
102
+ multilayer_feature: true
103
+ g2p: null
104
+ grad_clip: -1
105
+ grad_clip_type: 2.0
106
+ grad_noise: false
107
+ gradient_as_bucket_view: true
108
+ ignore_init_mismatch: false
109
+ init: null
110
+ init_param: []
111
+ iterator_type: sequence
112
+ keep_nbest_models: 1
113
+ local_rank: 0
114
+ log_interval: 50
115
+ log_level: INFO
116
+ max_cache_fd: 32
117
+ max_cache_size: 0.0
118
+ max_epoch: 100
119
+ metric2id: dump/raw/overall_base/metric2id
120
+ metric2type: dump/raw/overall_base/metric2type
121
+ metric_pad_value: -100
122
+ metric_token_info: data/token_list/metric_500_percentile_overall_base_w-numerical/tokens.json
123
+ metric_token_pad_value: 0
124
+ model_conf: {}
125
+ multi_task_dataset: false
126
+ multiple_iterator: false
127
+ multiprocessing_distributed: false
128
+ nbest_averaging_interval: 0
129
+ ngpu: 1
130
+ no_forward_run: false
131
+ non_linguistic_symbols: null
132
+ num_att_plot: 0
133
+ num_cache_chunks: 1024
134
+ num_iters_per_epoch: null
135
+ num_workers: 1
136
+ optim: adamw
137
+ optim_conf:
138
+ lr: 0.001
139
+ output_dir: exp/universa_universa_ar_overall_base_token_wavlm
140
+ patience: null
141
+ pretrain_path: null
142
+ print_config: false
143
+ randomize_sequential_metric: true
144
+ required:
145
+ - output_dir
146
+ - metric2id
147
+ resume: true
148
+ save_strategy: all
149
+ scheduler: warmuplr
150
+ scheduler_conf:
151
+ warmup_steps: 25000
152
+ seed: 777
153
+ sequential_metric: true
154
+ sharded_ddp: false
155
+ shuffle_within_batch: false
156
+ sort_batch: descending
157
+ sort_in_batch: descending
158
+ token_list: null
159
+ token_type: bpe
160
+ tokenize_numerical_metric: true
161
+ train_data_path_and_name_and_type:
162
+ - - dump/raw/overall_base/wav.scp
163
+ - audio
164
+ - kaldi_ark
165
+ - - dump/raw/overall_base/metric.scp
166
+ - metrics
167
+ - metric
168
+ - - dump/raw/overall_base/ref_wav.scp
169
+ - ref_audio
170
+ - kaldi_ark
171
+ train_dtype: float32
172
+ train_shape_file:
173
+ - exp/universa_stats_overall_base/train/audio_shape
174
+ - exp/universa_stats_overall_base/train/ref_audio_shape
175
+ universa: ar_universa
176
+ universa_conf:
177
+ audio_encoder_params:
178
+ attention_dropout_rate: 0.1
179
+ attention_heads: 4
180
+ concat_after: false
181
+ dropout_rate: 0.1
182
+ input_layer: conv2d
183
+ layer_drop_rate: 0.1
184
+ linear_units: 1024
185
+ normalize_before: true
186
+ num_blocks: 4
187
+ positional_dropout_rate: 0.1
188
+ positionwise_conv_kernel_size: 1
189
+ positionwise_layer_type: linear
190
+ qk_norm: false
191
+ use_flash_attn: false
192
+ audio_encoder_type: transformer
193
+ cross_attention_params:
194
+ dropout_rate: 0.1
195
+ n_head: 2
196
+ cross_attention_type: multihead
197
+ embedding_dim: 256
198
+ lsm_weight: 0.1
199
+ metric_decoder_params:
200
+ attention_heads: 4
201
+ concat_after: false
202
+ dropout_rate: 0.1
203
+ input_layer: embed
204
+ layer_drop_rate: 0.1
205
+ linear_units: 1024
206
+ normalize_before: true
207
+ num_blocks: 4
208
+ positional_dropout_rate: 0.1
209
+ qk_norm: false
210
+ self_attention_dropout_rate: 0.1
211
+ src_attention_dropout_rate: 0.1
212
+ use_flash_attn: false
213
+ sym_eos: <eos>
214
+ sym_sos: <sos>
215
+ use_rope: true
216
+ unused_parameters: false
217
+ use_adapter: false
218
+ use_amp: false
219
+ use_deepspeed: false
220
+ use_matplotlib: true
221
+ use_preprocessor: true
222
+ use_ref_audio: true
223
+ use_ref_text: false
224
+ use_tensorboard: true
225
+ use_tf32: false
226
+ use_wandb: false
227
+ val_scheduler_criterion:
228
+ - valid
229
+ - loss
230
+ valid_batch_bins: null
231
+ valid_batch_size: null
232
+ valid_batch_type: null
233
+ valid_data_path_and_name_and_type:
234
+ - - dump/raw/overall_dev/wav.scp
235
+ - audio
236
+ - kaldi_ark
237
+ - - dump/raw/overall_dev/metric.scp
238
+ - metrics
239
+ - metric
240
+ - - dump/raw/overall_dev/ref_wav.scp
241
+ - ref_audio
242
+ - kaldi_ark
243
+ valid_iterator_type: null
244
+ valid_max_cache_size: null
245
+ valid_shape_file:
246
+ - exp/universa_stats_overall_base/valid/audio_shape
247
+ - exp/universa_stats_overall_base/valid/ref_audio_shape
248
+ version: '202503'
249
+ wandb_entity: null
250
+ wandb_id: null
251
+ wandb_model_log_interval: -1
252
+ wandb_name: null
253
+ wandb_project: null
254
+ write_collected_feats: false
255
+ ```
256
+
257
+ </details>
258
+
259
+
260
+
261
+ ### Citing ESPnet
262
+
263
+ ```bibtex
264
+ @inproceedings{watanabe2018espnet,
265
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
266
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
267
+ year={2018},
268
+ booktitle={Proceedings of Interspeech},
269
+ pages={2207--2211},
270
+ doi={10.21437/Interspeech.2018-1456},
271
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
272
+ }
273
+
274
+
275
+
276
+
277
+
278
+
279
+ ```
280
+
281
+ or arXiv:
282
+
283
+ ```bibtex
284
+ @misc{watanabe2018espnet,
285
+ title={ESPnet: End-to-End Speech Processing Toolkit},
286
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
287
+ year={2018},
288
+ eprint={1804.00015},
289
+ archivePrefix={arXiv},
290
+ primaryClass={cs.CL}
291
+ }
292
+ ```
dump/raw/overall_base/metric2id ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ srmr
2
+ language
3
+ nisqa_mos_pred
4
+ nisqa_noi_pred
5
+ nisqa_dis_pred
6
+ nisqa_col_pred
7
+ nisqa_loud_pred
8
+ sheet_ssqa
9
+ utmos
10
+ utmosv2
11
+ dns_overall
12
+ dns_p808
13
+ plcmos
14
+ singmos
15
+ scoreq_nr
16
+ se_sdr
17
+ se_sar
18
+ se_si_snr
19
+ se_ci_sdr
20
+ pam_score
21
+ speaking_rate
22
+ audiobox_aesthetics_CE
23
+ audiobox_aesthetics_CU
24
+ audiobox_aesthetics_PC
25
+ audiobox_aesthetics_PQ
26
+ asvspoof_score
27
+ real_language
28
+ qwen_speaker_count
29
+ qwen_speaker_gender
30
+ qwen_speaker_age
31
+ qwen_speech_impairment
32
+ qwen_voice_pitch
33
+ qwen_pitch_range
34
+ qwen_voice_type
35
+ qwen_speech_volume_level
36
+ qwen_language
37
+ qwen_speech_register
38
+ qwen_vocabulary_complexity
39
+ qwen_speech_purpose
40
+ qwen_speech_emotion
41
+ qwen_speech_clarity
42
+ qwen_speech_rate
43
+ qwen_speaking_style
44
+ qwen_laughter_crying
45
+ qwen_speech_background_environment
46
+ qwen_recording_quality
47
+ qwen_channel_type
48
+ snr_simulation
49
+ rir_room_size
50
+ nomad
51
+ emotion_similarity
52
+ noresqa_score
53
+ speech_bert
54
+ speech_bleu
55
+ speech_token_distance
56
+ scoreq_ref
57
+ asr_match_error_rate
58
+ ref_text_length
59
+ pred_text_length
60
+ spk_similarity
61
+ rt60
62
+ visqol
63
+ pysepm_fwsegsnr
64
+ pysepm_llr
65
+ pysepm_wss
66
+ pysepm_cd
67
+ pysepm_c_sig
68
+ pysepm_c_bak
69
+ pysepm_c_ovl
70
+ pysepm_csii_high
71
+ pysepm_csii_mid
72
+ pysepm_csii_low
73
+ pysepm_ncm
74
+ mcd
75
+ f0rmse
76
+ f0corr
77
+ pesq
78
+ stoi
79
+ sdr
80
+ sar
81
+ si_snr
82
+ ci_sdr
83
+ nisqa_real_mos
84
+ wer
85
+ cer
86
+ urgent_mos
87
+ voicemos_real_mos
dump/raw/overall_base/metric2type ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ srmr numerical
2
+ language categorical
3
+ nisqa_mos_pred numerical
4
+ nisqa_noi_pred numerical
5
+ nisqa_dis_pred numerical
6
+ nisqa_col_pred numerical
7
+ nisqa_loud_pred numerical
8
+ sheet_ssqa numerical
9
+ utmos numerical
10
+ utmosv2 numerical
11
+ dns_overall numerical
12
+ dns_p808 numerical
13
+ plcmos numerical
14
+ singmos numerical
15
+ scoreq_nr numerical
16
+ se_sdr numerical
17
+ se_sar numerical
18
+ se_si_snr numerical
19
+ se_ci_sdr numerical
20
+ pam_score numerical
21
+ speaking_rate numerical
22
+ audiobox_aesthetics_CE numerical
23
+ audiobox_aesthetics_CU numerical
24
+ audiobox_aesthetics_PC numerical
25
+ audiobox_aesthetics_PQ numerical
26
+ asvspoof_score numerical
27
+ real_language categorical
28
+ qwen_speaker_count numerical
29
+ qwen_speaker_gender categorical
30
+ qwen_speaker_age categorical
31
+ qwen_speech_impairment categorical
32
+ qwen_voice_pitch categorical
33
+ qwen_pitch_range categorical
34
+ qwen_voice_type categorical
35
+ qwen_speech_volume_level categorical
36
+ qwen_language categorical
37
+ qwen_speech_register categorical
38
+ qwen_vocabulary_complexity categorical
39
+ qwen_speech_purpose categorical
40
+ qwen_speech_emotion categorical
41
+ qwen_speech_clarity categorical
42
+ qwen_speech_rate categorical
43
+ qwen_speaking_style categorical
44
+ qwen_laughter_crying categorical
45
+ qwen_speech_background_environment categorical
46
+ qwen_recording_quality categorical
47
+ qwen_channel_type categorical
48
+ snr_simulation numerical
49
+ rir_room_size categorical
50
+ nomad numerical
51
+ emotion_similarity numerical
52
+ noresqa_score numerical
53
+ speech_bert numerical
54
+ speech_bleu numerical
55
+ speech_token_distance numerical
56
+ scoreq_ref numerical
57
+ asr_match_error_rate numerical
58
+ ref_text_length numerical
59
+ pred_text_length numerical
60
+ spk_similarity numerical
61
+ rt60 numerical
62
+ visqol numerical
63
+ pysepm_fwsegsnr numerical
64
+ pysepm_llr numerical
65
+ pysepm_wss numerical
66
+ pysepm_cd numerical
67
+ pysepm_c_sig numerical
68
+ pysepm_c_bak numerical
69
+ pysepm_c_ovl numerical
70
+ pysepm_csii_high numerical
71
+ pysepm_csii_mid numerical
72
+ pysepm_csii_low numerical
73
+ pysepm_ncm numerical
74
+ mcd numerical
75
+ f0rmse numerical
76
+ f0corr numerical
77
+ pesq numerical
78
+ stoi numerical
79
+ sdr numerical
80
+ sar numerical
81
+ si_snr numerical
82
+ ci_sdr numerical
83
+ nisqa_real_mos numerical
84
+ wer numerical
85
+ cer numerical
86
+ urgent_mos numerical
87
+ voicemos_real_mos numerical
exp/universa_universa_ar_overall_base_token_wavlm/68epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f41bc8ff2bac76b3631f39f554b7662e881c7c8bf05d08d3d484f59760f9b81
3
+ size 2325170198
exp/universa_universa_ar_overall_base_token_wavlm/config.yaml ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_grad: 2
2
+ adapter: lora
3
+ adapter_conf: {}
4
+ allow_multi_rates: false
5
+ allow_variable_data_keys: false
6
+ batch_bins: 1000000
7
+ batch_size: 16
8
+ batch_type: sorted
9
+ best_model_criterion:
10
+ - - train
11
+ - loss
12
+ - min
13
+ - - valid
14
+ - loss
15
+ - min
16
+ - - train
17
+ - acc
18
+ - max
19
+ - - valid
20
+ - acc
21
+ - max
22
+ bpemodel: null
23
+ category_sample_size: 10
24
+ chunk_default_fs: null
25
+ chunk_discard_short_samples: true
26
+ chunk_excluded_key_prefixes: []
27
+ chunk_length: 500
28
+ chunk_max_abs_length: null
29
+ chunk_shift_ratio: 0.5
30
+ cleaner: null
31
+ collect_stats: false
32
+ config: conf/train_aruniversa_wavlm.yaml
33
+ create_graph_in_tensorboard: false
34
+ cudnn_benchmark: false
35
+ cudnn_deterministic: false
36
+ cudnn_enabled: true
37
+ ddp_comm_hook: null
38
+ deepspeed_config: null
39
+ detect_anomaly: false
40
+ dist_backend: nccl
41
+ dist_init_method: env://
42
+ dist_launcher: null
43
+ dist_master_addr: null
44
+ dist_master_port: null
45
+ dist_rank: null
46
+ dist_world_size: null
47
+ distributed: false
48
+ drop_last_iter: false
49
+ dry_run: false
50
+ early_stopping_criterion:
51
+ - valid
52
+ - loss
53
+ - min
54
+ exclude_weight_decay: false
55
+ exclude_weight_decay_conf: {}
56
+ fold_length:
57
+ - 256000
58
+ freeze_param:
59
+ - frontend.upstream
60
+ frontend: s3prl
61
+ frontend_conf:
62
+ download_dir: ./hub
63
+ frontend_conf:
64
+ upstream: wavlm_large
65
+ multilayer_feature: true
66
+ g2p: null
67
+ grad_clip: -1
68
+ grad_clip_type: 2.0
69
+ grad_noise: false
70
+ gradient_as_bucket_view: true
71
+ ignore_init_mismatch: false
72
+ init: null
73
+ init_param: []
74
+ iterator_type: sequence
75
+ keep_nbest_models: 1
76
+ local_rank: 0
77
+ log_interval: 50
78
+ log_level: INFO
79
+ max_cache_fd: 32
80
+ max_cache_size: 0.0
81
+ max_epoch: 100
82
+ metric2id: dump/raw/overall_base/metric2id
83
+ metric2type: dump/raw/overall_base/metric2type
84
+ metric_pad_value: -100
85
+ metric_token_info: data/token_list/metric_500_percentile_overall_base_w-numerical/tokens.json
86
+ metric_token_pad_value: 0
87
+ model_conf: {}
88
+ multi_task_dataset: false
89
+ multiple_iterator: false
90
+ multiprocessing_distributed: false
91
+ nbest_averaging_interval: 0
92
+ ngpu: 1
93
+ no_forward_run: false
94
+ non_linguistic_symbols: null
95
+ num_att_plot: 0
96
+ num_cache_chunks: 1024
97
+ num_iters_per_epoch: null
98
+ num_workers: 1
99
+ optim: adamw
100
+ optim_conf:
101
+ lr: 0.001
102
+ output_dir: exp/universa_universa_ar_overall_base_token_wavlm
103
+ patience: null
104
+ pretrain_path: null
105
+ print_config: false
106
+ randomize_sequential_metric: true
107
+ required:
108
+ - output_dir
109
+ - metric2id
110
+ resume: true
111
+ save_strategy: all
112
+ scheduler: warmuplr
113
+ scheduler_conf:
114
+ warmup_steps: 25000
115
+ seed: 777
116
+ sequential_metric: true
117
+ sharded_ddp: false
118
+ shuffle_within_batch: false
119
+ sort_batch: descending
120
+ sort_in_batch: descending
121
+ token_list: null
122
+ token_type: bpe
123
+ tokenize_numerical_metric: true
124
+ train_data_path_and_name_and_type:
125
+ - - dump/raw/overall_base/wav.scp
126
+ - audio
127
+ - kaldi_ark
128
+ - - dump/raw/overall_base/metric.scp
129
+ - metrics
130
+ - metric
131
+ - - dump/raw/overall_base/ref_wav.scp
132
+ - ref_audio
133
+ - kaldi_ark
134
+ train_dtype: float32
135
+ train_shape_file:
136
+ - exp/universa_stats_overall_base/train/audio_shape
137
+ - exp/universa_stats_overall_base/train/ref_audio_shape
138
+ universa: ar_universa
139
+ universa_conf:
140
+ audio_encoder_params:
141
+ attention_dropout_rate: 0.1
142
+ attention_heads: 4
143
+ concat_after: false
144
+ dropout_rate: 0.1
145
+ input_layer: conv2d
146
+ layer_drop_rate: 0.1
147
+ linear_units: 1024
148
+ normalize_before: true
149
+ num_blocks: 4
150
+ positional_dropout_rate: 0.1
151
+ positionwise_conv_kernel_size: 1
152
+ positionwise_layer_type: linear
153
+ qk_norm: false
154
+ use_flash_attn: false
155
+ audio_encoder_type: transformer
156
+ cross_attention_params:
157
+ dropout_rate: 0.1
158
+ n_head: 2
159
+ cross_attention_type: multihead
160
+ embedding_dim: 256
161
+ lsm_weight: 0.1
162
+ metric_decoder_params:
163
+ attention_heads: 4
164
+ concat_after: false
165
+ dropout_rate: 0.1
166
+ input_layer: embed
167
+ layer_drop_rate: 0.1
168
+ linear_units: 1024
169
+ normalize_before: true
170
+ num_blocks: 4
171
+ positional_dropout_rate: 0.1
172
+ qk_norm: false
173
+ self_attention_dropout_rate: 0.1
174
+ src_attention_dropout_rate: 0.1
175
+ use_flash_attn: false
176
+ sym_eos: <eos>
177
+ sym_sos: <sos>
178
+ use_rope: true
179
+ unused_parameters: false
180
+ use_adapter: false
181
+ use_amp: false
182
+ use_deepspeed: false
183
+ use_matplotlib: true
184
+ use_preprocessor: true
185
+ use_ref_audio: true
186
+ use_ref_text: false
187
+ use_tensorboard: true
188
+ use_tf32: false
189
+ use_wandb: false
190
+ val_scheduler_criterion:
191
+ - valid
192
+ - loss
193
+ valid_batch_bins: null
194
+ valid_batch_size: null
195
+ valid_batch_type: null
196
+ valid_data_path_and_name_and_type:
197
+ - - dump/raw/overall_dev/wav.scp
198
+ - audio
199
+ - kaldi_ark
200
+ - - dump/raw/overall_dev/metric.scp
201
+ - metrics
202
+ - metric
203
+ - - dump/raw/overall_dev/ref_wav.scp
204
+ - ref_audio
205
+ - kaldi_ark
206
+ valid_iterator_type: null
207
+ valid_max_cache_size: null
208
+ valid_shape_file:
209
+ - exp/universa_stats_overall_base/valid/audio_shape
210
+ - exp/universa_stats_overall_base/valid/ref_audio_shape
211
+ version: '202503'
212
+ wandb_entity: null
213
+ wandb_id: null
214
+ wandb_model_log_interval: -1
215
+ wandb_name: null
216
+ wandb_project: null
217
+ write_collected_feats: false
exp/universa_universa_ar_overall_base_token_wavlm/images/acc_ar_decoder.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/backward_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/clip.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/forward_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/gpu_max_cached_mem_GB.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/grad_norm.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/iter_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/loss.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/loss_ar_decoder.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/loss_scale.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/optim0_lr0.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/optim_step_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/train_time.png ADDED
exp/universa_universa_ar_overall_base_token_wavlm/images/value_ar_decoder.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202503'
2
+ files:
3
+ model_file: exp/universa_universa_ar_overall_base_token_wavlm/68epoch.pth
4
+ python: "3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) \n[GCC 12.3.0]"
5
+ timestamp: 1749800767.145012
6
+ torch: 2.6.0.dev20241210+cu124
7
+ yaml_files:
8
+ train_config: exp/universa_universa_ar_overall_base_token_wavlm/config.yaml