Washere-1 committed on
Commit aad69d6 · verified · 1 Parent(s): 51c81f3

Upload folder using huggingface_hub

added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "<ctc_blank>": 80,
+ "<mask>": 79
+ }
config.json ADDED
@@ -0,0 +1,91 @@
+ {
+ "activation_dropout": 0.1,
+ "apply_spec_augment": true,
+ "architectures": [
+ "SpeechT5ForTextToSpeech"
+ ],
+ "attention_dropout": 0.1,
+ "bos_token_id": 0,
+ "conv_bias": false,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "decoder_attention_heads": 12,
+ "decoder_ffn_dim": 3072,
+ "decoder_layerdrop": 0.1,
+ "decoder_layers": 6,
+ "decoder_start_token_id": 2,
+ "encoder_attention_heads": 12,
+ "encoder_ffn_dim": 3072,
+ "encoder_layerdrop": 0.1,
+ "encoder_layers": 12,
+ "encoder_max_relative_position": 160,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_norm": "group",
+ "feat_proj_dropout": 0.0,
+ "guided_attention_loss_num_heads": 2,
+ "guided_attention_loss_scale": 10.0,
+ "guided_attention_loss_sigma": 0.4,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "is_encoder_decoder": true,
+ "layer_norm_eps": 1e-05,
+ "mask_feature_length": 10,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_prob": 0.05,
+ "max_length": null,
+ "max_speech_positions": 1876,
+ "max_text_positions": 600,
+ "model_type": "speecht5",
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_mel_bins": 80,
+ "pad_token_id": 1,
+ "positional_dropout": 0.1,
+ "reduction_factor": 2,
+ "scale_embedding": false,
+ "speaker_embedding_dim": 512,
+ "speech_decoder_postnet_dropout": 0.5,
+ "speech_decoder_postnet_kernel": 5,
+ "speech_decoder_postnet_layers": 5,
+ "speech_decoder_postnet_units": 256,
+ "speech_decoder_prenet_dropout": 0.5,
+ "speech_decoder_prenet_layers": 2,
+ "speech_decoder_prenet_units": 256,
+ "torch_dtype": "float32",
+ "transformers_version": "4.53.2",
+ "use_cache": false,
+ "use_guided_attention_loss": true,
+ "vocab_size": 81
+ }
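
The config above describes a `SpeechT5ForTextToSpeech` checkpoint (80 mel bins, 512-dimensional speaker embeddings, saved with transformers 4.53.2). A minimal inference sketch, assuming a placeholder repo id `Washere-1/REPO_NAME`, a caller-supplied x-vector speaker embedding, and the separate `microsoft/speecht5_hifigan` vocoder; none of these are part of this commit:

```python
# Sketch only: load the fine-tuned SpeechT5 TTS checkpoint described by config.json.
# "Washere-1/REPO_NAME" is a placeholder repo id; the HiFi-GAN vocoder is assumed
# and is not included in this commit.
import torch
import soundfile as sf
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

repo_id = "Washere-1/REPO_NAME"  # placeholder
processor = SpeechT5Processor.from_pretrained(repo_id)
model = SpeechT5ForTextToSpeech.from_pretrained(repo_id)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="Hello from the fine-tuned model.", return_tensors="pt")

# config.json sets speaker_embedding_dim = 512; a zero tensor only shows the
# expected shape, a real x-vector is needed for usable audio.
speaker_embeddings = torch.zeros((1, 512))

with torch.no_grad():
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

# preprocessor_config.json sets sampling_rate = 16000.
sf.write("tts_output.wav", speech.numpy(), samplerate=16000)
```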
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 0,
+ "decoder_start_token_id": 2,
+ "eos_token_id": 2,
+ "max_length": 1876,
+ "pad_token_id": 1,
+ "transformers_version": "4.53.2"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:310eb69143459d186c7f695497e573cf37c61f57082511bf9f63f513fdfa5aab
+ size 577789320
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41738eed34a1e48060ea60b952169c462d1ee8e66e4134d8943c21310081ca5e
+ size 1155772233
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "do_normalize": false,
+ "feature_extractor_type": "SpeechT5FeatureExtractor",
+ "feature_size": 1,
+ "fmax": 7600,
+ "fmin": 80,
+ "frame_signal_scale": 1.0,
+ "hop_length": 16,
+ "mel_floor": 1e-10,
+ "num_mel_bins": 80,
+ "padding_side": "right",
+ "padding_value": 0.0,
+ "processor_class": "SpeechT5Processor",
+ "reduction_factor": 2,
+ "return_attention_mask": true,
+ "sampling_rate": 16000,
+ "win_function": "hann_window",
+ "win_length": 64
+ }
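
preprocessor_config.json pins the processor to 16 kHz input and 80-bin log-mel targets with a reduction factor of 2 (hop_length 16, win_length 64, Hann window). A small sketch of how `SpeechT5Processor` would prepare one training example under those settings; the repo id and the silent waveform are placeholders:

```python
# Sketch only: prepare one SpeechT5 TTS training example with the processor
# configured by the files above — character-level token ids plus 80-bin
# log-mel labels computed from 16 kHz audio.
import numpy as np
from transformers import SpeechT5Processor

processor = SpeechT5Processor.from_pretrained("Washere-1/REPO_NAME")  # placeholder

waveform = np.zeros(16000, dtype=np.float32)  # 1 s of silence as a stand-in

example = processor(
    text="sample sentence",
    audio_target=waveform,
    sampling_rate=16000,
    return_tensors="pt",
)

print(example["input_ids"].shape)  # tokenized text (characters + special tokens)
print(example["labels"].shape)     # log-mel spectrogram target, 80 bins per frame
```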
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:23731a1431e441fa2babf1b269e657d01b317504056ab030399bb7758c6f52fa
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6a1e3232c9dd0558d3b582345b4a537ca2d63d9ae2df7d7189cbdc92c4e384f
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "bos_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<pad>",
+ "unk_token": "<unk>"
+ }
spm_char.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
+ size 238473
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "79": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "80": {
+ "content": "<ctc_blank>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "mask_token": "<mask>",
+ "model_max_length": 600,
+ "normalize": false,
+ "pad_token": "<pad>",
+ "processor_class": "SpeechT5Processor",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "SpeechT5Tokenizer",
+ "unk_token": "<unk>"
+ }
trainer_state.json ADDED
@@ -0,0 +1,914 @@
+ {
+ "best_global_step": 1350,
+ "best_metric": 0.36149922013282776,
+ "best_model_checkpoint": "trainer_output/checkpoint-400",
+ "epoch": 22.22284122562674,
+ "eval_steps": 50,
+ "global_step": 2000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.2785515320334262,
+ "grad_norm": 3.268444776535034,
+ "learning_rate": 2.4e-05,
+ "loss": 0.4348,
+ "step": 25
+ },
+ {
+ "epoch": 0.5571030640668524,
+ "grad_norm": 4.304715633392334,
+ "learning_rate": 4.9e-05,
+ "loss": 0.4346,
+ "step": 50
+ },
+ {
+ "epoch": 0.5571030640668524,
+ "eval_loss": 0.3896651268005371,
+ "eval_runtime": 13.8517,
+ "eval_samples_per_second": 23.102,
+ "eval_steps_per_second": 23.102,
+ "step": 50
+ },
+ {
+ "epoch": 0.8356545961002786,
+ "grad_norm": 8.992396354675293,
+ "learning_rate": 7.4e-05,
+ "loss": 0.4365,
+ "step": 75
+ },
+ {
+ "epoch": 1.1114206128133706,
+ "grad_norm": 4.0646514892578125,
+ "learning_rate": 9.900000000000001e-05,
+ "loss": 0.4438,
+ "step": 100
+ },
+ {
+ "epoch": 1.1114206128133706,
+ "eval_loss": 0.41089218854904175,
+ "eval_runtime": 14.0072,
+ "eval_samples_per_second": 22.845,
+ "eval_steps_per_second": 22.845,
+ "step": 100
+ },
+ {
+ "epoch": 1.3899721448467965,
+ "grad_norm": 3.1883151531219482,
+ "learning_rate": 9.873684210526316e-05,
+ "loss": 0.4411,
+ "step": 125
+ },
+ {
+ "epoch": 1.668523676880223,
+ "grad_norm": 10.366960525512695,
+ "learning_rate": 9.742105263157896e-05,
+ "loss": 0.455,
+ "step": 150
+ },
+ {
+ "epoch": 1.668523676880223,
+ "eval_loss": 0.39505237340927124,
+ "eval_runtime": 13.684,
+ "eval_samples_per_second": 23.385,
+ "eval_steps_per_second": 23.385,
+ "step": 150
+ },
+ {
+ "epoch": 1.947075208913649,
+ "grad_norm": 5.3184027671813965,
+ "learning_rate": 9.610526315789474e-05,
+ "loss": 0.4315,
+ "step": 175
+ },
+ {
+ "epoch": 2.222841225626741,
+ "grad_norm": 7.163997173309326,
+ "learning_rate": 9.478947368421053e-05,
+ "loss": 0.4243,
+ "step": 200
+ },
+ {
+ "epoch": 2.222841225626741,
+ "eval_loss": 0.42486143112182617,
+ "eval_runtime": 13.6879,
+ "eval_samples_per_second": 23.378,
+ "eval_steps_per_second": 23.378,
+ "step": 200
+ },
+ {
+ "epoch": 2.501392757660167,
+ "grad_norm": 4.934960842132568,
+ "learning_rate": 9.347368421052632e-05,
+ "loss": 0.4417,
+ "step": 225
+ },
+ {
+ "epoch": 2.779944289693593,
+ "grad_norm": 1.8761239051818848,
+ "learning_rate": 9.21578947368421e-05,
+ "loss": 0.4322,
+ "step": 250
+ },
+ {
+ "epoch": 2.779944289693593,
+ "eval_loss": 0.381044864654541,
+ "eval_runtime": 13.7455,
+ "eval_samples_per_second": 23.28,
+ "eval_steps_per_second": 23.28,
+ "step": 250
+ },
+ {
+ "epoch": 3.0557103064066853,
+ "grad_norm": 5.752270221710205,
+ "learning_rate": 9.08421052631579e-05,
+ "loss": 0.4214,
+ "step": 275
+ },
+ {
+ "epoch": 3.3342618384401113,
+ "grad_norm": 4.633335590362549,
+ "learning_rate": 8.95263157894737e-05,
+ "loss": 0.4256,
+ "step": 300
+ },
+ {
+ "epoch": 3.3342618384401113,
+ "eval_loss": 0.3811536431312561,
+ "eval_runtime": 13.4686,
+ "eval_samples_per_second": 23.759,
+ "eval_steps_per_second": 23.759,
+ "step": 300
+ },
+ {
+ "epoch": 3.6128133704735377,
+ "grad_norm": 6.349725723266602,
+ "learning_rate": 8.821052631578948e-05,
+ "loss": 0.4207,
+ "step": 325
+ },
+ {
+ "epoch": 3.8913649025069637,
+ "grad_norm": 6.114583492279053,
+ "learning_rate": 8.689473684210526e-05,
+ "loss": 0.4232,
+ "step": 350
+ },
+ {
+ "epoch": 3.8913649025069637,
+ "eval_loss": 0.41359782218933105,
+ "eval_runtime": 13.645,
+ "eval_samples_per_second": 23.452,
+ "eval_steps_per_second": 23.452,
+ "step": 350
+ },
+ {
+ "epoch": 4.167130919220056,
+ "grad_norm": 3.0233688354492188,
+ "learning_rate": 8.557894736842106e-05,
+ "loss": 0.4173,
+ "step": 375
+ },
+ {
+ "epoch": 4.445682451253482,
+ "grad_norm": 3.7900898456573486,
+ "learning_rate": 8.426315789473684e-05,
+ "loss": 0.4188,
+ "step": 400
+ },
+ {
+ "epoch": 4.445682451253482,
+ "eval_loss": 0.37991732358932495,
+ "eval_runtime": 13.601,
+ "eval_samples_per_second": 23.528,
+ "eval_steps_per_second": 23.528,
+ "step": 400
+ },
+ {
+ "epoch": 4.724233983286908,
+ "grad_norm": 4.321193218231201,
+ "learning_rate": 8.294736842105263e-05,
+ "loss": 0.4194,
+ "step": 425
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 4.552373886108398,
+ "learning_rate": 8.163157894736843e-05,
+ "loss": 0.4045,
+ "step": 450
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.37860995531082153,
+ "eval_runtime": 13.5918,
+ "eval_samples_per_second": 23.544,
+ "eval_steps_per_second": 23.544,
+ "step": 450
+ },
+ {
+ "epoch": 5.278551532033426,
+ "grad_norm": 1.4818230867385864,
+ "learning_rate": 8.031578947368421e-05,
+ "loss": 0.4053,
+ "step": 475
+ },
+ {
+ "epoch": 5.557103064066853,
+ "grad_norm": 6.262104511260986,
+ "learning_rate": 7.900000000000001e-05,
+ "loss": 0.4057,
+ "step": 500
+ },
+ {
+ "epoch": 5.557103064066853,
+ "eval_loss": 0.3852214217185974,
+ "eval_runtime": 13.5514,
+ "eval_samples_per_second": 23.614,
+ "eval_steps_per_second": 23.614,
+ "step": 500
+ },
+ {
+ "epoch": 5.835654596100278,
+ "grad_norm": 5.743201732635498,
+ "learning_rate": 7.768421052631579e-05,
+ "loss": 0.409,
+ "step": 525
+ },
+ {
+ "epoch": 6.111420612813371,
+ "grad_norm": 4.577704906463623,
+ "learning_rate": 7.636842105263157e-05,
+ "loss": 0.404,
+ "step": 550
+ },
+ {
+ "epoch": 6.111420612813371,
+ "eval_loss": 0.3758087754249573,
+ "eval_runtime": 13.5075,
+ "eval_samples_per_second": 23.691,
+ "eval_steps_per_second": 23.691,
+ "step": 550
+ },
+ {
+ "epoch": 6.389972144846797,
+ "grad_norm": 2.2797679901123047,
+ "learning_rate": 7.505263157894737e-05,
+ "loss": 0.4036,
+ "step": 575
+ },
+ {
+ "epoch": 6.6685236768802225,
+ "grad_norm": 2.6362528800964355,
+ "learning_rate": 7.373684210526317e-05,
+ "loss": 0.4013,
+ "step": 600
+ },
+ {
+ "epoch": 6.6685236768802225,
+ "eval_loss": 0.3783581852912903,
+ "eval_runtime": 13.7273,
+ "eval_samples_per_second": 23.311,
+ "eval_steps_per_second": 23.311,
+ "step": 600
+ },
+ {
+ "epoch": 6.947075208913649,
+ "grad_norm": 2.8619472980499268,
+ "learning_rate": 7.242105263157896e-05,
+ "loss": 0.401,
+ "step": 625
+ },
+ {
+ "epoch": 7.222841225626741,
+ "grad_norm": 2.071786403656006,
+ "learning_rate": 7.110526315789474e-05,
+ "loss": 0.3919,
+ "step": 650
+ },
+ {
+ "epoch": 7.222841225626741,
+ "eval_loss": 0.3806610703468323,
+ "eval_runtime": 13.6224,
+ "eval_samples_per_second": 23.491,
+ "eval_steps_per_second": 23.491,
+ "step": 650
+ },
+ {
+ "epoch": 7.501392757660167,
+ "grad_norm": 5.365797519683838,
+ "learning_rate": 6.978947368421053e-05,
+ "loss": 0.4018,
+ "step": 675
+ },
+ {
+ "epoch": 7.779944289693593,
+ "grad_norm": 5.850217819213867,
+ "learning_rate": 6.847368421052632e-05,
+ "loss": 0.4055,
+ "step": 700
+ },
+ {
+ "epoch": 7.779944289693593,
+ "eval_loss": 0.3773684501647949,
+ "eval_runtime": 13.6955,
+ "eval_samples_per_second": 23.365,
+ "eval_steps_per_second": 23.365,
+ "step": 700
+ },
+ {
+ "epoch": 8.055710306406684,
+ "grad_norm": 2.516648530960083,
+ "learning_rate": 6.71578947368421e-05,
+ "loss": 0.3867,
+ "step": 725
+ },
+ {
+ "epoch": 8.334261838440112,
+ "grad_norm": 3.2627480030059814,
+ "learning_rate": 6.584210526315789e-05,
+ "loss": 0.3919,
+ "step": 750
+ },
+ {
+ "epoch": 8.334261838440112,
+ "eval_loss": 0.381404310464859,
+ "eval_runtime": 13.6865,
+ "eval_samples_per_second": 23.381,
+ "eval_steps_per_second": 23.381,
+ "step": 750
+ },
+ {
+ "epoch": 8.612813370473537,
+ "grad_norm": 1.986242651939392,
+ "learning_rate": 6.452631578947368e-05,
+ "loss": 0.3972,
+ "step": 775
+ },
+ {
+ "epoch": 8.891364902506965,
+ "grad_norm": 2.2203171253204346,
+ "learning_rate": 6.321052631578948e-05,
+ "loss": 0.3939,
+ "step": 800
+ },
+ {
+ "epoch": 8.891364902506965,
+ "eval_loss": 0.38715943694114685,
+ "eval_runtime": 13.4294,
+ "eval_samples_per_second": 23.828,
+ "eval_steps_per_second": 23.828,
+ "step": 800
+ },
+ {
+ "epoch": 9.167130919220055,
+ "grad_norm": 2.0610949993133545,
+ "learning_rate": 6.189473684210526e-05,
+ "loss": 0.3908,
+ "step": 825
+ },
+ {
+ "epoch": 9.445682451253482,
+ "grad_norm": 1.9484410285949707,
+ "learning_rate": 6.057894736842106e-05,
+ "loss": 0.3863,
+ "step": 850
+ },
+ {
+ "epoch": 9.445682451253482,
+ "eval_loss": 0.36479219794273376,
+ "eval_runtime": 13.6905,
+ "eval_samples_per_second": 23.374,
+ "eval_steps_per_second": 23.374,
+ "step": 850
+ },
+ {
+ "epoch": 9.724233983286908,
+ "grad_norm": 3.732637882232666,
+ "learning_rate": 5.926315789473684e-05,
+ "loss": 0.3912,
+ "step": 875
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 2.3195526599884033,
+ "learning_rate": 5.794736842105264e-05,
+ "loss": 0.3868,
+ "step": 900
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 0.36887454986572266,
+ "eval_runtime": 13.6683,
+ "eval_samples_per_second": 23.412,
+ "eval_steps_per_second": 23.412,
+ "step": 900
+ },
+ {
+ "epoch": 10.278551532033426,
+ "grad_norm": 1.2468181848526,
+ "learning_rate": 5.6631578947368426e-05,
+ "loss": 0.3929,
+ "step": 925
+ },
+ {
+ "epoch": 10.557103064066853,
+ "grad_norm": 2.238741159439087,
+ "learning_rate": 5.531578947368421e-05,
+ "loss": 0.3863,
+ "step": 950
+ },
+ {
+ "epoch": 10.557103064066853,
+ "eval_loss": 0.3635193407535553,
+ "eval_runtime": 13.8549,
+ "eval_samples_per_second": 23.097,
+ "eval_steps_per_second": 23.097,
+ "step": 950
+ },
+ {
+ "epoch": 10.835654596100278,
+ "grad_norm": 1.6164511442184448,
+ "learning_rate": 5.4000000000000005e-05,
+ "loss": 0.3888,
+ "step": 975
+ },
+ {
+ "epoch": 11.11142061281337,
+ "grad_norm": 3.874614715576172,
+ "learning_rate": 5.2684210526315794e-05,
+ "loss": 0.3789,
+ "step": 1000
+ },
+ {
+ "epoch": 11.11142061281337,
+ "eval_loss": 0.3707486689090729,
+ "eval_runtime": 13.6538,
+ "eval_samples_per_second": 23.437,
+ "eval_steps_per_second": 23.437,
+ "step": 1000
+ },
+ {
+ "epoch": 11.389972144846796,
+ "grad_norm": 2.4590909481048584,
+ "learning_rate": 5.1368421052631576e-05,
+ "loss": 0.3864,
+ "step": 1025
+ },
+ {
+ "epoch": 11.668523676880223,
+ "grad_norm": 2.916522741317749,
+ "learning_rate": 5.005263157894737e-05,
+ "loss": 0.3829,
+ "step": 1050
+ },
+ {
+ "epoch": 11.668523676880223,
+ "eval_loss": 0.36959362030029297,
+ "eval_runtime": 13.6444,
+ "eval_samples_per_second": 23.453,
+ "eval_steps_per_second": 23.453,
+ "step": 1050
+ },
+ {
+ "epoch": 11.947075208913649,
+ "grad_norm": 2.2767789363861084,
+ "learning_rate": 4.873684210526316e-05,
+ "loss": 0.3811,
+ "step": 1075
+ },
+ {
+ "epoch": 12.222841225626741,
+ "grad_norm": 2.320681095123291,
+ "learning_rate": 4.742105263157895e-05,
+ "loss": 0.3771,
+ "step": 1100
+ },
+ {
+ "epoch": 12.222841225626741,
+ "eval_loss": 0.37226706743240356,
+ "eval_runtime": 13.6934,
+ "eval_samples_per_second": 23.369,
+ "eval_steps_per_second": 23.369,
+ "step": 1100
+ },
+ {
+ "epoch": 12.501392757660167,
+ "grad_norm": 2.783123254776001,
+ "learning_rate": 4.610526315789474e-05,
+ "loss": 0.3785,
+ "step": 1125
+ },
+ {
+ "epoch": 12.779944289693594,
+ "grad_norm": 2.488577365875244,
+ "learning_rate": 4.478947368421053e-05,
+ "loss": 0.3811,
+ "step": 1150
+ },
+ {
+ "epoch": 12.779944289693594,
+ "eval_loss": 0.36161237955093384,
+ "eval_runtime": 13.7581,
+ "eval_samples_per_second": 23.259,
+ "eval_steps_per_second": 23.259,
+ "step": 1150
+ },
+ {
+ "epoch": 13.055710306406684,
+ "grad_norm": 2.4364013671875,
+ "learning_rate": 4.347368421052632e-05,
+ "loss": 0.3775,
+ "step": 1175
+ },
+ {
+ "epoch": 13.334261838440112,
+ "grad_norm": 4.950360298156738,
+ "learning_rate": 4.215789473684211e-05,
+ "loss": 0.3766,
+ "step": 1200
+ },
+ {
+ "epoch": 13.334261838440112,
+ "eval_loss": 0.3663933575153351,
+ "eval_runtime": 13.7363,
+ "eval_samples_per_second": 23.296,
+ "eval_steps_per_second": 23.296,
+ "step": 1200
+ },
+ {
+ "epoch": 13.612813370473537,
+ "grad_norm": 3.373661994934082,
+ "learning_rate": 4.08421052631579e-05,
+ "loss": 0.3736,
+ "step": 1225
+ },
+ {
+ "epoch": 13.891364902506965,
+ "grad_norm": 6.832035541534424,
+ "learning_rate": 3.9526315789473686e-05,
+ "loss": 0.3808,
+ "step": 1250
+ },
+ {
+ "epoch": 13.891364902506965,
+ "eval_loss": 0.36550408601760864,
+ "eval_runtime": 13.5552,
+ "eval_samples_per_second": 23.607,
+ "eval_steps_per_second": 23.607,
+ "step": 1250
+ },
+ {
+ "epoch": 14.167130919220055,
+ "grad_norm": 3.087953567504883,
+ "learning_rate": 3.8210526315789476e-05,
+ "loss": 0.3706,
+ "step": 1275
+ },
+ {
+ "epoch": 14.445682451253482,
+ "grad_norm": 2.505366802215576,
+ "learning_rate": 3.6894736842105265e-05,
+ "loss": 0.3748,
+ "step": 1300
+ },
+ {
+ "epoch": 14.445682451253482,
+ "eval_loss": 0.37319216132164,
+ "eval_runtime": 13.6213,
+ "eval_samples_per_second": 23.493,
+ "eval_steps_per_second": 23.493,
+ "step": 1300
+ },
+ {
+ "epoch": 14.724233983286908,
+ "grad_norm": 3.5145950317382812,
+ "learning_rate": 3.5578947368421054e-05,
+ "loss": 0.3768,
+ "step": 1325
+ },
+ {
+ "epoch": 15.0,
+ "grad_norm": 1.5625571012496948,
+ "learning_rate": 3.426315789473684e-05,
+ "loss": 0.3694,
+ "step": 1350
+ },
+ {
+ "epoch": 15.0,
+ "eval_loss": 0.36149922013282776,
+ "eval_runtime": 13.6694,
+ "eval_samples_per_second": 23.41,
+ "eval_steps_per_second": 23.41,
+ "step": 1350
+ },
+ {
+ "epoch": 15.278551532033426,
+ "grad_norm": 2.3763480186462402,
+ "learning_rate": 3.294736842105263e-05,
+ "loss": 0.3698,
+ "step": 1375
+ },
+ {
+ "epoch": 15.557103064066853,
+ "grad_norm": 2.2418417930603027,
+ "learning_rate": 3.163157894736842e-05,
+ "loss": 0.3743,
+ "step": 1400
+ },
+ {
+ "epoch": 15.557103064066853,
+ "eval_loss": 0.3747350573539734,
+ "eval_runtime": 13.5269,
+ "eval_samples_per_second": 23.656,
+ "eval_steps_per_second": 23.656,
+ "step": 1400
+ },
+ {
+ "epoch": 15.835654596100278,
+ "grad_norm": 1.8634096384048462,
+ "learning_rate": 3.0315789473684214e-05,
+ "loss": 0.3714,
+ "step": 1425
+ },
+ {
+ "epoch": 16.11142061281337,
+ "grad_norm": 2.3779730796813965,
+ "learning_rate": 2.9e-05,
+ "loss": 0.3698,
+ "step": 1450
+ },
+ {
+ "epoch": 16.11142061281337,
+ "eval_loss": 0.37593525648117065,
+ "eval_runtime": 13.6244,
+ "eval_samples_per_second": 23.487,
+ "eval_steps_per_second": 23.487,
+ "step": 1450
+ },
+ {
+ "epoch": 16.389972144846798,
+ "grad_norm": 1.6371415853500366,
+ "learning_rate": 2.768421052631579e-05,
+ "loss": 0.3708,
+ "step": 1475
+ },
+ {
+ "epoch": 16.668523676880223,
+ "grad_norm": 2.469703435897827,
+ "learning_rate": 2.6368421052631582e-05,
+ "loss": 0.3708,
+ "step": 1500
+ },
+ {
+ "epoch": 16.668523676880223,
+ "eval_loss": 0.36566680669784546,
+ "eval_runtime": 13.6076,
+ "eval_samples_per_second": 23.516,
+ "eval_steps_per_second": 23.516,
+ "step": 1500
+ },
+ {
+ "epoch": 16.94707520891365,
+ "grad_norm": 2.27885365486145,
+ "learning_rate": 2.505263157894737e-05,
+ "loss": 0.3688,
+ "step": 1525
+ },
+ {
+ "epoch": 17.22284122562674,
+ "grad_norm": 1.7022783756256104,
+ "learning_rate": 2.373684210526316e-05,
+ "loss": 0.3626,
+ "step": 1550
+ },
+ {
+ "epoch": 17.22284122562674,
+ "eval_loss": 0.366330623626709,
+ "eval_runtime": 13.6427,
+ "eval_samples_per_second": 23.456,
+ "eval_steps_per_second": 23.456,
+ "step": 1550
+ },
+ {
+ "epoch": 17.501392757660167,
+ "grad_norm": 2.10900616645813,
+ "learning_rate": 2.242105263157895e-05,
+ "loss": 0.3678,
+ "step": 1575
+ },
+ {
+ "epoch": 17.779944289693592,
+ "grad_norm": 1.427308201789856,
+ "learning_rate": 2.110526315789474e-05,
+ "loss": 0.3671,
+ "step": 1600
+ },
+ {
+ "epoch": 17.779944289693592,
+ "eval_loss": 0.3622135519981384,
+ "eval_runtime": 13.6336,
+ "eval_samples_per_second": 23.471,
+ "eval_steps_per_second": 23.471,
+ "step": 1600
+ },
+ {
+ "epoch": 18.055710306406684,
+ "grad_norm": 2.66011905670166,
+ "learning_rate": 1.9789473684210528e-05,
+ "loss": 0.3629,
+ "step": 1625
+ },
+ {
+ "epoch": 18.33426183844011,
+ "grad_norm": 1.7077386379241943,
+ "learning_rate": 1.8473684210526317e-05,
+ "loss": 0.365,
+ "step": 1650
+ },
+ {
+ "epoch": 18.33426183844011,
+ "eval_loss": 0.3650280833244324,
+ "eval_runtime": 13.8977,
+ "eval_samples_per_second": 23.025,
+ "eval_steps_per_second": 23.025,
+ "step": 1650
+ },
+ {
+ "epoch": 18.61281337047354,
+ "grad_norm": 2.3039278984069824,
+ "learning_rate": 1.7157894736842107e-05,
+ "loss": 0.3657,
+ "step": 1675
+ },
+ {
+ "epoch": 18.891364902506965,
+ "grad_norm": 2.4378602504730225,
+ "learning_rate": 1.5842105263157896e-05,
+ "loss": 0.3614,
+ "step": 1700
+ },
+ {
+ "epoch": 18.891364902506965,
+ "eval_loss": 0.366611123085022,
+ "eval_runtime": 13.6276,
+ "eval_samples_per_second": 23.482,
+ "eval_steps_per_second": 23.482,
+ "step": 1700
+ },
+ {
+ "epoch": 19.167130919220057,
+ "grad_norm": 2.5396945476531982,
+ "learning_rate": 1.4526315789473685e-05,
+ "loss": 0.3588,
+ "step": 1725
+ },
+ {
+ "epoch": 19.445682451253482,
+ "grad_norm": 1.7115105390548706,
+ "learning_rate": 1.3210526315789473e-05,
+ "loss": 0.3647,
+ "step": 1750
+ },
+ {
+ "epoch": 19.445682451253482,
+ "eval_loss": 0.36524954438209534,
+ "eval_runtime": 13.6929,
+ "eval_samples_per_second": 23.37,
+ "eval_steps_per_second": 23.37,
+ "step": 1750
+ },
+ {
+ "epoch": 19.724233983286908,
+ "grad_norm": 3.1968226432800293,
+ "learning_rate": 1.1894736842105264e-05,
+ "loss": 0.3646,
+ "step": 1775
+ },
+ {
+ "epoch": 20.0,
+ "grad_norm": 3.1331191062927246,
+ "learning_rate": 1.0578947368421053e-05,
+ "loss": 0.3585,
+ "step": 1800
+ },
+ {
+ "epoch": 20.0,
+ "eval_loss": 0.3763583302497864,
+ "eval_runtime": 13.7144,
+ "eval_samples_per_second": 23.333,
+ "eval_steps_per_second": 23.333,
+ "step": 1800
+ },
+ {
+ "epoch": 20.278551532033426,
+ "grad_norm": 1.4660519361495972,
+ "learning_rate": 9.263157894736844e-06,
+ "loss": 0.3633,
+ "step": 1825
+ },
+ {
+ "epoch": 20.55710306406685,
+ "grad_norm": 1.6120601892471313,
+ "learning_rate": 7.947368421052633e-06,
+ "loss": 0.3606,
+ "step": 1850
+ },
+ {
+ "epoch": 20.55710306406685,
+ "eval_loss": 0.37242794036865234,
+ "eval_runtime": 13.6577,
+ "eval_samples_per_second": 23.43,
+ "eval_steps_per_second": 23.43,
+ "step": 1850
+ },
+ {
+ "epoch": 20.83565459610028,
+ "grad_norm": 1.7700914144515991,
+ "learning_rate": 6.631578947368422e-06,
+ "loss": 0.3592,
+ "step": 1875
+ },
+ {
+ "epoch": 21.11142061281337,
+ "grad_norm": 1.610386610031128,
+ "learning_rate": 5.315789473684211e-06,
+ "loss": 0.358,
+ "step": 1900
+ },
+ {
+ "epoch": 21.11142061281337,
+ "eval_loss": 0.36805444955825806,
+ "eval_runtime": 13.4893,
+ "eval_samples_per_second": 23.723,
+ "eval_steps_per_second": 23.723,
+ "step": 1900
+ },
+ {
+ "epoch": 21.389972144846798,
+ "grad_norm": 1.5456234216690063,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.357,
+ "step": 1925
+ },
+ {
+ "epoch": 21.668523676880223,
+ "grad_norm": 1.233559489250183,
+ "learning_rate": 2.68421052631579e-06,
+ "loss": 0.3566,
+ "step": 1950
+ },
+ {
+ "epoch": 21.668523676880223,
+ "eval_loss": 0.37138742208480835,
+ "eval_runtime": 13.4793,
+ "eval_samples_per_second": 23.74,
+ "eval_steps_per_second": 23.74,
+ "step": 1950
+ },
+ {
+ "epoch": 21.94707520891365,
+ "grad_norm": 1.3927557468414307,
+ "learning_rate": 1.3684210526315791e-06,
+ "loss": 0.3578,
+ "step": 1975
+ },
+ {
+ "epoch": 22.22284122562674,
+ "grad_norm": 1.2782479524612427,
+ "learning_rate": 5.263157894736842e-08,
+ "loss": 0.3539,
+ "step": 2000
+ },
+ {
+ "epoch": 22.22284122562674,
+ "eval_loss": 0.3719028830528259,
+ "eval_runtime": 13.4208,
+ "eval_samples_per_second": 23.844,
+ "eval_steps_per_second": 23.844,
+ "step": 2000
+ }
+ ],
+ "logging_steps": 25,
+ "max_steps": 2000,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 23,
+ "save_steps": 100,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5863317642483840.0,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be9e776f34f67b0ce79831e4e54c3792782a1ed1a0ce25e8325d66402335a8dc
+ size 5496