shivanandmn commited on
Commit
a847066
·
verified ·
1 Parent(s): c865815

Model save

Browse files
Files changed (6) hide show
  1. README.md +21 -21
  2. all_results.json +12 -12
  3. eval_results.json +8 -8
  4. train_results.json +4 -4
  5. trainer_state.json +299 -299
  6. training_args.bin +1 -1
README.md CHANGED
@@ -17,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 3.1985
21
  - Accuracy: 0.4196
22
- - Perplexity: 24.4954
23
  - Bleu: 0.1339
 
 
24
 
25
  ## Model description
26
 
@@ -50,25 +50,25 @@ The following hyperparameters were used during training:
50
 
51
  ### Training results
52
 
53
- | Training Loss | Epoch | Step | Validation Loss | Accuracy | Perplexity | Bleu |
54
- |:-------------:|:------:|:----:|:---------------:|:--------:|:----------:|:------:|
55
- | 5.9062 | 0.2806 | 500 | 5.7470 | 0.2234 | 313.2463 | 0.0493 |
56
- | 4.8598 | 0.5612 | 1000 | 4.7428 | 0.2811 | 114.7554 | 0.0698 |
57
- | 4.3025 | 0.8418 | 1500 | 4.2329 | 0.3170 | 68.9191 | 0.0834 |
58
- | 3.9635 | 1.1223 | 2000 | 3.9291 | 0.3454 | 50.8590 | 0.0932 |
59
- | 3.7769 | 1.4029 | 2500 | 3.7427 | 0.3636 | 42.2098 | 0.1020 |
60
- | 3.6738 | 1.6835 | 3000 | 3.6225 | 0.3754 | 37.4295 | 0.1066 |
61
- | 3.5744 | 1.9641 | 3500 | 3.5325 | 0.3845 | 34.2102 | 0.1118 |
62
- | 3.456 | 2.2447 | 4000 | 3.4704 | 0.3902 | 32.1497 | 0.1139 |
63
- | 3.3972 | 2.5253 | 4500 | 3.4190 | 0.3955 | 30.5384 | 0.1230 |
64
- | 3.3654 | 2.8058 | 5000 | 3.3686 | 0.4007 | 29.0392 | 0.1230 |
65
- | 3.247 | 3.0864 | 5500 | 3.3328 | 0.4043 | 28.0168 | 0.1247 |
66
- | 3.2403 | 3.3670 | 6000 | 3.2985 | 0.4083 | 27.0714 | 0.1298 |
67
- | 3.2167 | 3.6476 | 6500 | 3.2693 | 0.4112 | 26.2922 | 0.1288 |
68
- | 3.1903 | 3.9282 | 7000 | 3.2456 | 0.4134 | 25.6768 | 0.1305 |
69
- | 3.1212 | 4.2088 | 7500 | 3.2262 | 0.4161 | 25.1831 | 0.1325 |
70
- | 3.0816 | 4.4893 | 8000 | 3.2128 | 0.4176 | 24.8480 | 0.1307 |
71
- | 3.0917 | 4.7699 | 8500 | 3.1985 | 0.4196 | 24.4954 | 0.1339 |
72
 
73
 
74
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
 
20
  - Accuracy: 0.4196
 
21
  - Bleu: 0.1339
22
+ - Loss: 3.1985
23
+ - Perplexity: 24.4954
24
 
25
  ## Model description
26
 
 
50
 
51
  ### Training results
52
 
53
+ | Training Loss | Epoch | Step | Accuracy | Bleu | Validation Loss | Perplexity |
54
+ |:-------------:|:------:|:----:|:--------:|:------:|:---------------:|:----------:|
55
+ | 5.9062 | 0.2806 | 500 | 0.2234 | 0.0493 | 5.7470 | 313.2463 |
56
+ | 4.8598 | 0.5612 | 1000 | 0.2811 | 0.0698 | 4.7428 | 114.7554 |
57
+ | 4.3025 | 0.8418 | 1500 | 0.3170 | 0.0834 | 4.2329 | 68.9191 |
58
+ | 3.9635 | 1.1223 | 2000 | 0.3454 | 0.0932 | 3.9291 | 50.8590 |
59
+ | 3.7769 | 1.4029 | 2500 | 0.3636 | 0.1020 | 3.7427 | 42.2098 |
60
+ | 3.6738 | 1.6835 | 3000 | 0.3754 | 0.1066 | 3.6225 | 37.4295 |
61
+ | 3.5744 | 1.9641 | 3500 | 0.3845 | 0.1118 | 3.5325 | 34.2102 |
62
+ | 3.456 | 2.2447 | 4000 | 0.3902 | 0.1139 | 3.4704 | 32.1497 |
63
+ | 3.3972 | 2.5253 | 4500 | 0.3955 | 0.1230 | 3.4190 | 30.5384 |
64
+ | 3.3654 | 2.8058 | 5000 | 0.4007 | 0.1230 | 3.3686 | 29.0392 |
65
+ | 3.247 | 3.0864 | 5500 | 0.4043 | 0.1247 | 3.3328 | 28.0168 |
66
+ | 3.2403 | 3.3670 | 6000 | 0.4083 | 0.1298 | 3.2985 | 27.0714 |
67
+ | 3.2167 | 3.6476 | 6500 | 0.4112 | 0.1288 | 3.2693 | 26.2922 |
68
+ | 3.1903 | 3.9282 | 7000 | 0.4134 | 0.1305 | 3.2456 | 25.6768 |
69
+ | 3.1212 | 4.2088 | 7500 | 0.4161 | 0.1325 | 3.2262 | 25.1831 |
70
+ | 3.0816 | 4.4893 | 8000 | 0.4176 | 0.1307 | 3.2128 | 24.8480 |
71
+ | 3.0917 | 4.7699 | 8500 | 0.4196 | 0.1339 | 3.1985 | 24.4954 |
72
 
73
 
74
  ### Framework versions
all_results.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.4192811608208402,
4
- "eval_bleu": 0.13449590326973765,
5
- "eval_loss": 3.201490640640259,
6
- "eval_perplexity": 24.569126652935235,
7
- "eval_runtime": 12.2552,
8
  "eval_samples": 1141,
9
- "eval_samples_per_second": 93.103,
10
- "eval_steps_per_second": 1.469,
11
- "perplexity": 24.569126652935235,
12
  "total_flos": 1.0587061010143642e+18,
13
- "train_loss": 3.7750147520893753,
14
- "train_runtime": 15697.9119,
15
- "train_samples_per_second": 36.309,
16
- "train_steps_per_second": 0.568
17
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.41997167684877956,
4
+ "eval_bleu": 0.13436705184218095,
5
+ "eval_loss": 3.19417405128479,
6
+ "eval_perplexity": 24.39002046460787,
7
+ "eval_runtime": 12.1627,
8
  "eval_samples": 1141,
9
+ "eval_samples_per_second": 93.812,
10
+ "eval_steps_per_second": 1.48,
11
+ "perplexity": 24.39002046460787,
12
  "total_flos": 1.0587061010143642e+18,
13
+ "train_loss": 3.767289323945907,
14
+ "train_runtime": 15738.6672,
15
+ "train_samples_per_second": 36.215,
16
+ "train_steps_per_second": 0.566
17
  }
eval_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.4192811608208402,
4
- "eval_bleu": 0.13449590326973765,
5
- "eval_loss": 3.201490640640259,
6
- "eval_perplexity": 24.569126652935235,
7
- "eval_runtime": 12.2552,
8
  "eval_samples": 1141,
9
- "eval_samples_per_second": 93.103,
10
- "eval_steps_per_second": 1.469,
11
- "perplexity": 24.569126652935235
12
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.41997167684877956,
4
+ "eval_bleu": 0.13436705184218095,
5
+ "eval_loss": 3.19417405128479,
6
+ "eval_perplexity": 24.39002046460787,
7
+ "eval_runtime": 12.1627,
8
  "eval_samples": 1141,
9
+ "eval_samples_per_second": 93.812,
10
+ "eval_steps_per_second": 1.48,
11
+ "perplexity": 24.39002046460787
12
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
  "total_flos": 1.0587061010143642e+18,
4
- "train_loss": 3.7750147520893753,
5
- "train_runtime": 15697.9119,
6
- "train_samples_per_second": 36.309,
7
- "train_steps_per_second": 0.568
8
  }
 
1
  {
2
  "epoch": 5.0,
3
  "total_flos": 1.0587061010143642e+18,
4
+ "train_loss": 3.767289323945907,
5
+ "train_runtime": 15738.6672,
6
+ "train_samples_per_second": 36.215,
7
+ "train_steps_per_second": 0.566
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 3.2064151763916016,
3
  "best_model_checkpoint": "./output/models/rotating-head-gp-gpt2-medium-wikitext/checkpoint-8500",
4
  "epoch": 5.0,
5
  "eval_steps": 500,
@@ -10,822 +10,822 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05611672278338945,
13
- "grad_norm": 1.7030696868896484,
14
  "learning_rate": 1.1223344556677892e-05,
15
  "loss": 8.993,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.1122334455667789,
20
- "grad_norm": 1.4508291482925415,
21
  "learning_rate": 2.2446689113355783e-05,
22
  "loss": 7.3838,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.16835016835016836,
27
- "grad_norm": 1.6375716924667358,
28
  "learning_rate": 3.3670033670033675e-05,
29
- "loss": 6.5795,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.2244668911335578,
34
- "grad_norm": 1.417483925819397,
35
  "learning_rate": 4.4893378226711566e-05,
36
- "loss": 6.1961,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.28058361391694725,
41
- "grad_norm": 1.7931348085403442,
42
  "learning_rate": 5.611672278338945e-05,
43
- "loss": 5.9051,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.28058361391694725,
48
- "eval_accuracy": 0.2238085814179224,
49
- "eval_bleu": 0.04813423949717817,
50
- "eval_loss": 5.7417893409729,
51
- "eval_perplexity": 311.6215095481783,
52
- "eval_runtime": 12.102,
53
- "eval_samples_per_second": 94.282,
54
- "eval_steps_per_second": 1.487,
55
  "step": 500
56
  },
57
  {
58
  "epoch": 0.3367003367003367,
59
- "grad_norm": 1.502821445465088,
60
  "learning_rate": 6.734006734006735e-05,
61
- "loss": 5.6571,
62
  "step": 600
63
  },
64
  {
65
  "epoch": 0.39281705948372614,
66
- "grad_norm": 1.267421007156372,
67
  "learning_rate": 7.856341189674523e-05,
68
- "loss": 5.4352,
69
  "step": 700
70
  },
71
  {
72
  "epoch": 0.4489337822671156,
73
- "grad_norm": 1.4694616794586182,
74
  "learning_rate": 8.978675645342313e-05,
75
- "loss": 5.2521,
76
  "step": 800
77
  },
78
  {
79
  "epoch": 0.5050505050505051,
80
- "grad_norm": 1.2315590381622314,
81
  "learning_rate": 9.988776655443322e-05,
82
- "loss": 5.0471,
83
  "step": 900
84
  },
85
  {
86
  "epoch": 0.5611672278338945,
87
- "grad_norm": 0.9069780707359314,
88
  "learning_rate": 9.864072827035791e-05,
89
- "loss": 4.8617,
90
  "step": 1000
91
  },
92
  {
93
  "epoch": 0.5611672278338945,
94
- "eval_accuracy": 0.28116767459732034,
95
- "eval_bleu": 0.07213900102284179,
96
- "eval_loss": 4.740506649017334,
97
- "eval_perplexity": 114.4921943457123,
98
- "eval_runtime": 12.1272,
99
- "eval_samples_per_second": 94.086,
100
- "eval_steps_per_second": 1.484,
101
  "step": 1000
102
  },
103
  {
104
  "epoch": 0.6172839506172839,
105
- "grad_norm": 0.9436570405960083,
106
  "learning_rate": 9.73936899862826e-05,
107
- "loss": 4.7137,
108
  "step": 1100
109
  },
110
  {
111
  "epoch": 0.6734006734006734,
112
- "grad_norm": 0.8758283853530884,
113
  "learning_rate": 9.614665170220725e-05,
114
- "loss": 4.5873,
115
  "step": 1200
116
  },
117
  {
118
  "epoch": 0.7295173961840629,
119
- "grad_norm": 0.8786563277244568,
120
  "learning_rate": 9.489961341813194e-05,
121
- "loss": 4.4798,
122
  "step": 1300
123
  },
124
  {
125
  "epoch": 0.7856341189674523,
126
- "grad_norm": 0.9080842733383179,
127
  "learning_rate": 9.365257513405662e-05,
128
- "loss": 4.3915,
129
  "step": 1400
130
  },
131
  {
132
  "epoch": 0.8417508417508418,
133
- "grad_norm": 0.9573113322257996,
134
  "learning_rate": 9.24055368499813e-05,
135
- "loss": 4.2992,
136
  "step": 1500
137
  },
138
  {
139
  "epoch": 0.8417508417508418,
140
- "eval_accuracy": 0.317916663453968,
141
- "eval_bleu": 0.08316362600465955,
142
- "eval_loss": 4.227725982666016,
143
- "eval_perplexity": 68.56114553586224,
144
- "eval_runtime": 12.2167,
145
- "eval_samples_per_second": 93.397,
146
- "eval_steps_per_second": 1.473,
147
  "step": 1500
148
  },
149
  {
150
  "epoch": 0.8978675645342312,
151
- "grad_norm": 1.070070505142212,
152
  "learning_rate": 9.115849856590598e-05,
153
- "loss": 4.2381,
154
  "step": 1600
155
  },
156
  {
157
  "epoch": 0.9539842873176206,
158
- "grad_norm": 1.0821105241775513,
159
  "learning_rate": 8.991146028183066e-05,
160
- "loss": 4.1477,
161
  "step": 1700
162
  },
163
  {
164
  "epoch": 1.0101010101010102,
165
- "grad_norm": 1.0546807050704956,
166
  "learning_rate": 8.866442199775533e-05,
167
- "loss": 4.0864,
168
  "step": 1800
169
  },
170
  {
171
  "epoch": 1.0662177328843996,
172
- "grad_norm": 1.0770821571350098,
173
  "learning_rate": 8.741738371368002e-05,
174
- "loss": 4.0053,
175
  "step": 1900
176
  },
177
  {
178
  "epoch": 1.122334455667789,
179
- "grad_norm": 0.9692613482475281,
180
  "learning_rate": 8.617034542960469e-05,
181
- "loss": 3.9585,
182
  "step": 2000
183
  },
184
  {
185
  "epoch": 1.122334455667789,
186
- "eval_accuracy": 0.34663219226844794,
187
- "eval_bleu": 0.09053001364011155,
188
- "eval_loss": 3.9252164363861084,
189
- "eval_perplexity": 50.66404241449891,
190
- "eval_runtime": 12.3364,
191
- "eval_samples_per_second": 92.491,
192
- "eval_steps_per_second": 1.459,
193
  "step": 2000
194
  },
195
  {
196
  "epoch": 1.1784511784511784,
197
- "grad_norm": 0.9078754782676697,
198
  "learning_rate": 8.492330714552937e-05,
199
- "loss": 3.9194,
200
  "step": 2100
201
  },
202
  {
203
  "epoch": 1.2345679012345678,
204
- "grad_norm": 1.207867980003357,
205
  "learning_rate": 8.367626886145406e-05,
206
- "loss": 3.8743,
207
  "step": 2200
208
  },
209
  {
210
  "epoch": 1.2906846240179575,
211
- "grad_norm": 1.2701904773712158,
212
  "learning_rate": 8.242923057737873e-05,
213
- "loss": 3.8596,
214
  "step": 2300
215
  },
216
  {
217
  "epoch": 1.3468013468013469,
218
- "grad_norm": 1.4610146284103394,
219
  "learning_rate": 8.11821922933034e-05,
220
- "loss": 3.821,
221
  "step": 2400
222
  },
223
  {
224
  "epoch": 1.4029180695847363,
225
- "grad_norm": 2.153532028198242,
226
  "learning_rate": 7.993515400922809e-05,
227
- "loss": 3.7838,
228
  "step": 2500
229
  },
230
  {
231
  "epoch": 1.4029180695847363,
232
- "eval_accuracy": 0.36260658663191814,
233
- "eval_bleu": 0.09858369505966838,
234
- "eval_loss": 3.7564430236816406,
235
- "eval_perplexity": 42.7959308162683,
236
- "eval_runtime": 12.2808,
237
- "eval_samples_per_second": 92.909,
238
- "eval_steps_per_second": 1.466,
239
  "step": 2500
240
  },
241
  {
242
  "epoch": 1.4590347923681257,
243
- "grad_norm": 1.3955634832382202,
244
  "learning_rate": 7.868811572515277e-05,
245
- "loss": 3.7668,
246
  "step": 2600
247
  },
248
  {
249
  "epoch": 1.5151515151515151,
250
- "grad_norm": 1.3527532815933228,
251
  "learning_rate": 7.744107744107744e-05,
252
- "loss": 3.7424,
253
  "step": 2700
254
  },
255
  {
256
  "epoch": 1.5712682379349046,
257
- "grad_norm": 0.9464800357818604,
258
  "learning_rate": 7.619403915700213e-05,
259
- "loss": 3.708,
260
  "step": 2800
261
  },
262
  {
263
  "epoch": 1.627384960718294,
264
- "grad_norm": 1.9855220317840576,
265
  "learning_rate": 7.49470008729268e-05,
266
- "loss": 3.6981,
267
  "step": 2900
268
  },
269
  {
270
  "epoch": 1.6835016835016834,
271
- "grad_norm": 2.4134933948516846,
272
  "learning_rate": 7.369996258885148e-05,
273
- "loss": 3.6863,
274
  "step": 3000
275
  },
276
  {
277
  "epoch": 1.6835016835016834,
278
- "eval_accuracy": 0.3738518885956052,
279
- "eval_bleu": 0.10686414409488354,
280
- "eval_loss": 3.6387925148010254,
281
- "eval_perplexity": 38.04586915451721,
282
- "eval_runtime": 12.2104,
283
- "eval_samples_per_second": 93.445,
284
- "eval_steps_per_second": 1.474,
285
  "step": 3000
286
  },
287
  {
288
  "epoch": 1.7396184062850728,
289
- "grad_norm": 2.7664296627044678,
290
  "learning_rate": 7.245292430477615e-05,
291
- "loss": 3.6503,
292
  "step": 3100
293
  },
294
  {
295
  "epoch": 1.7957351290684624,
296
- "grad_norm": 2.4914608001708984,
297
  "learning_rate": 7.120588602070084e-05,
298
- "loss": 3.6382,
299
  "step": 3200
300
  },
301
  {
302
  "epoch": 1.8518518518518519,
303
- "grad_norm": 2.2810556888580322,
304
  "learning_rate": 6.995884773662552e-05,
305
- "loss": 3.6126,
306
  "step": 3300
307
  },
308
  {
309
  "epoch": 1.9079685746352413,
310
- "grad_norm": 1.139969825744629,
311
  "learning_rate": 6.871180945255021e-05,
312
- "loss": 3.6042,
313
  "step": 3400
314
  },
315
  {
316
  "epoch": 1.964085297418631,
317
- "grad_norm": 1.3077082633972168,
318
  "learning_rate": 6.746477116847487e-05,
319
- "loss": 3.5869,
320
  "step": 3500
321
  },
322
  {
323
  "epoch": 1.964085297418631,
324
- "eval_accuracy": 0.3825578735533218,
325
- "eval_bleu": 0.11004806187463387,
326
- "eval_loss": 3.5517890453338623,
327
- "eval_perplexity": 34.875655837249155,
328
- "eval_runtime": 12.1547,
329
- "eval_samples_per_second": 93.873,
330
- "eval_steps_per_second": 1.481,
331
  "step": 3500
332
  },
333
  {
334
  "epoch": 2.0202020202020203,
335
- "grad_norm": 2.2133948802948,
336
  "learning_rate": 6.621773288439955e-05,
337
- "loss": 3.5521,
338
  "step": 3600
339
  },
340
  {
341
  "epoch": 2.0763187429854097,
342
- "grad_norm": 1.2750334739685059,
343
  "learning_rate": 6.497069460032424e-05,
344
- "loss": 3.4978,
345
  "step": 3700
346
  },
347
  {
348
  "epoch": 2.132435465768799,
349
- "grad_norm": 1.4778505563735962,
350
  "learning_rate": 6.372365631624892e-05,
351
- "loss": 3.4785,
352
  "step": 3800
353
  },
354
  {
355
  "epoch": 2.1885521885521886,
356
- "grad_norm": 1.5821393728256226,
357
  "learning_rate": 6.247661803217359e-05,
358
- "loss": 3.4804,
359
  "step": 3900
360
  },
361
  {
362
  "epoch": 2.244668911335578,
363
- "grad_norm": 1.2070300579071045,
364
  "learning_rate": 6.122957974809826e-05,
365
- "loss": 3.4733,
366
  "step": 4000
367
  },
368
  {
369
  "epoch": 2.244668911335578,
370
- "eval_accuracy": 0.38860631419507335,
371
- "eval_bleu": 0.11590933164079384,
372
- "eval_loss": 3.4845945835113525,
373
- "eval_perplexity": 32.60920412059944,
374
- "eval_runtime": 12.166,
375
- "eval_samples_per_second": 93.786,
376
- "eval_steps_per_second": 1.48,
377
  "step": 4000
378
  },
379
  {
380
  "epoch": 2.3007856341189674,
381
- "grad_norm": 1.0372384786605835,
382
  "learning_rate": 5.998254146402295e-05,
383
- "loss": 3.4604,
384
  "step": 4100
385
  },
386
  {
387
  "epoch": 2.356902356902357,
388
- "grad_norm": 1.3492435216903687,
389
  "learning_rate": 5.8735503179947625e-05,
390
- "loss": 3.4395,
391
  "step": 4200
392
  },
393
  {
394
  "epoch": 2.4130190796857462,
395
- "grad_norm": 1.0056568384170532,
396
  "learning_rate": 5.748846489587231e-05,
397
- "loss": 3.4359,
398
  "step": 4300
399
  },
400
  {
401
  "epoch": 2.4691358024691357,
402
- "grad_norm": 2.51788067817688,
403
  "learning_rate": 5.624142661179699e-05,
404
- "loss": 3.4198,
405
  "step": 4400
406
  },
407
  {
408
  "epoch": 2.525252525252525,
409
- "grad_norm": 2.3806328773498535,
410
  "learning_rate": 5.4994388327721666e-05,
411
- "loss": 3.4122,
412
  "step": 4500
413
  },
414
  {
415
  "epoch": 2.525252525252525,
416
- "eval_accuracy": 0.39410559754909646,
417
- "eval_bleu": 0.12123148173651561,
418
- "eval_loss": 3.430687665939331,
419
- "eval_perplexity": 30.897882867432642,
420
- "eval_runtime": 12.1642,
421
- "eval_samples_per_second": 93.8,
422
- "eval_steps_per_second": 1.48,
423
  "step": 4500
424
  },
425
  {
426
  "epoch": 2.581369248035915,
427
- "grad_norm": 1.4060360193252563,
428
  "learning_rate": 5.374735004364634e-05,
429
- "loss": 3.4137,
430
  "step": 4600
431
  },
432
  {
433
  "epoch": 2.637485970819304,
434
- "grad_norm": 1.47615647315979,
435
  "learning_rate": 5.250031175957102e-05,
436
- "loss": 3.3987,
437
  "step": 4700
438
  },
439
  {
440
  "epoch": 2.6936026936026938,
441
- "grad_norm": 3.111424446105957,
442
  "learning_rate": 5.12532734754957e-05,
443
- "loss": 3.3934,
444
  "step": 4800
445
  },
446
  {
447
  "epoch": 2.749719416386083,
448
- "grad_norm": 1.385186791419983,
449
  "learning_rate": 5.000623519142038e-05,
450
- "loss": 3.3842,
451
  "step": 4900
452
  },
453
  {
454
  "epoch": 2.8058361391694726,
455
- "grad_norm": 1.4257937669754028,
456
  "learning_rate": 4.8759196907345056e-05,
457
- "loss": 3.3791,
458
  "step": 5000
459
  },
460
  {
461
  "epoch": 2.8058361391694726,
462
- "eval_accuracy": 0.39906000721357937,
463
- "eval_bleu": 0.12230826630862061,
464
- "eval_loss": 3.380352735519409,
465
- "eval_perplexity": 29.38113305489852,
466
- "eval_runtime": 12.1813,
467
- "eval_samples_per_second": 93.668,
468
- "eval_steps_per_second": 1.478,
469
  "step": 5000
470
  },
471
  {
472
  "epoch": 2.861952861952862,
473
- "grad_norm": 2.4099814891815186,
474
  "learning_rate": 4.751215862326974e-05,
475
- "loss": 3.3747,
476
  "step": 5100
477
  },
478
  {
479
  "epoch": 2.9180695847362514,
480
- "grad_norm": 2.5423948764801025,
481
  "learning_rate": 4.626512033919442e-05,
482
- "loss": 3.3545,
483
  "step": 5200
484
  },
485
  {
486
  "epoch": 2.974186307519641,
487
- "grad_norm": 2.864335775375366,
488
  "learning_rate": 4.5018082055119096e-05,
489
- "loss": 3.3586,
490
  "step": 5300
491
  },
492
  {
493
  "epoch": 3.0303030303030303,
494
- "grad_norm": 2.2575385570526123,
495
  "learning_rate": 4.3771043771043774e-05,
496
- "loss": 3.3028,
497
  "step": 5400
498
  },
499
  {
500
  "epoch": 3.0864197530864197,
501
- "grad_norm": 2.1973252296447754,
502
  "learning_rate": 4.252400548696845e-05,
503
- "loss": 3.2616,
504
  "step": 5500
505
  },
506
  {
507
  "epoch": 3.0864197530864197,
508
- "eval_accuracy": 0.40264623561674817,
509
- "eval_bleu": 0.12218084899348862,
510
- "eval_loss": 3.3446905612945557,
511
- "eval_perplexity": 28.35180116448892,
512
- "eval_runtime": 12.1937,
513
- "eval_samples_per_second": 93.573,
514
- "eval_steps_per_second": 1.476,
515
  "step": 5500
516
  },
517
  {
518
  "epoch": 3.142536475869809,
519
- "grad_norm": 1.4363641738891602,
520
  "learning_rate": 4.127696720289313e-05,
521
- "loss": 3.2647,
522
  "step": 5600
523
  },
524
  {
525
  "epoch": 3.1986531986531985,
526
- "grad_norm": 1.4158735275268555,
527
  "learning_rate": 4.002992891881781e-05,
528
- "loss": 3.2614,
529
  "step": 5700
530
  },
531
  {
532
  "epoch": 3.254769921436588,
533
- "grad_norm": 1.5934867858886719,
534
  "learning_rate": 3.8782890634742486e-05,
535
- "loss": 3.2573,
536
  "step": 5800
537
  },
538
  {
539
  "epoch": 3.3108866442199774,
540
- "grad_norm": 1.3518158197402954,
541
  "learning_rate": 3.7535852350667164e-05,
542
- "loss": 3.2584,
543
  "step": 5900
544
  },
545
  {
546
  "epoch": 3.3670033670033668,
547
- "grad_norm": 1.7532979249954224,
548
  "learning_rate": 3.628881406659185e-05,
549
- "loss": 3.2499,
550
  "step": 6000
551
  },
552
  {
553
  "epoch": 3.3670033670033668,
554
- "eval_accuracy": 0.4066985195027942,
555
- "eval_bleu": 0.1260881414008664,
556
- "eval_loss": 3.3095922470092773,
557
- "eval_perplexity": 27.373961381341072,
558
- "eval_runtime": 12.162,
559
- "eval_samples_per_second": 93.817,
560
- "eval_steps_per_second": 1.48,
561
  "step": 6000
562
  },
563
  {
564
  "epoch": 3.4231200897867566,
565
- "grad_norm": 1.3179600238800049,
566
  "learning_rate": 3.504177578251652e-05,
567
- "loss": 3.2306,
568
  "step": 6100
569
  },
570
  {
571
  "epoch": 3.479236812570146,
572
- "grad_norm": 3.5212039947509766,
573
  "learning_rate": 3.3794737498441205e-05,
574
- "loss": 3.2421,
575
  "step": 6200
576
  },
577
  {
578
  "epoch": 3.5353535353535355,
579
- "grad_norm": 1.3254704475402832,
580
  "learning_rate": 3.254769921436588e-05,
581
- "loss": 3.2305,
582
  "step": 6300
583
  },
584
  {
585
  "epoch": 3.591470258136925,
586
- "grad_norm": 3.1436195373535156,
587
  "learning_rate": 3.130066093029056e-05,
588
- "loss": 3.2314,
589
  "step": 6400
590
  },
591
  {
592
  "epoch": 3.6475869809203143,
593
- "grad_norm": 2.763392210006714,
594
  "learning_rate": 3.0053622646215242e-05,
595
- "loss": 3.2277,
596
  "step": 6500
597
  },
598
  {
599
  "epoch": 3.6475869809203143,
600
- "eval_accuracy": 0.41002601857539517,
601
- "eval_bleu": 0.12992587011026865,
602
- "eval_loss": 3.2811858654022217,
603
- "eval_perplexity": 26.607306683090435,
604
- "eval_runtime": 12.2242,
605
- "eval_samples_per_second": 93.34,
606
- "eval_steps_per_second": 1.472,
607
  "step": 6500
608
  },
609
  {
610
  "epoch": 3.7037037037037037,
611
- "grad_norm": 1.2504295110702515,
612
  "learning_rate": 2.880658436213992e-05,
613
- "loss": 3.219,
614
  "step": 6600
615
  },
616
  {
617
  "epoch": 3.759820426487093,
618
- "grad_norm": 1.316945195198059,
619
  "learning_rate": 2.7559546078064598e-05,
620
- "loss": 3.2109,
621
  "step": 6700
622
  },
623
  {
624
  "epoch": 3.8159371492704826,
625
- "grad_norm": 1.9210931062698364,
626
  "learning_rate": 2.6312507793989276e-05,
627
- "loss": 3.1992,
628
  "step": 6800
629
  },
630
  {
631
  "epoch": 3.872053872053872,
632
- "grad_norm": 1.6544625759124756,
633
  "learning_rate": 2.5065469509913957e-05,
634
- "loss": 3.1897,
635
  "step": 6900
636
  },
637
  {
638
  "epoch": 3.9281705948372614,
639
- "grad_norm": 1.2679674625396729,
640
  "learning_rate": 2.3818431225838632e-05,
641
- "loss": 3.1992,
642
  "step": 7000
643
  },
644
  {
645
  "epoch": 3.9281705948372614,
646
- "eval_accuracy": 0.4128223514726582,
647
- "eval_bleu": 0.1304892476483967,
648
- "eval_loss": 3.252329111099243,
649
- "eval_perplexity": 25.850478491671673,
650
- "eval_runtime": 12.1672,
651
- "eval_samples_per_second": 93.777,
652
  "eval_steps_per_second": 1.479,
653
  "step": 7000
654
  },
655
  {
656
  "epoch": 3.984287317620651,
657
- "grad_norm": 1.1226177215576172,
658
  "learning_rate": 2.2571392941763313e-05,
659
- "loss": 3.1958,
660
  "step": 7100
661
  },
662
  {
663
  "epoch": 4.040404040404041,
664
- "grad_norm": 1.0558810234069824,
665
  "learning_rate": 2.132435465768799e-05,
666
- "loss": 3.13,
667
  "step": 7200
668
  },
669
  {
670
  "epoch": 4.09652076318743,
671
- "grad_norm": 1.1988404989242554,
672
  "learning_rate": 2.007731637361267e-05,
673
- "loss": 3.1249,
674
  "step": 7300
675
  },
676
  {
677
  "epoch": 4.1526374859708195,
678
- "grad_norm": 1.914546251296997,
679
  "learning_rate": 1.883027808953735e-05,
680
- "loss": 3.1276,
681
  "step": 7400
682
  },
683
  {
684
  "epoch": 4.2087542087542085,
685
- "grad_norm": 1.7904256582260132,
686
  "learning_rate": 1.758323980546203e-05,
687
- "loss": 3.13,
688
  "step": 7500
689
  },
690
  {
691
  "epoch": 4.2087542087542085,
692
- "eval_accuracy": 0.4154396299656541,
693
- "eval_bleu": 0.13260343650795992,
694
- "eval_loss": 3.233205795288086,
695
- "eval_perplexity": 25.360828416165713,
696
- "eval_runtime": 12.2696,
697
- "eval_samples_per_second": 92.994,
698
- "eval_steps_per_second": 1.467,
699
  "step": 7500
700
  },
701
  {
702
  "epoch": 4.264870931537598,
703
- "grad_norm": 2.1336634159088135,
704
  "learning_rate": 1.6336201521386706e-05,
705
- "loss": 3.1166,
706
  "step": 7600
707
  },
708
  {
709
  "epoch": 4.320987654320987,
710
- "grad_norm": 1.4960366487503052,
711
  "learning_rate": 1.5089163237311384e-05,
712
- "loss": 3.1033,
713
  "step": 7700
714
  },
715
  {
716
  "epoch": 4.377104377104377,
717
- "grad_norm": 1.4756426811218262,
718
  "learning_rate": 1.3842124953236066e-05,
719
- "loss": 3.1145,
720
  "step": 7800
721
  },
722
  {
723
  "epoch": 4.433221099887767,
724
- "grad_norm": 1.741774082183838,
725
  "learning_rate": 1.2595086669160744e-05,
726
- "loss": 3.1115,
727
  "step": 7900
728
  },
729
  {
730
  "epoch": 4.489337822671156,
731
- "grad_norm": 1.0106322765350342,
732
  "learning_rate": 1.1348048385085423e-05,
733
- "loss": 3.0915,
734
  "step": 8000
735
  },
736
  {
737
  "epoch": 4.489337822671156,
738
- "eval_accuracy": 0.4168352262553727,
739
- "eval_bleu": 0.1317196572953092,
740
- "eval_loss": 3.220038414001465,
741
- "eval_perplexity": 25.029081630049546,
742
- "eval_runtime": 12.1779,
743
- "eval_samples_per_second": 93.694,
744
- "eval_steps_per_second": 1.478,
745
  "step": 8000
746
  },
747
  {
748
  "epoch": 4.545454545454545,
749
- "grad_norm": 1.2306262254714966,
750
  "learning_rate": 1.0101010101010101e-05,
751
- "loss": 3.1086,
752
  "step": 8100
753
  },
754
  {
755
  "epoch": 4.601571268237935,
756
- "grad_norm": 1.141204833984375,
757
  "learning_rate": 8.853971816934781e-06,
758
- "loss": 3.0995,
759
  "step": 8200
760
  },
761
  {
762
  "epoch": 4.657687991021325,
763
- "grad_norm": 2.441620349884033,
764
  "learning_rate": 7.606933532859459e-06,
765
- "loss": 3.106,
766
  "step": 8300
767
  },
768
  {
769
  "epoch": 4.713804713804714,
770
- "grad_norm": 0.9955108761787415,
771
  "learning_rate": 6.359895248784138e-06,
772
- "loss": 3.0982,
773
  "step": 8400
774
  },
775
  {
776
  "epoch": 4.7699214365881035,
777
- "grad_norm": 1.119258165359497,
778
  "learning_rate": 5.112856964708817e-06,
779
- "loss": 3.1011,
780
  "step": 8500
781
  },
782
  {
783
  "epoch": 4.7699214365881035,
784
- "eval_accuracy": 0.4185786507179739,
785
- "eval_bleu": 0.13349497596917598,
786
- "eval_loss": 3.2064151763916016,
787
- "eval_perplexity": 24.690416598736768,
788
- "eval_runtime": 12.2406,
789
- "eval_samples_per_second": 93.214,
790
- "eval_steps_per_second": 1.471,
791
  "step": 8500
792
  },
793
  {
794
  "epoch": 4.8260381593714925,
795
- "grad_norm": 1.438043236732483,
796
  "learning_rate": 3.865818680633495e-06,
797
- "loss": 3.0915,
798
  "step": 8600
799
  },
800
  {
801
  "epoch": 4.882154882154882,
802
- "grad_norm": 1.4048082828521729,
803
  "learning_rate": 2.6187803965581742e-06,
804
- "loss": 3.1015,
805
  "step": 8700
806
  },
807
  {
808
  "epoch": 4.938271604938271,
809
- "grad_norm": 1.0656932592391968,
810
  "learning_rate": 1.3717421124828533e-06,
811
- "loss": 3.0977,
812
  "step": 8800
813
  },
814
  {
815
  "epoch": 4.994388327721661,
816
- "grad_norm": 0.9609319567680359,
817
  "learning_rate": 1.2470382840753213e-07,
818
- "loss": 3.0882,
819
  "step": 8900
820
  },
821
  {
822
  "epoch": 5.0,
823
  "step": 8910,
824
  "total_flos": 1.0587061010143642e+18,
825
- "train_loss": 3.7750147520893753,
826
- "train_runtime": 15697.9119,
827
- "train_samples_per_second": 36.309,
828
- "train_steps_per_second": 0.568
829
  }
830
  ],
831
  "logging_steps": 100,
 
1
  {
2
+ "best_metric": 3.1984846591949463,
3
  "best_model_checkpoint": "./output/models/rotating-head-gp-gpt2-medium-wikitext/checkpoint-8500",
4
  "epoch": 5.0,
5
  "eval_steps": 500,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05611672278338945,
13
+ "grad_norm": 1.7026984691619873,
14
  "learning_rate": 1.1223344556677892e-05,
15
  "loss": 8.993,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.1122334455667789,
20
+ "grad_norm": 1.4564013481140137,
21
  "learning_rate": 2.2446689113355783e-05,
22
  "loss": 7.3838,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.16835016835016836,
27
+ "grad_norm": 1.6578171253204346,
28
  "learning_rate": 3.3670033670033675e-05,
29
+ "loss": 6.5796,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.2244668911335578,
34
+ "grad_norm": 2.0035881996154785,
35
  "learning_rate": 4.4893378226711566e-05,
36
+ "loss": 6.1995,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.28058361391694725,
41
+ "grad_norm": 1.3597742319107056,
42
  "learning_rate": 5.611672278338945e-05,
43
+ "loss": 5.9062,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.28058361391694725,
48
+ "eval_accuracy": 0.22335966032779805,
49
+ "eval_bleu": 0.0492964548196352,
50
+ "eval_loss": 5.746989727020264,
51
+ "eval_perplexity": 313.2462827607658,
52
+ "eval_runtime": 12.1113,
53
+ "eval_samples_per_second": 94.209,
54
+ "eval_steps_per_second": 1.486,
55
  "step": 500
56
  },
57
  {
58
  "epoch": 0.3367003367003367,
59
+ "grad_norm": 1.2420822381973267,
60
  "learning_rate": 6.734006734006735e-05,
61
+ "loss": 5.6583,
62
  "step": 600
63
  },
64
  {
65
  "epoch": 0.39281705948372614,
66
+ "grad_norm": 1.7358133792877197,
67
  "learning_rate": 7.856341189674523e-05,
68
+ "loss": 5.4354,
69
  "step": 700
70
  },
71
  {
72
  "epoch": 0.4489337822671156,
73
+ "grad_norm": 1.1559091806411743,
74
  "learning_rate": 8.978675645342313e-05,
75
+ "loss": 5.252,
76
  "step": 800
77
  },
78
  {
79
  "epoch": 0.5050505050505051,
80
+ "grad_norm": 1.1872116327285767,
81
  "learning_rate": 9.988776655443322e-05,
82
+ "loss": 5.0452,
83
  "step": 900
84
  },
85
  {
86
  "epoch": 0.5611672278338945,
87
+ "grad_norm": 1.0503571033477783,
88
  "learning_rate": 9.864072827035791e-05,
89
+ "loss": 4.8598,
90
  "step": 1000
91
  },
92
  {
93
  "epoch": 0.5611672278338945,
94
+ "eval_accuracy": 0.28109999374594663,
95
+ "eval_bleu": 0.06981793198064049,
96
+ "eval_loss": 4.74280309677124,
97
+ "eval_perplexity": 114.75542181664323,
98
+ "eval_runtime": 12.1357,
99
+ "eval_samples_per_second": 94.02,
100
+ "eval_steps_per_second": 1.483,
101
  "step": 1000
102
  },
103
  {
104
  "epoch": 0.6172839506172839,
105
+ "grad_norm": 0.9331828951835632,
106
  "learning_rate": 9.73936899862826e-05,
107
+ "loss": 4.7116,
108
  "step": 1100
109
  },
110
  {
111
  "epoch": 0.6734006734006734,
112
+ "grad_norm": 0.8568651676177979,
113
  "learning_rate": 9.614665170220725e-05,
114
+ "loss": 4.5871,
115
  "step": 1200
116
  },
117
  {
118
  "epoch": 0.7295173961840629,
119
+ "grad_norm": 0.7943041920661926,
120
  "learning_rate": 9.489961341813194e-05,
121
+ "loss": 4.48,
122
  "step": 1300
123
  },
124
  {
125
  "epoch": 0.7856341189674523,
126
+ "grad_norm": 0.929481029510498,
127
  "learning_rate": 9.365257513405662e-05,
128
+ "loss": 4.3936,
129
  "step": 1400
130
  },
131
  {
132
  "epoch": 0.8417508417508418,
133
+ "grad_norm": 0.9010036587715149,
134
  "learning_rate": 9.24055368499813e-05,
135
+ "loss": 4.3025,
136
  "step": 1500
137
  },
138
  {
139
  "epoch": 0.8417508417508418,
140
+ "eval_accuracy": 0.31702567503082046,
141
+ "eval_bleu": 0.08337643356737867,
142
+ "eval_loss": 4.23293399810791,
143
+ "eval_perplexity": 68.91914446331879,
144
+ "eval_runtime": 12.1629,
145
+ "eval_samples_per_second": 93.809,
146
+ "eval_steps_per_second": 1.48,
147
  "step": 1500
148
  },
149
  {
150
  "epoch": 0.8978675645342312,
151
+ "grad_norm": 0.8791123628616333,
152
  "learning_rate": 9.115849856590598e-05,
153
+ "loss": 4.2455,
154
  "step": 1600
155
  },
156
  {
157
  "epoch": 0.9539842873176206,
158
+ "grad_norm": 0.938328742980957,
159
  "learning_rate": 8.991146028183066e-05,
160
+ "loss": 4.1605,
161
  "step": 1700
162
  },
163
  {
164
  "epoch": 1.0101010101010102,
165
+ "grad_norm": 1.0388487577438354,
166
  "learning_rate": 8.866442199775533e-05,
167
+ "loss": 4.1011,
168
  "step": 1800
169
  },
170
  {
171
  "epoch": 1.0662177328843996,
172
+ "grad_norm": 0.9663709998130798,
173
  "learning_rate": 8.741738371368002e-05,
174
+ "loss": 4.0148,
175
  "step": 1900
176
  },
177
  {
178
  "epoch": 1.122334455667789,
179
+ "grad_norm": 0.8401734828948975,
180
  "learning_rate": 8.617034542960469e-05,
181
+ "loss": 3.9635,
182
  "step": 2000
183
  },
184
  {
185
  "epoch": 1.122334455667789,
186
+ "eval_accuracy": 0.34535482328872397,
187
+ "eval_bleu": 0.09320946829041966,
188
+ "eval_loss": 3.9290566444396973,
189
+ "eval_perplexity": 50.85897693319932,
190
+ "eval_runtime": 12.2142,
191
+ "eval_samples_per_second": 93.416,
192
+ "eval_steps_per_second": 1.474,
193
  "step": 2000
194
  },
195
  {
196
  "epoch": 1.1784511784511784,
197
+ "grad_norm": 1.1741111278533936,
198
  "learning_rate": 8.492330714552937e-05,
199
+ "loss": 3.9219,
200
  "step": 2100
201
  },
202
  {
203
  "epoch": 1.2345679012345678,
204
+ "grad_norm": 1.1291131973266602,
205
  "learning_rate": 8.367626886145406e-05,
206
+ "loss": 3.876,
207
  "step": 2200
208
  },
209
  {
210
  "epoch": 1.2906846240179575,
211
+ "grad_norm": 1.0407253503799438,
212
  "learning_rate": 8.242923057737873e-05,
213
+ "loss": 3.8557,
214
  "step": 2300
215
  },
216
  {
217
  "epoch": 1.3468013468013469,
218
+ "grad_norm": 1.0766143798828125,
219
  "learning_rate": 8.11821922933034e-05,
220
+ "loss": 3.8141,
221
  "step": 2400
222
  },
223
  {
224
  "epoch": 1.4029180695847363,
225
+ "grad_norm": 1.42284095287323,
226
  "learning_rate": 7.993515400922809e-05,
227
+ "loss": 3.7769,
228
  "step": 2500
229
  },
230
  {
231
  "epoch": 1.4029180695847363,
232
+ "eval_accuracy": 0.36358581717774274,
233
+ "eval_bleu": 0.10197996736893164,
234
+ "eval_loss": 3.7426531314849854,
235
+ "eval_perplexity": 42.209829965401774,
236
+ "eval_runtime": 12.1768,
237
+ "eval_samples_per_second": 93.703,
238
+ "eval_steps_per_second": 1.478,
239
  "step": 2500
240
  },
241
  {
242
  "epoch": 1.4590347923681257,
243
+ "grad_norm": 1.3685775995254517,
244
  "learning_rate": 7.868811572515277e-05,
245
+ "loss": 3.7592,
246
  "step": 2600
247
  },
248
  {
249
  "epoch": 1.5151515151515151,
250
+ "grad_norm": 1.6901907920837402,
251
  "learning_rate": 7.744107744107744e-05,
252
+ "loss": 3.7311,
253
  "step": 2700
254
  },
255
  {
256
  "epoch": 1.5712682379349046,
257
+ "grad_norm": 1.2826731204986572,
258
  "learning_rate": 7.619403915700213e-05,
259
+ "loss": 3.6989,
260
  "step": 2800
261
  },
262
  {
263
  "epoch": 1.627384960718294,
264
+ "grad_norm": 1.1707065105438232,
265
  "learning_rate": 7.49470008729268e-05,
266
+ "loss": 3.6858,
267
  "step": 2900
268
  },
269
  {
270
  "epoch": 1.6835016835016834,
271
+ "grad_norm": 1.8538202047348022,
272
  "learning_rate": 7.369996258885148e-05,
273
+ "loss": 3.6738,
274
  "step": 3000
275
  },
276
  {
277
  "epoch": 1.6835016835016834,
278
+ "eval_accuracy": 0.3753888436255347,
279
+ "eval_bleu": 0.1065937514211335,
280
+ "eval_loss": 3.6224589347839355,
281
+ "eval_perplexity": 37.429491437089155,
282
+ "eval_runtime": 12.1708,
283
+ "eval_samples_per_second": 93.749,
284
+ "eval_steps_per_second": 1.479,
285
  "step": 3000
286
  },
287
  {
288
  "epoch": 1.7396184062850728,
289
+ "grad_norm": 1.2428492307662964,
290
  "learning_rate": 7.245292430477615e-05,
291
+ "loss": 3.6346,
292
  "step": 3100
293
  },
294
  {
295
  "epoch": 1.7957351290684624,
296
+ "grad_norm": 1.5533177852630615,
297
  "learning_rate": 7.120588602070084e-05,
298
+ "loss": 3.6228,
299
  "step": 3200
300
  },
301
  {
302
  "epoch": 1.8518518518518519,
303
+ "grad_norm": 1.538769006729126,
304
  "learning_rate": 6.995884773662552e-05,
305
+ "loss": 3.5964,
306
  "step": 3300
307
  },
308
  {
309
  "epoch": 1.9079685746352413,
310
+ "grad_norm": 0.9837027788162231,
311
  "learning_rate": 6.871180945255021e-05,
312
+ "loss": 3.5916,
313
  "step": 3400
314
  },
315
  {
316
  "epoch": 1.964085297418631,
317
+ "grad_norm": 1.3107187747955322,
318
  "learning_rate": 6.746477116847487e-05,
319
+ "loss": 3.5744,
320
  "step": 3500
321
  },
322
  {
323
  "epoch": 1.964085297418631,
324
+ "eval_accuracy": 0.38449063305584186,
325
+ "eval_bleu": 0.11184212819864767,
326
+ "eval_loss": 3.5325236320495605,
327
+ "eval_perplexity": 34.21019270752698,
328
+ "eval_runtime": 12.1937,
329
+ "eval_samples_per_second": 93.573,
330
+ "eval_steps_per_second": 1.476,
331
  "step": 3500
332
  },
333
  {
334
  "epoch": 2.0202020202020203,
335
+ "grad_norm": 1.3989722728729248,
336
  "learning_rate": 6.621773288439955e-05,
337
+ "loss": 3.5381,
338
  "step": 3600
339
  },
340
  {
341
  "epoch": 2.0763187429854097,
342
+ "grad_norm": 1.5801029205322266,
343
  "learning_rate": 6.497069460032424e-05,
344
+ "loss": 3.4865,
345
  "step": 3700
346
  },
347
  {
348
  "epoch": 2.132435465768799,
349
+ "grad_norm": 1.2122889757156372,
350
  "learning_rate": 6.372365631624892e-05,
351
+ "loss": 3.4665,
352
  "step": 3800
353
  },
354
  {
355
  "epoch": 2.1885521885521886,
356
+ "grad_norm": 2.2837603092193604,
357
  "learning_rate": 6.247661803217359e-05,
358
+ "loss": 3.4675,
359
  "step": 3900
360
  },
361
  {
362
  "epoch": 2.244668911335578,
363
+ "grad_norm": 1.1615939140319824,
364
  "learning_rate": 6.122957974809826e-05,
365
+ "loss": 3.456,
366
  "step": 4000
367
  },
368
  {
369
  "epoch": 2.244668911335578,
370
+ "eval_accuracy": 0.3902143769549271,
371
+ "eval_bleu": 0.11388067737811994,
372
+ "eval_loss": 3.4704020023345947,
373
+ "eval_perplexity": 32.149664087333434,
374
+ "eval_runtime": 12.121,
375
+ "eval_samples_per_second": 94.134,
376
+ "eval_steps_per_second": 1.485,
377
  "step": 4000
378
  },
379
  {
380
  "epoch": 2.3007856341189674,
381
+ "grad_norm": 0.9502741694450378,
382
  "learning_rate": 5.998254146402295e-05,
383
+ "loss": 3.4467,
384
  "step": 4100
385
  },
386
  {
387
  "epoch": 2.356902356902357,
388
+ "grad_norm": 1.7536747455596924,
389
  "learning_rate": 5.8735503179947625e-05,
390
+ "loss": 3.4298,
391
  "step": 4200
392
  },
393
  {
394
  "epoch": 2.4130190796857462,
395
+ "grad_norm": 1.4674713611602783,
396
  "learning_rate": 5.748846489587231e-05,
397
+ "loss": 3.4221,
398
  "step": 4300
399
  },
400
  {
401
  "epoch": 2.4691358024691357,
402
+ "grad_norm": 2.0394678115844727,
403
  "learning_rate": 5.624142661179699e-05,
404
+ "loss": 3.4072,
405
  "step": 4400
406
  },
407
  {
408
  "epoch": 2.525252525252525,
409
+ "grad_norm": 2.8717079162597656,
410
  "learning_rate": 5.4994388327721666e-05,
411
+ "loss": 3.3972,
412
  "step": 4500
413
  },
414
  {
415
  "epoch": 2.525252525252525,
416
+ "eval_accuracy": 0.3955106177548291,
417
+ "eval_bleu": 0.12298741482321522,
418
+ "eval_loss": 3.4189839363098145,
419
+ "eval_perplexity": 30.53837032278329,
420
+ "eval_runtime": 12.1708,
421
+ "eval_samples_per_second": 93.749,
422
+ "eval_steps_per_second": 1.479,
423
  "step": 4500
424
  },
425
  {
426
  "epoch": 2.581369248035915,
427
+ "grad_norm": 2.2189624309539795,
428
  "learning_rate": 5.374735004364634e-05,
429
+ "loss": 3.4006,
430
  "step": 4600
431
  },
432
  {
433
  "epoch": 2.637485970819304,
434
+ "grad_norm": 1.444754719734192,
435
  "learning_rate": 5.250031175957102e-05,
436
+ "loss": 3.3886,
437
  "step": 4700
438
  },
439
  {
440
  "epoch": 2.6936026936026938,
441
+ "grad_norm": 1.8333204984664917,
442
  "learning_rate": 5.12532734754957e-05,
443
+ "loss": 3.3813,
444
  "step": 4800
445
  },
446
  {
447
  "epoch": 2.749719416386083,
448
+ "grad_norm": 2.1033811569213867,
449
  "learning_rate": 5.000623519142038e-05,
450
+ "loss": 3.372,
451
  "step": 4900
452
  },
453
  {
454
  "epoch": 2.8058361391694726,
455
+ "grad_norm": 1.8956849575042725,
456
  "learning_rate": 4.8759196907345056e-05,
457
+ "loss": 3.3654,
458
  "step": 5000
459
  },
460
  {
461
  "epoch": 2.8058361391694726,
462
+ "eval_accuracy": 0.40071690299277873,
463
+ "eval_bleu": 0.12304297750670024,
464
+ "eval_loss": 3.368644952774048,
465
+ "eval_perplexity": 29.039150964630583,
466
+ "eval_runtime": 12.2072,
467
+ "eval_samples_per_second": 93.469,
468
+ "eval_steps_per_second": 1.475,
469
  "step": 5000
470
  },
471
  {
472
  "epoch": 2.861952861952862,
473
+ "grad_norm": 2.0555260181427,
474
  "learning_rate": 4.751215862326974e-05,
475
+ "loss": 3.3622,
476
  "step": 5100
477
  },
478
  {
479
  "epoch": 2.9180695847362514,
480
+ "grad_norm": 1.1492657661437988,
481
  "learning_rate": 4.626512033919442e-05,
482
+ "loss": 3.3413,
483
  "step": 5200
484
  },
485
  {
486
  "epoch": 2.974186307519641,
487
+ "grad_norm": 2.6185925006866455,
488
  "learning_rate": 4.5018082055119096e-05,
489
+ "loss": 3.3452,
490
  "step": 5300
491
  },
492
  {
493
  "epoch": 3.0303030303030303,
494
+ "grad_norm": 1.4890856742858887,
495
  "learning_rate": 4.3771043771043774e-05,
496
+ "loss": 3.2908,
497
  "step": 5400
498
  },
499
  {
500
  "epoch": 3.0864197530864197,
501
+ "grad_norm": 2.316535711288452,
502
  "learning_rate": 4.252400548696845e-05,
503
+ "loss": 3.247,
504
  "step": 5500
505
  },
506
  {
507
  "epoch": 3.0864197530864197,
508
+ "eval_accuracy": 0.40426800589080425,
509
+ "eval_bleu": 0.1247222065482489,
510
+ "eval_loss": 3.3328051567077637,
511
+ "eval_perplexity": 28.016823154790686,
512
+ "eval_runtime": 12.2011,
513
+ "eval_samples_per_second": 93.516,
514
+ "eval_steps_per_second": 1.475,
515
  "step": 5500
516
  },
517
  {
518
  "epoch": 3.142536475869809,
519
+ "grad_norm": 1.5401073694229126,
520
  "learning_rate": 4.127696720289313e-05,
521
+ "loss": 3.253,
522
  "step": 5600
523
  },
524
  {
525
  "epoch": 3.1986531986531985,
526
+ "grad_norm": 1.508957028388977,
527
  "learning_rate": 4.002992891881781e-05,
528
+ "loss": 3.2477,
529
  "step": 5700
530
  },
531
  {
532
  "epoch": 3.254769921436588,
533
+ "grad_norm": 1.5511479377746582,
534
  "learning_rate": 3.8782890634742486e-05,
535
+ "loss": 3.2456,
536
  "step": 5800
537
  },
538
  {
539
  "epoch": 3.3108866442199774,
540
+ "grad_norm": 1.5875085592269897,
541
  "learning_rate": 3.7535852350667164e-05,
542
+ "loss": 3.2472,
543
  "step": 5900
544
  },
545
  {
546
  "epoch": 3.3670033670033668,
547
+ "grad_norm": 1.1592992544174194,
548
  "learning_rate": 3.628881406659185e-05,
549
+ "loss": 3.2403,
550
  "step": 6000
551
  },
552
  {
553
  "epoch": 3.3670033670033668,
554
+ "eval_accuracy": 0.40832457337503847,
555
+ "eval_bleu": 0.129799477764246,
556
+ "eval_loss": 3.298476457595825,
557
+ "eval_perplexity": 27.07136311627636,
558
+ "eval_runtime": 12.1774,
559
+ "eval_samples_per_second": 93.699,
560
+ "eval_steps_per_second": 1.478,
561
  "step": 6000
562
  },
563
  {
564
  "epoch": 3.4231200897867566,
565
+ "grad_norm": 1.2478611469268799,
566
  "learning_rate": 3.504177578251652e-05,
567
+ "loss": 3.2187,
568
  "step": 6100
569
  },
570
  {
571
  "epoch": 3.479236812570146,
572
+ "grad_norm": 1.8688626289367676,
573
  "learning_rate": 3.3794737498441205e-05,
574
+ "loss": 3.2313,
575
  "step": 6200
576
  },
577
  {
578
  "epoch": 3.5353535353535355,
579
+ "grad_norm": 1.8166719675064087,
580
  "learning_rate": 3.254769921436588e-05,
581
+ "loss": 3.2192,
582
  "step": 6300
583
  },
584
  {
585
  "epoch": 3.591470258136925,
586
+ "grad_norm": 1.6677237749099731,
587
  "learning_rate": 3.130066093029056e-05,
588
+ "loss": 3.221,
589
  "step": 6400
590
  },
591
  {
592
  "epoch": 3.6475869809203143,
593
+ "grad_norm": 1.4927235841751099,
594
  "learning_rate": 3.0053622646215242e-05,
595
+ "loss": 3.2167,
596
  "step": 6500
597
  },
598
  {
599
  "epoch": 3.6475869809203143,
600
+ "eval_accuracy": 0.4111534616185319,
601
+ "eval_bleu": 0.1288262427188711,
602
+ "eval_loss": 3.269272565841675,
603
+ "eval_perplexity": 26.292206536048766,
604
+ "eval_runtime": 12.1618,
605
+ "eval_samples_per_second": 93.819,
606
+ "eval_steps_per_second": 1.48,
607
  "step": 6500
608
  },
609
  {
610
  "epoch": 3.7037037037037037,
611
+ "grad_norm": 1.9368300437927246,
612
  "learning_rate": 2.880658436213992e-05,
613
+ "loss": 3.2084,
614
  "step": 6600
615
  },
616
  {
617
  "epoch": 3.759820426487093,
618
+ "grad_norm": 1.1655679941177368,
619
  "learning_rate": 2.7559546078064598e-05,
620
+ "loss": 3.1997,
621
  "step": 6700
622
  },
623
  {
624
  "epoch": 3.8159371492704826,
625
+ "grad_norm": 1.2166575193405151,
626
  "learning_rate": 2.6312507793989276e-05,
627
+ "loss": 3.1889,
628
  "step": 6800
629
  },
630
  {
631
  "epoch": 3.872053872053872,
632
+ "grad_norm": 1.2495133876800537,
633
  "learning_rate": 2.5065469509913957e-05,
634
+ "loss": 3.1814,
635
  "step": 6900
636
  },
637
  {
638
  "epoch": 3.9281705948372614,
639
+ "grad_norm": 1.4042552709579468,
640
  "learning_rate": 2.3818431225838632e-05,
641
+ "loss": 3.1903,
642
  "step": 7000
643
  },
644
  {
645
  "epoch": 3.9281705948372614,
646
+ "eval_accuracy": 0.41343918961176035,
647
+ "eval_bleu": 0.13053470151276936,
648
+ "eval_loss": 3.2455883026123047,
649
+ "eval_perplexity": 25.67681135292233,
650
+ "eval_runtime": 12.1742,
651
+ "eval_samples_per_second": 93.723,
652
  "eval_steps_per_second": 1.479,
653
  "step": 7000
654
  },
655
  {
656
  "epoch": 3.984287317620651,
657
+ "grad_norm": 1.472741723060608,
658
  "learning_rate": 2.2571392941763313e-05,
659
+ "loss": 3.1863,
660
  "step": 7100
661
  },
662
  {
663
  "epoch": 4.040404040404041,
664
+ "grad_norm": 1.0730637311935425,
665
  "learning_rate": 2.132435465768799e-05,
666
+ "loss": 3.1197,
667
  "step": 7200
668
  },
669
  {
670
  "epoch": 4.09652076318743,
671
+ "grad_norm": 1.275009036064148,
672
  "learning_rate": 2.007731637361267e-05,
673
+ "loss": 3.1142,
674
  "step": 7300
675
  },
676
  {
677
  "epoch": 4.1526374859708195,
678
+ "grad_norm": 1.156083583831787,
679
  "learning_rate": 1.883027808953735e-05,
680
+ "loss": 3.1174,
681
  "step": 7400
682
  },
683
  {
684
  "epoch": 4.2087542087542085,
685
+ "grad_norm": 1.0556427240371704,
686
  "learning_rate": 1.758323980546203e-05,
687
+ "loss": 3.1212,
688
  "step": 7500
689
  },
690
  {
691
  "epoch": 4.2087542087542085,
692
+ "eval_accuracy": 0.4161310027132311,
693
+ "eval_bleu": 0.1325477390562919,
694
+ "eval_loss": 3.2261738777160645,
695
+ "eval_perplexity": 25.1831187134555,
696
+ "eval_runtime": 12.1902,
697
+ "eval_samples_per_second": 93.6,
698
+ "eval_steps_per_second": 1.477,
699
  "step": 7500
700
  },
701
  {
702
  "epoch": 4.264870931537598,
703
+ "grad_norm": 2.7228903770446777,
704
  "learning_rate": 1.6336201521386706e-05,
705
+ "loss": 3.1073,
706
  "step": 7600
707
  },
708
  {
709
  "epoch": 4.320987654320987,
710
+ "grad_norm": 1.526665210723877,
711
  "learning_rate": 1.5089163237311384e-05,
712
+ "loss": 3.0936,
713
  "step": 7700
714
  },
715
  {
716
  "epoch": 4.377104377104377,
717
+ "grad_norm": 1.0413861274719238,
718
  "learning_rate": 1.3842124953236066e-05,
719
+ "loss": 3.104,
720
  "step": 7800
721
  },
722
  {
723
  "epoch": 4.433221099887767,
724
+ "grad_norm": 1.394884467124939,
725
  "learning_rate": 1.2595086669160744e-05,
726
+ "loss": 3.1018,
727
  "step": 7900
728
  },
729
  {
730
  "epoch": 4.489337822671156,
731
+ "grad_norm": 1.2792820930480957,
732
  "learning_rate": 1.1348048385085423e-05,
733
+ "loss": 3.0816,
734
  "step": 8000
735
  },
736
  {
737
  "epoch": 4.489337822671156,
738
+ "eval_accuracy": 0.4176114142470762,
739
+ "eval_bleu": 0.13073000122523024,
740
+ "eval_loss": 3.2127764225006104,
741
+ "eval_perplexity": 24.847979030639497,
742
+ "eval_runtime": 12.2049,
743
+ "eval_samples_per_second": 93.487,
744
+ "eval_steps_per_second": 1.475,
745
  "step": 8000
746
  },
747
  {
748
  "epoch": 4.545454545454545,
749
+ "grad_norm": 1.173828363418579,
750
  "learning_rate": 1.0101010101010101e-05,
751
+ "loss": 3.0993,
752
  "step": 8100
753
  },
754
  {
755
  "epoch": 4.601571268237935,
756
+ "grad_norm": 1.09369957447052,
757
  "learning_rate": 8.853971816934781e-06,
758
+ "loss": 3.0905,
759
  "step": 8200
760
  },
761
  {
762
  "epoch": 4.657687991021325,
763
+ "grad_norm": 1.0657522678375244,
764
  "learning_rate": 7.606933532859459e-06,
765
+ "loss": 3.097,
766
  "step": 8300
767
  },
768
  {
769
  "epoch": 4.713804713804714,
770
+ "grad_norm": 1.1146267652511597,
771
  "learning_rate": 6.359895248784138e-06,
772
+ "loss": 3.0896,
773
  "step": 8400
774
  },
775
  {
776
  "epoch": 4.7699214365881035,
777
+ "grad_norm": 2.1886322498321533,
778
  "learning_rate": 5.112856964708817e-06,
779
+ "loss": 3.0917,
780
  "step": 8500
781
  },
782
  {
783
  "epoch": 4.7699214365881035,
784
+ "eval_accuracy": 0.4195613081423491,
785
+ "eval_bleu": 0.13385511134755296,
786
+ "eval_loss": 3.1984846591949463,
787
+ "eval_perplexity": 24.49538320533492,
788
+ "eval_runtime": 12.1678,
789
+ "eval_samples_per_second": 93.772,
790
+ "eval_steps_per_second": 1.479,
791
  "step": 8500
792
  },
793
  {
794
  "epoch": 4.8260381593714925,
795
+ "grad_norm": 1.1058974266052246,
796
  "learning_rate": 3.865818680633495e-06,
797
+ "loss": 3.0822,
798
  "step": 8600
799
  },
800
  {
801
  "epoch": 4.882154882154882,
802
+ "grad_norm": 1.3942530155181885,
803
  "learning_rate": 2.6187803965581742e-06,
804
+ "loss": 3.0923,
805
  "step": 8700
806
  },
807
  {
808
  "epoch": 4.938271604938271,
809
+ "grad_norm": 1.3995310068130493,
810
  "learning_rate": 1.3717421124828533e-06,
811
+ "loss": 3.088,
812
  "step": 8800
813
  },
814
  {
815
  "epoch": 4.994388327721661,
816
+ "grad_norm": 1.1968387365341187,
817
  "learning_rate": 1.2470382840753213e-07,
818
+ "loss": 3.0789,
819
  "step": 8900
820
  },
821
  {
822
  "epoch": 5.0,
823
  "step": 8910,
824
  "total_flos": 1.0587061010143642e+18,
825
+ "train_loss": 3.767289323945907,
826
+ "train_runtime": 15738.6672,
827
+ "train_samples_per_second": 36.215,
828
+ "train_steps_per_second": 0.566
829
  }
830
  ],
831
  "logging_steps": 100,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9698132b7711dafc8a37071c574a7e35f5549f06a7d2a7c7a9c45ef5912876ca
3
  size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1531590f5a306e0256e546c80e2541d64ed78f3ae0db27f5072411a0a6af9e8
3
  size 5560