hllj committed on
Commit 89d931f · 1 Parent(s): 96f27a5

Model save

README.md ADDED
@@ -0,0 +1,79 @@
+ ---
+ license: mit
+ base_model: HuggingFaceH4/zephyr-7b-beta
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: non-qa-sft-zephyr-7b-beta-v1
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # non-qa-sft-zephyr-7b-beta-v1
+
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on an unspecified dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.5768
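For quick reference, a minimal sketch of loading this checkpoint for generation with the `transformers` library. The repository id `hllj/non-qa-sft-zephyr-7b-beta-v1` is inferred from this commit, and the prompt and generation settings are placeholders rather than values taken from the card:

```python
# Hedged loading/generation sketch; repo id inferred from this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "hllj/non-qa-sft-zephyr-7b-beta-v1"  # assumed repository path

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # dtype choice is an assumption
    device_map="auto",
)

inputs = tokenizer("Hello, ", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```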
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 3e-05
+ - train_batch_size: 4
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.05
+ - training_steps: 1000
+ - mixed_precision_training: Native AMP
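The hyperparameters above map roughly onto the `transformers` `TrainingArguments` API as sketched below. This is a hedged reconstruction: `eval_steps` and `logging_steps` come from the accompanying trainer_state.json, while the output directory and the fp16/bf16 choice are assumptions.

```python
# Approximate TrainingArguments matching the hyperparameters listed above.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="non-qa-sft-zephyr-7b-beta-v1",  # placeholder
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    seed=42,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_steps=1000,
    fp16=True,                    # "Native AMP"; could equally be bf16=True
    evaluation_strategy="steps",
    eval_steps=50,                # from trainer_state.json
    logging_steps=10,             # from trainer_state.json
)
```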
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 1.3178 | 0.03 | 50 | 1.0767 |
+ | 0.7765 | 0.07 | 100 | 0.7130 |
+ | 0.6491 | 0.1 | 150 | 0.6840 |
+ | 0.6441 | 0.14 | 200 | 0.6829 |
+ | 0.701 | 0.17 | 250 | 0.6642 |
+ | 0.6936 | 0.21 | 300 | 0.6427 |
+ | 0.6538 | 0.24 | 350 | 0.6175 |
+ | 0.5927 | 0.27 | 400 | 0.6139 |
+ | 0.6709 | 0.31 | 450 | 0.6129 |
+ | 0.5961 | 0.34 | 500 | 0.6078 |
+ | 0.6161 | 0.38 | 550 | 0.5956 |
+ | 0.5999 | 0.41 | 600 | 0.5938 |
+ | 0.6248 | 0.44 | 650 | 0.5824 |
+ | 0.6494 | 0.48 | 700 | 0.5806 |
+ | 0.6259 | 0.51 | 750 | 0.5767 |
+ | 0.557 | 0.55 | 800 | 0.5762 |
+ | 0.6215 | 0.58 | 850 | 0.5777 |
+ | 0.5986 | 0.62 | 900 | 0.5770 |
+ | 0.6224 | 0.65 | 950 | 0.5767 |
+ | 0.6058 | 0.68 | 1000 | 0.5768 |
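Since the table reports cross-entropy losses, the evaluation perplexity implied by the final step follows directly, assuming the reported loss is the mean per-token cross-entropy:

```python
# Perplexity implied by the final validation loss in the table above.
import math
print(math.exp(0.5768))  # ~1.78
```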
+
+ ### Framework versions
+
+ - Transformers 4.35.2
+ - Pytorch 2.1.0
+ - Datasets 2.15.0
+ - Tokenizers 0.15.0
all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "epoch": 0.68,
+     "eval_loss": 0.5767720937728882,
+     "eval_runtime": 137.0943,
+     "eval_samples": 650,
+     "eval_samples_per_second": 4.741,
+     "eval_steps_per_second": 1.189,
+     "train_loss": 0.6960326323509216,
+     "train_runtime": 5130.9272,
+     "train_samples": 5845,
+     "train_samples_per_second": 0.78,
+     "train_steps_per_second": 0.195
+ }
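The throughput figures above are internally consistent with the reported runtimes; a quick sanity check, assuming the eval batch size of 4 listed in the model card:

```python
# Sanity-check the reported throughput values in all_results.json.
import math

eval_runtime = 137.0943   # seconds
eval_samples = 650
eval_batch_size = 4       # assumption, taken from the model card hyperparameters

print(eval_samples / eval_runtime)                                # ~4.741 samples/s
print(math.ceil(eval_samples / eval_batch_size) / eval_runtime)   # ~1.189 steps/s

train_runtime = 5130.9272  # seconds
train_steps = 1000
print(train_steps / train_runtime)                                # ~0.195 steps/s
```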
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 0.68,
+     "eval_loss": 0.5767720937728882,
+     "eval_runtime": 137.0943,
+     "eval_samples": 650,
+     "eval_samples_per_second": 4.741,
+     "eval_steps_per_second": 1.189
+ }
runs/Nov20_16-52-06_fd13c61a4556/events.out.tfevents.1700504426.fd13c61a4556.3859.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09ced0301188c2d96f525c113865307bc35c927aa520314b1d1aa5f549a7457f
+ size 359
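This entry is a Git LFS pointer to a TensorBoard event file. Once the actual file has been fetched (for example with `git lfs pull`), it can be inspected with TensorBoard's event-processing API; the local path and the scalar tag below are assumptions, not values read from this repository:

```python
# Read scalars from the pulled TensorBoard event file (sketch).
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

run_dir = "runs/Nov20_16-52-06_fd13c61a4556"  # hypothetical local checkout path

acc = EventAccumulator(run_dir)
acc.Reload()                       # parse the event file(s) in the directory
print(acc.Tags()["scalars"])       # list available scalar tags

# Tag name is an assumption; the Trainer typically logs tags such as "train/loss".
for event in acc.Scalars("train/loss"):
    print(event.step, event.value)
```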
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 0.68,
+     "train_loss": 0.6960326323509216,
+     "train_runtime": 5130.9272,
+     "train_samples": 5845,
+     "train_samples_per_second": 0.78,
+     "train_steps_per_second": 0.195
+ }
trainer_state.json ADDED
@@ -0,0 +1,794 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.6839945280437757,
5
+ "eval_steps": 50,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 0.0,
14
+ "loss": 1.7769,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.01,
19
+ "learning_rate": 4.800000000000001e-06,
20
+ "loss": 1.4834,
21
+ "step": 10
22
+ },
23
+ {
24
+ "epoch": 0.01,
25
+ "learning_rate": 1.02e-05,
26
+ "loss": 1.6171,
27
+ "step": 20
28
+ },
29
+ {
30
+ "epoch": 0.02,
31
+ "learning_rate": 1.62e-05,
32
+ "loss": 1.5372,
33
+ "step": 30
34
+ },
35
+ {
36
+ "epoch": 0.03,
37
+ "learning_rate": 2.22e-05,
38
+ "loss": 1.618,
39
+ "step": 40
40
+ },
41
+ {
42
+ "epoch": 0.03,
43
+ "learning_rate": 2.8199999999999998e-05,
44
+ "loss": 1.3178,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.03,
49
+ "eval_loss": 1.0766865015029907,
50
+ "eval_runtime": 137.8647,
51
+ "eval_samples_per_second": 4.715,
52
+ "eval_steps_per_second": 1.182,
53
+ "step": 50
54
+ },
55
+ {
56
+ "epoch": 0.04,
57
+ "learning_rate": 2.999704741743589e-05,
58
+ "loss": 1.1333,
59
+ "step": 60
60
+ },
61
+ {
62
+ "epoch": 0.05,
63
+ "learning_rate": 2.9979008066540737e-05,
64
+ "loss": 0.9023,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.05,
69
+ "learning_rate": 2.9944589390244404e-05,
70
+ "loss": 0.9608,
71
+ "step": 80
72
+ },
73
+ {
74
+ "epoch": 0.06,
75
+ "learning_rate": 2.9893829024864087e-05,
76
+ "loss": 0.8594,
77
+ "step": 90
78
+ },
79
+ {
80
+ "epoch": 0.07,
81
+ "learning_rate": 2.9826782476114073e-05,
82
+ "loss": 0.7765,
83
+ "step": 100
84
+ },
85
+ {
86
+ "epoch": 0.07,
87
+ "eval_loss": 0.7130317091941833,
88
+ "eval_runtime": 137.4102,
89
+ "eval_samples_per_second": 4.73,
90
+ "eval_steps_per_second": 1.186,
91
+ "step": 100
92
+ },
93
+ {
94
+ "epoch": 0.08,
95
+ "learning_rate": 2.9743523058411057e-05,
96
+ "loss": 0.8438,
97
+ "step": 110
98
+ },
99
+ {
100
+ "epoch": 0.08,
101
+ "learning_rate": 2.9644141814705893e-05,
102
+ "loss": 0.7234,
103
+ "step": 120
104
+ },
105
+ {
106
+ "epoch": 0.09,
107
+ "learning_rate": 2.9528747416929467e-05,
108
+ "loss": 0.6949,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 0.1,
113
+ "learning_rate": 2.939746604716155e-05,
114
+ "loss": 0.7523,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 0.1,
119
+ "learning_rate": 2.925044125965253e-05,
120
+ "loss": 0.6491,
121
+ "step": 150
122
+ },
123
+ {
124
+ "epoch": 0.1,
125
+ "eval_loss": 0.6840489506721497,
126
+ "eval_runtime": 137.3722,
127
+ "eval_samples_per_second": 4.732,
128
+ "eval_steps_per_second": 1.187,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.11,
133
+ "learning_rate": 2.9087833823848947e-05,
134
+ "loss": 0.677,
135
+ "step": 160
136
+ },
137
+ {
138
+ "epoch": 0.12,
139
+ "learning_rate": 2.890982154859448e-05,
140
+ "loss": 0.7101,
141
+ "step": 170
142
+ },
143
+ {
144
+ "epoch": 0.12,
145
+ "learning_rate": 2.8716599087698565e-05,
146
+ "loss": 0.7427,
147
+ "step": 180
148
+ },
149
+ {
150
+ "epoch": 0.13,
151
+ "learning_rate": 2.8508377727085337e-05,
152
+ "loss": 0.7108,
153
+ "step": 190
154
+ },
155
+ {
156
+ "epoch": 0.14,
157
+ "learning_rate": 2.8285385153755532e-05,
158
+ "loss": 0.6441,
159
+ "step": 200
160
+ },
161
+ {
162
+ "epoch": 0.14,
163
+ "eval_loss": 0.6828967928886414,
164
+ "eval_runtime": 137.6745,
165
+ "eval_samples_per_second": 4.721,
166
+ "eval_steps_per_second": 1.184,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 0.14,
171
+ "learning_rate": 2.8047865206814164e-05,
172
+ "loss": 0.6786,
173
+ "step": 210
174
+ },
175
+ {
176
+ "epoch": 0.15,
177
+ "learning_rate": 2.779607761083596e-05,
178
+ "loss": 0.6927,
179
+ "step": 220
180
+ },
181
+ {
182
+ "epoch": 0.16,
183
+ "learning_rate": 2.7530297691860436e-05,
184
+ "loss": 0.688,
185
+ "step": 230
186
+ },
187
+ {
188
+ "epoch": 0.16,
189
+ "learning_rate": 2.7250816076326834e-05,
190
+ "loss": 0.6965,
191
+ "step": 240
192
+ },
193
+ {
194
+ "epoch": 0.17,
195
+ "learning_rate": 2.695793837327844e-05,
196
+ "loss": 0.701,
197
+ "step": 250
198
+ },
199
+ {
200
+ "epoch": 0.17,
201
+ "eval_loss": 0.6642373204231262,
202
+ "eval_runtime": 137.4893,
203
+ "eval_samples_per_second": 4.728,
204
+ "eval_steps_per_second": 1.186,
205
+ "step": 250
206
+ },
207
+ {
208
+ "epoch": 0.18,
209
+ "learning_rate": 2.6651984840183545e-05,
210
+ "loss": 0.7585,
211
+ "step": 260
212
+ },
213
+ {
214
+ "epoch": 0.18,
215
+ "learning_rate": 2.6333290032738626e-05,
216
+ "loss": 0.6551,
217
+ "step": 270
218
+ },
219
+ {
220
+ "epoch": 0.19,
221
+ "learning_rate": 2.60022024390366e-05,
222
+ "loss": 0.656,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 0.2,
227
+ "learning_rate": 2.565908409850019e-05,
228
+ "loss": 0.6265,
229
+ "step": 290
230
+ },
231
+ {
232
+ "epoch": 0.21,
233
+ "learning_rate": 2.5304310205997168e-05,
234
+ "loss": 0.6936,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 0.21,
239
+ "eval_loss": 0.6427347660064697,
240
+ "eval_runtime": 137.4455,
241
+ "eval_samples_per_second": 4.729,
242
+ "eval_steps_per_second": 1.186,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 0.21,
247
+ "learning_rate": 2.4938268701570245e-05,
248
+ "loss": 0.6359,
249
+ "step": 310
250
+ },
251
+ {
252
+ "epoch": 0.22,
253
+ "learning_rate": 2.4561359846230346e-05,
254
+ "loss": 0.654,
255
+ "step": 320
256
+ },
257
+ {
258
+ "epoch": 0.23,
259
+ "learning_rate": 2.4173995784277065e-05,
260
+ "loss": 0.6803,
261
+ "step": 330
262
+ },
263
+ {
264
+ "epoch": 0.23,
265
+ "learning_rate": 2.3776600092624925e-05,
266
+ "loss": 0.6788,
267
+ "step": 340
268
+ },
269
+ {
270
+ "epoch": 0.24,
271
+ "learning_rate": 2.3369607317628244e-05,
272
+ "loss": 0.6538,
273
+ "step": 350
274
+ },
275
+ {
276
+ "epoch": 0.24,
277
+ "eval_loss": 0.6175208687782288,
278
+ "eval_runtime": 137.5627,
279
+ "eval_samples_per_second": 4.725,
280
+ "eval_steps_per_second": 1.185,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 0.25,
285
+ "learning_rate": 2.2953462499911072e-05,
286
+ "loss": 0.5896,
287
+ "step": 360
288
+ },
289
+ {
290
+ "epoch": 0.25,
291
+ "learning_rate": 2.2528620687721802e-05,
292
+ "loss": 0.6616,
293
+ "step": 370
294
+ },
295
+ {
296
+ "epoch": 0.26,
297
+ "learning_rate": 2.2095546439344614e-05,
298
+ "loss": 0.6206,
299
+ "step": 380
300
+ },
301
+ {
302
+ "epoch": 0.27,
303
+ "learning_rate": 2.165471331511176e-05,
304
+ "loss": 0.6473,
305
+ "step": 390
306
+ },
307
+ {
308
+ "epoch": 0.27,
309
+ "learning_rate": 2.1206603359572346e-05,
310
+ "loss": 0.5927,
311
+ "step": 400
312
+ },
313
+ {
314
+ "epoch": 0.27,
315
+ "eval_loss": 0.6138765215873718,
316
+ "eval_runtime": 137.4362,
317
+ "eval_samples_per_second": 4.729,
318
+ "eval_steps_per_second": 1.186,
319
+ "step": 400
320
+ },
321
+ {
322
+ "epoch": 0.28,
323
+ "learning_rate": 2.0751706574383676e-05,
324
+ "loss": 0.658,
325
+ "step": 410
326
+ },
327
+ {
328
+ "epoch": 0.29,
329
+ "learning_rate": 2.029052038250162e-05,
330
+ "loss": 0.6825,
331
+ "step": 420
332
+ },
333
+ {
334
+ "epoch": 0.29,
335
+ "learning_rate": 1.982354908425593e-05,
336
+ "loss": 0.6424,
337
+ "step": 430
338
+ },
339
+ {
340
+ "epoch": 0.3,
341
+ "learning_rate": 1.935130330590525e-05,
342
+ "loss": 0.6188,
343
+ "step": 440
344
+ },
345
+ {
346
+ "epoch": 0.31,
347
+ "learning_rate": 1.887429944127475e-05,
348
+ "loss": 0.6709,
349
+ "step": 450
350
+ },
351
+ {
352
+ "epoch": 0.31,
353
+ "eval_loss": 0.6129215955734253,
354
+ "eval_runtime": 137.6096,
355
+ "eval_samples_per_second": 4.724,
356
+ "eval_steps_per_second": 1.185,
357
+ "step": 450
358
+ },
359
+ {
360
+ "epoch": 0.31,
361
+ "learning_rate": 1.8393059087087106e-05,
362
+ "loss": 0.649,
363
+ "step": 460
364
+ },
365
+ {
366
+ "epoch": 0.32,
367
+ "learning_rate": 1.7908108472604124e-05,
368
+ "loss": 0.5975,
369
+ "step": 470
370
+ },
371
+ {
372
+ "epoch": 0.33,
373
+ "learning_rate": 1.7419977884202765e-05,
374
+ "loss": 0.6119,
375
+ "step": 480
376
+ },
377
+ {
378
+ "epoch": 0.34,
379
+ "learning_rate": 1.6929201085514793e-05,
380
+ "loss": 0.5895,
381
+ "step": 490
382
+ },
383
+ {
384
+ "epoch": 0.34,
385
+ "learning_rate": 1.643631473376405e-05,
386
+ "loss": 0.5961,
387
+ "step": 500
388
+ },
389
+ {
390
+ "epoch": 0.34,
391
+ "eval_loss": 0.6077620983123779,
392
+ "eval_runtime": 137.6715,
393
+ "eval_samples_per_second": 4.721,
394
+ "eval_steps_per_second": 1.184,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 0.35,
399
+ "learning_rate": 1.5941857792939702e-05,
400
+ "loss": 0.6895,
401
+ "step": 510
402
+ },
403
+ {
404
+ "epoch": 0.36,
405
+ "learning_rate": 1.5446370944446987e-05,
406
+ "loss": 0.6227,
407
+ "step": 520
408
+ },
409
+ {
410
+ "epoch": 0.36,
411
+ "learning_rate": 1.4950395995880073e-05,
412
+ "loss": 0.6821,
413
+ "step": 530
414
+ },
415
+ {
416
+ "epoch": 0.37,
417
+ "learning_rate": 1.4454475288563387e-05,
418
+ "loss": 0.6606,
419
+ "step": 540
420
+ },
421
+ {
422
+ "epoch": 0.38,
423
+ "learning_rate": 1.395915110450934e-05,
424
+ "loss": 0.6161,
425
+ "step": 550
426
+ },
427
+ {
428
+ "epoch": 0.38,
429
+ "eval_loss": 0.5955923199653625,
430
+ "eval_runtime": 137.6678,
431
+ "eval_samples_per_second": 4.722,
432
+ "eval_steps_per_second": 1.184,
433
+ "step": 550
434
+ },
435
+ {
436
+ "epoch": 0.38,
437
+ "learning_rate": 1.3464965073440924e-05,
438
+ "loss": 0.6497,
439
+ "step": 560
440
+ },
441
+ {
442
+ "epoch": 0.39,
443
+ "learning_rate": 1.2972457580527551e-05,
444
+ "loss": 0.6619,
445
+ "step": 570
446
+ },
447
+ {
448
+ "epoch": 0.4,
449
+ "learning_rate": 1.2482167175481786e-05,
450
+ "loss": 0.6551,
451
+ "step": 580
452
+ },
453
+ {
454
+ "epoch": 0.4,
455
+ "learning_rate": 1.1994629983663183e-05,
456
+ "loss": 0.6313,
457
+ "step": 590
458
+ },
459
+ {
460
+ "epoch": 0.41,
461
+ "learning_rate": 1.1510379119833048e-05,
462
+ "loss": 0.5999,
463
+ "step": 600
464
+ },
465
+ {
466
+ "epoch": 0.41,
467
+ "eval_loss": 0.5938005447387695,
468
+ "eval_runtime": 137.3859,
469
+ "eval_samples_per_second": 4.731,
470
+ "eval_steps_per_second": 1.186,
471
+ "step": 600
472
+ },
473
+ {
474
+ "epoch": 0.42,
475
+ "learning_rate": 1.1029944105201278e-05,
476
+ "loss": 0.5862,
477
+ "step": 610
478
+ },
479
+ {
480
+ "epoch": 0.42,
481
+ "learning_rate": 1.0553850288402696e-05,
482
+ "loss": 0.572,
483
+ "step": 620
484
+ },
485
+ {
486
+ "epoch": 0.43,
487
+ "learning_rate": 1.0082618271036033e-05,
488
+ "loss": 0.6192,
489
+ "step": 630
490
+ },
491
+ {
492
+ "epoch": 0.44,
493
+ "learning_rate": 9.616763338393728e-06,
494
+ "loss": 0.6318,
495
+ "step": 640
496
+ },
497
+ {
498
+ "epoch": 0.44,
499
+ "learning_rate": 9.156794896005e-06,
500
+ "loss": 0.6248,
501
+ "step": 650
502
+ },
503
+ {
504
+ "epoch": 0.44,
505
+ "eval_loss": 0.5824475884437561,
506
+ "eval_runtime": 137.4611,
507
+ "eval_samples_per_second": 4.729,
508
+ "eval_steps_per_second": 1.186,
509
+ "step": 650
510
+ },
511
+ {
512
+ "epoch": 0.45,
513
+ "learning_rate": 8.703215912608416e-06,
514
+ "loss": 0.6222,
515
+ "step": 660
516
+ },
517
+ {
518
+ "epoch": 0.46,
519
+ "learning_rate": 8.256522370162949e-06,
520
+ "loss": 0.5987,
521
+ "step": 670
522
+ },
523
+ {
524
+ "epoch": 0.47,
525
+ "learning_rate": 7.817202721498955e-06,
526
+ "loss": 0.6221,
527
+ "step": 680
528
+ },
529
+ {
530
+ "epoch": 0.47,
531
+ "learning_rate": 7.385737356202244e-06,
532
+ "loss": 0.5991,
533
+ "step": 690
534
+ },
535
+ {
536
+ "epoch": 0.48,
537
+ "learning_rate": 6.962598075315047e-06,
538
+ "loss": 0.6494,
539
+ "step": 700
540
+ },
541
+ {
542
+ "epoch": 0.48,
543
+ "eval_loss": 0.5806382894515991,
544
+ "eval_runtime": 137.5354,
545
+ "eval_samples_per_second": 4.726,
546
+ "eval_steps_per_second": 1.185,
547
+ "step": 700
548
+ },
549
+ {
550
+ "epoch": 0.49,
551
+ "learning_rate": 6.5482475754285535e-06,
552
+ "loss": 0.5733,
553
+ "step": 710
554
+ },
555
+ {
556
+ "epoch": 0.49,
557
+ "learning_rate": 6.143138942730943e-06,
558
+ "loss": 0.6724,
559
+ "step": 720
560
+ },
561
+ {
562
+ "epoch": 0.5,
563
+ "learning_rate": 5.747715157564335e-06,
564
+ "loss": 0.5736,
565
+ "step": 730
566
+ },
567
+ {
568
+ "epoch": 0.51,
569
+ "learning_rate": 5.362408610032257e-06,
570
+ "loss": 0.6059,
571
+ "step": 740
572
+ },
573
+ {
574
+ "epoch": 0.51,
575
+ "learning_rate": 4.987640627187413e-06,
576
+ "loss": 0.6259,
577
+ "step": 750
578
+ },
579
+ {
580
+ "epoch": 0.51,
581
+ "eval_loss": 0.5767314434051514,
582
+ "eval_runtime": 137.4798,
583
+ "eval_samples_per_second": 4.728,
584
+ "eval_steps_per_second": 1.186,
585
+ "step": 750
586
+ },
587
+ {
588
+ "epoch": 0.52,
589
+ "learning_rate": 4.623821012316761e-06,
590
+ "loss": 0.5797,
591
+ "step": 760
592
+ },
593
+ {
594
+ "epoch": 0.53,
595
+ "learning_rate": 4.27134759682762e-06,
596
+ "loss": 0.6227,
597
+ "step": 770
598
+ },
599
+ {
600
+ "epoch": 0.53,
601
+ "learning_rate": 3.930605805224858e-06,
602
+ "loss": 0.6119,
603
+ "step": 780
604
+ },
605
+ {
606
+ "epoch": 0.54,
607
+ "learning_rate": 3.6019682336548736e-06,
608
+ "loss": 0.6289,
609
+ "step": 790
610
+ },
611
+ {
612
+ "epoch": 0.55,
613
+ "learning_rate": 3.285794242477173e-06,
614
+ "loss": 0.557,
615
+ "step": 800
616
+ },
617
+ {
618
+ "epoch": 0.55,
619
+ "eval_loss": 0.5762439966201782,
620
+ "eval_runtime": 137.5294,
621
+ "eval_samples_per_second": 4.726,
622
+ "eval_steps_per_second": 1.185,
623
+ "step": 800
624
+ },
625
+ {
626
+ "epoch": 0.55,
627
+ "learning_rate": 2.9824295633090864e-06,
628
+ "loss": 0.536,
629
+ "step": 810
630
+ },
631
+ {
632
+ "epoch": 0.56,
633
+ "learning_rate": 2.692205920973333e-06,
634
+ "loss": 0.662,
635
+ "step": 820
636
+ },
637
+ {
638
+ "epoch": 0.57,
639
+ "learning_rate": 2.4154406707617813e-06,
640
+ "loss": 0.5556,
641
+ "step": 830
642
+ },
643
+ {
644
+ "epoch": 0.57,
645
+ "learning_rate": 2.1524364514121193e-06,
646
+ "loss": 0.6678,
647
+ "step": 840
648
+ },
649
+ {
650
+ "epoch": 0.58,
651
+ "learning_rate": 1.903480854176805e-06,
652
+ "loss": 0.6215,
653
+ "step": 850
654
+ },
655
+ {
656
+ "epoch": 0.58,
657
+ "eval_loss": 0.5777194499969482,
658
+ "eval_runtime": 137.4572,
659
+ "eval_samples_per_second": 4.729,
660
+ "eval_steps_per_second": 1.186,
661
+ "step": 850
662
+ },
663
+ {
664
+ "epoch": 0.59,
665
+ "learning_rate": 1.6688461083462942e-06,
666
+ "loss": 0.5598,
667
+ "step": 860
668
+ },
669
+ {
670
+ "epoch": 0.6,
671
+ "learning_rate": 1.4487887835702773e-06,
672
+ "loss": 0.6472,
673
+ "step": 870
674
+ },
675
+ {
676
+ "epoch": 0.6,
677
+ "learning_rate": 1.2435495093025523e-06,
678
+ "loss": 0.5749,
679
+ "step": 880
680
+ },
681
+ {
682
+ "epoch": 0.61,
683
+ "learning_rate": 1.0533527116762298e-06,
684
+ "loss": 0.5959,
685
+ "step": 890
686
+ },
687
+ {
688
+ "epoch": 0.62,
689
+ "learning_rate": 8.784063680970788e-07,
690
+ "loss": 0.5986,
691
+ "step": 900
692
+ },
693
+ {
694
+ "epoch": 0.62,
695
+ "eval_loss": 0.5770220160484314,
696
+ "eval_runtime": 137.4371,
697
+ "eval_samples_per_second": 4.729,
698
+ "eval_steps_per_second": 1.186,
699
+ "step": 900
700
+ },
701
+ {
702
+ "epoch": 0.62,
703
+ "learning_rate": 7.189017798232672e-07,
704
+ "loss": 0.5965,
705
+ "step": 910
706
+ },
707
+ {
708
+ "epoch": 0.63,
709
+ "learning_rate": 5.75013362780244e-07,
710
+ "loss": 0.5667,
711
+ "step": 920
712
+ },
713
+ {
714
+ "epoch": 0.64,
715
+ "learning_rate": 4.46898456839504e-07,
716
+ "loss": 0.5984,
717
+ "step": 930
718
+ },
719
+ {
720
+ "epoch": 0.64,
721
+ "learning_rate": 3.346971537697263e-07,
722
+ "loss": 0.6219,
723
+ "step": 940
724
+ },
725
+ {
726
+ "epoch": 0.65,
727
+ "learning_rate": 2.38532144048495e-07,
728
+ "loss": 0.6224,
729
+ "step": 950
730
+ },
731
+ {
732
+ "epoch": 0.65,
733
+ "eval_loss": 0.57674241065979,
734
+ "eval_runtime": 137.5142,
735
+ "eval_samples_per_second": 4.727,
736
+ "eval_steps_per_second": 1.185,
737
+ "step": 950
738
+ },
739
+ {
740
+ "epoch": 0.66,
741
+ "learning_rate": 1.5850858270205992e-07,
742
+ "loss": 0.532,
743
+ "step": 960
744
+ },
745
+ {
746
+ "epoch": 0.66,
747
+ "learning_rate": 9.471397431985884e-08,
748
+ "loss": 0.6332,
749
+ "step": 970
750
+ },
751
+ {
752
+ "epoch": 0.67,
753
+ "learning_rate": 4.721807736953576e-08,
754
+ "loss": 0.5957,
755
+ "step": 980
756
+ },
757
+ {
758
+ "epoch": 0.68,
759
+ "learning_rate": 1.607282791707687e-08,
760
+ "loss": 0.6052,
761
+ "step": 990
762
+ },
763
+ {
764
+ "epoch": 0.68,
765
+ "learning_rate": 1.3122828354905637e-09,
766
+ "loss": 0.6058,
767
+ "step": 1000
768
+ },
769
+ {
770
+ "epoch": 0.68,
771
+ "eval_loss": 0.5767720937728882,
772
+ "eval_runtime": 137.4993,
773
+ "eval_samples_per_second": 4.727,
774
+ "eval_steps_per_second": 1.185,
775
+ "step": 1000
776
+ },
777
+ {
778
+ "epoch": 0.68,
779
+ "step": 1000,
780
+ "total_flos": 9.350756982954394e+16,
781
+ "train_loss": 0.6960326323509216,
782
+ "train_runtime": 5130.9272,
783
+ "train_samples_per_second": 0.78,
784
+ "train_steps_per_second": 0.195
785
+ }
786
+ ],
787
+ "logging_steps": 10,
788
+ "max_steps": 1000,
789
+ "num_train_epochs": 1,
790
+ "save_steps": 500,
791
+ "total_flos": 9.350756982954394e+16,
792
+ "trial_name": null,
793
+ "trial_params": null
794
+ }