nicolasdec commited on
Commit
7d517c3
·
verified ·
1 Parent(s): f980ce5

Delete trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +0 -4006
trainer_state.json DELETED
@@ -1,4006 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 2.999207397622193,
5
- "eval_steps": 500,
6
- "global_step": 2838,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.0,
13
- "grad_norm": 54.60578280011217,
14
- "learning_rate": 3.4482758620689656e-07,
15
- "loss": 2.0852,
16
- "step": 1
17
- },
18
- {
19
- "epoch": 0.01,
20
- "grad_norm": 42.54745106429926,
21
- "learning_rate": 1.724137931034483e-06,
22
- "loss": 2.0381,
23
- "step": 5
24
- },
25
- {
26
- "epoch": 0.01,
27
- "grad_norm": 9.212853479586382,
28
- "learning_rate": 3.448275862068966e-06,
29
- "loss": 1.6632,
30
- "step": 10
31
- },
32
- {
33
- "epoch": 0.02,
34
- "grad_norm": 5.314912275996213,
35
- "learning_rate": 5.172413793103449e-06,
36
- "loss": 1.3468,
37
- "step": 15
38
- },
39
- {
40
- "epoch": 0.02,
41
- "grad_norm": 4.03433890035307,
42
- "learning_rate": 6.896551724137932e-06,
43
- "loss": 1.2538,
44
- "step": 20
45
- },
46
- {
47
- "epoch": 0.03,
48
- "grad_norm": 2.6447411660183016,
49
- "learning_rate": 8.620689655172414e-06,
50
- "loss": 1.2604,
51
- "step": 25
52
- },
53
- {
54
- "epoch": 0.03,
55
- "grad_norm": 2.1857065034640795,
56
- "learning_rate": 9.999996872939885e-06,
57
- "loss": 1.1935,
58
- "step": 30
59
- },
60
- {
61
- "epoch": 0.04,
62
- "grad_norm": 1.7526909326796276,
63
- "learning_rate": 9.999887426246524e-06,
64
- "loss": 1.1939,
65
- "step": 35
66
- },
67
- {
68
- "epoch": 0.04,
69
- "grad_norm": 1.5378704654179458,
70
- "learning_rate": 9.999621630458743e-06,
71
- "loss": 1.1626,
72
- "step": 40
73
- },
74
- {
75
- "epoch": 0.05,
76
- "grad_norm": 1.3071739468687105,
77
- "learning_rate": 9.999199493888118e-06,
78
- "loss": 1.122,
79
- "step": 45
80
- },
81
- {
82
- "epoch": 0.05,
83
- "grad_norm": 1.1982413397341172,
84
- "learning_rate": 9.998621029735082e-06,
85
- "loss": 1.1191,
86
- "step": 50
87
- },
88
- {
89
- "epoch": 0.06,
90
- "grad_norm": 1.0956550608235818,
91
- "learning_rate": 9.997886256088507e-06,
92
- "loss": 1.123,
93
- "step": 55
94
- },
95
- {
96
- "epoch": 0.06,
97
- "grad_norm": 1.0821639142212824,
98
- "learning_rate": 9.996995195925152e-06,
99
- "loss": 1.0751,
100
- "step": 60
101
- },
102
- {
103
- "epoch": 0.07,
104
- "grad_norm": 1.0865870956103822,
105
- "learning_rate": 9.995947877108933e-06,
106
- "loss": 1.114,
107
- "step": 65
108
- },
109
- {
110
- "epoch": 0.07,
111
- "grad_norm": 1.0582468505041362,
112
- "learning_rate": 9.99474433239006e-06,
113
- "loss": 1.0609,
114
- "step": 70
115
- },
116
- {
117
- "epoch": 0.08,
118
- "grad_norm": 1.0556893008132,
119
- "learning_rate": 9.993384599404001e-06,
120
- "loss": 1.0861,
121
- "step": 75
122
- },
123
- {
124
- "epoch": 0.08,
125
- "grad_norm": 1.053104954690895,
126
- "learning_rate": 9.991868720670322e-06,
127
- "loss": 1.0792,
128
- "step": 80
129
- },
130
- {
131
- "epoch": 0.09,
132
- "grad_norm": 1.069931327549148,
133
- "learning_rate": 9.990196743591341e-06,
134
- "loss": 1.0722,
135
- "step": 85
136
- },
137
- {
138
- "epoch": 0.1,
139
- "grad_norm": 1.0793525694046096,
140
- "learning_rate": 9.988368720450656e-06,
141
- "loss": 1.0561,
142
- "step": 90
143
- },
144
- {
145
- "epoch": 0.1,
146
- "grad_norm": 1.0449093375062701,
147
- "learning_rate": 9.986384708411507e-06,
148
- "loss": 1.0675,
149
- "step": 95
150
- },
151
- {
152
- "epoch": 0.11,
153
- "grad_norm": 0.9947694351867048,
154
- "learning_rate": 9.984244769514988e-06,
155
- "loss": 1.0395,
156
- "step": 100
157
- },
158
- {
159
- "epoch": 0.11,
160
- "grad_norm": 1.0281907914775885,
161
- "learning_rate": 9.981948970678107e-06,
162
- "loss": 1.0455,
163
- "step": 105
164
- },
165
- {
166
- "epoch": 0.12,
167
- "grad_norm": 1.0429607059919377,
168
- "learning_rate": 9.979497383691695e-06,
169
- "loss": 1.014,
170
- "step": 110
171
- },
172
- {
173
- "epoch": 0.12,
174
- "grad_norm": 1.1551790238118493,
175
- "learning_rate": 9.976890085218157e-06,
176
- "loss": 1.0335,
177
- "step": 115
178
- },
179
- {
180
- "epoch": 0.13,
181
- "grad_norm": 1.0958017153997939,
182
- "learning_rate": 9.974127156789082e-06,
183
- "loss": 1.0566,
184
- "step": 120
185
- },
186
- {
187
- "epoch": 0.13,
188
- "grad_norm": 1.0545164230640043,
189
- "learning_rate": 9.971208684802686e-06,
190
- "loss": 1.0234,
191
- "step": 125
192
- },
193
- {
194
- "epoch": 0.14,
195
- "grad_norm": 1.1119539198620567,
196
- "learning_rate": 9.968134760521114e-06,
197
- "loss": 0.9956,
198
- "step": 130
199
- },
200
- {
201
- "epoch": 0.14,
202
- "grad_norm": 1.045532676163788,
203
- "learning_rate": 9.964905480067585e-06,
204
- "loss": 1.0103,
205
- "step": 135
206
- },
207
- {
208
- "epoch": 0.15,
209
- "grad_norm": 1.0818099938062198,
210
- "learning_rate": 9.96152094442339e-06,
211
- "loss": 0.987,
212
- "step": 140
213
- },
214
- {
215
- "epoch": 0.15,
216
- "grad_norm": 1.06916616510137,
217
- "learning_rate": 9.957981259424724e-06,
218
- "loss": 1.0189,
219
- "step": 145
220
- },
221
- {
222
- "epoch": 0.16,
223
- "grad_norm": 1.1000812098052206,
224
- "learning_rate": 9.954286535759394e-06,
225
- "loss": 1.0025,
226
- "step": 150
227
- },
228
- {
229
- "epoch": 0.16,
230
- "grad_norm": 1.0740685860653156,
231
- "learning_rate": 9.950436888963337e-06,
232
- "loss": 1.0394,
233
- "step": 155
234
- },
235
- {
236
- "epoch": 0.17,
237
- "grad_norm": 1.0578416601226404,
238
- "learning_rate": 9.946432439417021e-06,
239
- "loss": 1.0419,
240
- "step": 160
241
- },
242
- {
243
- "epoch": 0.17,
244
- "grad_norm": 1.1378367934770748,
245
- "learning_rate": 9.942273312341679e-06,
246
- "loss": 1.04,
247
- "step": 165
248
- },
249
- {
250
- "epoch": 0.18,
251
- "grad_norm": 1.106141894903122,
252
- "learning_rate": 9.937959637795389e-06,
253
- "loss": 1.0112,
254
- "step": 170
255
- },
256
- {
257
- "epoch": 0.18,
258
- "grad_norm": 1.0459501547982482,
259
- "learning_rate": 9.93349155066901e-06,
260
- "loss": 0.9959,
261
- "step": 175
262
- },
263
- {
264
- "epoch": 0.19,
265
- "grad_norm": 1.1420602608538855,
266
- "learning_rate": 9.928869190681964e-06,
267
- "loss": 0.9952,
268
- "step": 180
269
- },
270
- {
271
- "epoch": 0.2,
272
- "grad_norm": 1.0748374838181862,
273
- "learning_rate": 9.924092702377863e-06,
274
- "loss": 1.0094,
275
- "step": 185
276
- },
277
- {
278
- "epoch": 0.2,
279
- "grad_norm": 1.0535011085546289,
280
- "learning_rate": 9.919162235119996e-06,
281
- "loss": 1.0054,
282
- "step": 190
283
- },
284
- {
285
- "epoch": 0.21,
286
- "grad_norm": 1.0310625793824704,
287
- "learning_rate": 9.91407794308665e-06,
288
- "loss": 1.0117,
289
- "step": 195
290
- },
291
- {
292
- "epoch": 0.21,
293
- "grad_norm": 1.0359842004906923,
294
- "learning_rate": 9.908839985266297e-06,
295
- "loss": 0.9982,
296
- "step": 200
297
- },
298
- {
299
- "epoch": 0.22,
300
- "grad_norm": 1.084059570369228,
301
- "learning_rate": 9.903448525452618e-06,
302
- "loss": 1.0127,
303
- "step": 205
304
- },
305
- {
306
- "epoch": 0.22,
307
- "grad_norm": 1.1227120329409497,
308
- "learning_rate": 9.89790373223938e-06,
309
- "loss": 1.048,
310
- "step": 210
311
- },
312
- {
313
- "epoch": 0.23,
314
- "grad_norm": 1.0615021518173307,
315
- "learning_rate": 9.892205779015167e-06,
316
- "loss": 1.0021,
317
- "step": 215
318
- },
319
- {
320
- "epoch": 0.23,
321
- "grad_norm": 1.070318683802529,
322
- "learning_rate": 9.886354843957953e-06,
323
- "loss": 1.0043,
324
- "step": 220
325
- },
326
- {
327
- "epoch": 0.24,
328
- "grad_norm": 1.0419755132096296,
329
- "learning_rate": 9.88035111002954e-06,
330
- "loss": 0.9743,
331
- "step": 225
332
- },
333
- {
334
- "epoch": 0.24,
335
- "grad_norm": 1.1796172322040084,
336
- "learning_rate": 9.874194764969827e-06,
337
- "loss": 0.9957,
338
- "step": 230
339
- },
340
- {
341
- "epoch": 0.25,
342
- "grad_norm": 1.0933963352790785,
343
- "learning_rate": 9.867886001290943e-06,
344
- "loss": 0.9814,
345
- "step": 235
346
- },
347
- {
348
- "epoch": 0.25,
349
- "grad_norm": 1.1066280030775704,
350
- "learning_rate": 9.861425016271227e-06,
351
- "loss": 0.9832,
352
- "step": 240
353
- },
354
- {
355
- "epoch": 0.26,
356
- "grad_norm": 1.1138948008724274,
357
- "learning_rate": 9.854812011949059e-06,
358
- "loss": 0.9871,
359
- "step": 245
360
- },
361
- {
362
- "epoch": 0.26,
363
- "grad_norm": 1.0644401239508805,
364
- "learning_rate": 9.848047195116543e-06,
365
- "loss": 0.9951,
366
- "step": 250
367
- },
368
- {
369
- "epoch": 0.27,
370
- "grad_norm": 1.1880183474724784,
371
- "learning_rate": 9.841130777313039e-06,
372
- "loss": 0.9902,
373
- "step": 255
374
- },
375
- {
376
- "epoch": 0.27,
377
- "grad_norm": 1.0747113009717828,
378
- "learning_rate": 9.834062974818547e-06,
379
- "loss": 0.9433,
380
- "step": 260
381
- },
382
- {
383
- "epoch": 0.28,
384
- "grad_norm": 1.1442114734348945,
385
- "learning_rate": 9.826844008646949e-06,
386
- "loss": 0.9703,
387
- "step": 265
388
- },
389
- {
390
- "epoch": 0.29,
391
- "grad_norm": 1.0895758630826766,
392
- "learning_rate": 9.81947410453909e-06,
393
- "loss": 1.0236,
394
- "step": 270
395
- },
396
- {
397
- "epoch": 0.29,
398
- "grad_norm": 0.996075250542336,
399
- "learning_rate": 9.811953492955728e-06,
400
- "loss": 0.9577,
401
- "step": 275
402
- },
403
- {
404
- "epoch": 0.3,
405
- "grad_norm": 1.1734623195649692,
406
- "learning_rate": 9.80428240907032e-06,
407
- "loss": 0.9752,
408
- "step": 280
409
- },
410
- {
411
- "epoch": 0.3,
412
- "grad_norm": 1.282701051609298,
413
- "learning_rate": 9.796461092761668e-06,
414
- "loss": 0.987,
415
- "step": 285
416
- },
417
- {
418
- "epoch": 0.31,
419
- "grad_norm": 1.0721992980205135,
420
- "learning_rate": 9.788489788606423e-06,
421
- "loss": 0.944,
422
- "step": 290
423
- },
424
- {
425
- "epoch": 0.31,
426
- "grad_norm": 1.105694230535082,
427
- "learning_rate": 9.780368745871438e-06,
428
- "loss": 0.9804,
429
- "step": 295
430
- },
431
- {
432
- "epoch": 0.32,
433
- "grad_norm": 1.1121587653939105,
434
- "learning_rate": 9.772098218505963e-06,
435
- "loss": 1.0099,
436
- "step": 300
437
- },
438
- {
439
- "epoch": 0.32,
440
- "grad_norm": 1.1073177873687883,
441
- "learning_rate": 9.763678465133712e-06,
442
- "loss": 0.9887,
443
- "step": 305
444
- },
445
- {
446
- "epoch": 0.33,
447
- "grad_norm": 1.1986141459298305,
448
- "learning_rate": 9.755109749044781e-06,
449
- "loss": 0.9749,
450
- "step": 310
451
- },
452
- {
453
- "epoch": 0.33,
454
- "grad_norm": 1.0864391212895972,
455
- "learning_rate": 9.7463923381874e-06,
456
- "loss": 0.9767,
457
- "step": 315
458
- },
459
- {
460
- "epoch": 0.34,
461
- "grad_norm": 1.0595953209575595,
462
- "learning_rate": 9.737526505159564e-06,
463
- "loss": 0.9297,
464
- "step": 320
465
- },
466
- {
467
- "epoch": 0.34,
468
- "grad_norm": 1.083224438455533,
469
- "learning_rate": 9.728512527200509e-06,
470
- "loss": 0.9498,
471
- "step": 325
472
- },
473
- {
474
- "epoch": 0.35,
475
- "grad_norm": 1.1306776282190978,
476
- "learning_rate": 9.719350686182041e-06,
477
- "loss": 0.982,
478
- "step": 330
479
- },
480
- {
481
- "epoch": 0.35,
482
- "grad_norm": 1.07939319367538,
483
- "learning_rate": 9.710041268599718e-06,
484
- "loss": 0.9669,
485
- "step": 335
486
- },
487
- {
488
- "epoch": 0.36,
489
- "grad_norm": 1.1100410279851476,
490
- "learning_rate": 9.700584565563897e-06,
491
- "loss": 0.956,
492
- "step": 340
493
- },
494
- {
495
- "epoch": 0.36,
496
- "grad_norm": 1.0917533373255544,
497
- "learning_rate": 9.690980872790627e-06,
498
- "loss": 0.9878,
499
- "step": 345
500
- },
501
- {
502
- "epoch": 0.37,
503
- "grad_norm": 1.1287494016251205,
504
- "learning_rate": 9.681230490592403e-06,
505
- "loss": 0.9604,
506
- "step": 350
507
- },
508
- {
509
- "epoch": 0.38,
510
- "grad_norm": 1.0366025693971206,
511
- "learning_rate": 9.671333723868773e-06,
512
- "loss": 0.9809,
513
- "step": 355
514
- },
515
- {
516
- "epoch": 0.38,
517
- "grad_norm": 1.1876939558601538,
518
- "learning_rate": 9.66129088209681e-06,
519
- "loss": 0.9324,
520
- "step": 360
521
- },
522
- {
523
- "epoch": 0.39,
524
- "grad_norm": 1.1296469706806582,
525
- "learning_rate": 9.651102279321429e-06,
526
- "loss": 0.98,
527
- "step": 365
528
- },
529
- {
530
- "epoch": 0.39,
531
- "grad_norm": 1.0920615981549329,
532
- "learning_rate": 9.640768234145563e-06,
533
- "loss": 0.9474,
534
- "step": 370
535
- },
536
- {
537
- "epoch": 0.4,
538
- "grad_norm": 1.045353192143218,
539
- "learning_rate": 9.630289069720213e-06,
540
- "loss": 0.9416,
541
- "step": 375
542
- },
543
- {
544
- "epoch": 0.4,
545
- "grad_norm": 1.0546831730532094,
546
- "learning_rate": 9.619665113734327e-06,
547
- "loss": 0.9583,
548
- "step": 380
549
- },
550
- {
551
- "epoch": 0.41,
552
- "grad_norm": 1.120397617115956,
553
- "learning_rate": 9.608896698404567e-06,
554
- "loss": 0.9739,
555
- "step": 385
556
- },
557
- {
558
- "epoch": 0.41,
559
- "grad_norm": 1.0897789727469696,
560
- "learning_rate": 9.597984160464908e-06,
561
- "loss": 0.9882,
562
- "step": 390
563
- },
564
- {
565
- "epoch": 0.42,
566
- "grad_norm": 1.0655227440534312,
567
- "learning_rate": 9.586927841156121e-06,
568
- "loss": 0.973,
569
- "step": 395
570
- },
571
- {
572
- "epoch": 0.42,
573
- "grad_norm": 1.024445190271631,
574
- "learning_rate": 9.575728086215093e-06,
575
- "loss": 0.9488,
576
- "step": 400
577
- },
578
- {
579
- "epoch": 0.43,
580
- "grad_norm": 1.0957551302719917,
581
- "learning_rate": 9.564385245864015e-06,
582
- "loss": 0.9395,
583
- "step": 405
584
- },
585
- {
586
- "epoch": 0.43,
587
- "grad_norm": 1.0348921383964815,
588
- "learning_rate": 9.552899674799438e-06,
589
- "loss": 0.9618,
590
- "step": 410
591
- },
592
- {
593
- "epoch": 0.44,
594
- "grad_norm": 1.1320917241343242,
595
- "learning_rate": 9.541271732181174e-06,
596
- "loss": 0.9737,
597
- "step": 415
598
- },
599
- {
600
- "epoch": 0.44,
601
- "grad_norm": 1.0955620287950987,
602
- "learning_rate": 9.52950178162107e-06,
603
- "loss": 0.9765,
604
- "step": 420
605
- },
606
- {
607
- "epoch": 0.45,
608
- "grad_norm": 1.0865957472837047,
609
- "learning_rate": 9.517590191171638e-06,
610
- "loss": 0.9402,
611
- "step": 425
612
- },
613
- {
614
- "epoch": 0.45,
615
- "grad_norm": 1.0608004961340336,
616
- "learning_rate": 9.505537333314534e-06,
617
- "loss": 0.938,
618
- "step": 430
619
- },
620
- {
621
- "epoch": 0.46,
622
- "grad_norm": 1.0436288259170787,
623
- "learning_rate": 9.493343584948931e-06,
624
- "loss": 0.9495,
625
- "step": 435
626
- },
627
- {
628
- "epoch": 0.46,
629
- "grad_norm": 1.0827000850655668,
630
- "learning_rate": 9.481009327379714e-06,
631
- "loss": 0.9505,
632
- "step": 440
633
- },
634
- {
635
- "epoch": 0.47,
636
- "grad_norm": 1.0958366892000795,
637
- "learning_rate": 9.46853494630557e-06,
638
- "loss": 0.9536,
639
- "step": 445
640
- },
641
- {
642
- "epoch": 0.48,
643
- "grad_norm": 1.0431220913897328,
644
- "learning_rate": 9.455920831806917e-06,
645
- "loss": 0.942,
646
- "step": 450
647
- },
648
- {
649
- "epoch": 0.48,
650
- "grad_norm": 1.1372655798293543,
651
- "learning_rate": 9.443167378333711e-06,
652
- "loss": 0.9447,
653
- "step": 455
654
- },
655
- {
656
- "epoch": 0.49,
657
- "grad_norm": 1.0890187843066097,
658
- "learning_rate": 9.43027498469311e-06,
659
- "loss": 0.9291,
660
- "step": 460
661
- },
662
- {
663
- "epoch": 0.49,
664
- "grad_norm": 1.128255566030822,
665
- "learning_rate": 9.41724405403701e-06,
666
- "loss": 0.9418,
667
- "step": 465
668
- },
669
- {
670
- "epoch": 0.5,
671
- "grad_norm": 1.0200134644324146,
672
- "learning_rate": 9.404074993849421e-06,
673
- "loss": 0.927,
674
- "step": 470
675
- },
676
- {
677
- "epoch": 0.5,
678
- "grad_norm": 1.0912622433950008,
679
- "learning_rate": 9.390768215933746e-06,
680
- "loss": 0.943,
681
- "step": 475
682
- },
683
- {
684
- "epoch": 0.51,
685
- "grad_norm": 1.1784430852167105,
686
- "learning_rate": 9.377324136399887e-06,
687
- "loss": 0.9409,
688
- "step": 480
689
- },
690
- {
691
- "epoch": 0.51,
692
- "grad_norm": 1.0732445497397998,
693
- "learning_rate": 9.36374317565124e-06,
694
- "loss": 0.9401,
695
- "step": 485
696
- },
697
- {
698
- "epoch": 0.52,
699
- "grad_norm": 1.1241973380928443,
700
- "learning_rate": 9.350025758371554e-06,
701
- "loss": 0.9188,
702
- "step": 490
703
- },
704
- {
705
- "epoch": 0.52,
706
- "grad_norm": 1.0680249447424572,
707
- "learning_rate": 9.336172313511636e-06,
708
- "loss": 0.9304,
709
- "step": 495
710
- },
711
- {
712
- "epoch": 0.53,
713
- "grad_norm": 1.0400938648362148,
714
- "learning_rate": 9.322183274275954e-06,
715
- "loss": 0.9465,
716
- "step": 500
717
- },
718
- {
719
- "epoch": 0.53,
720
- "grad_norm": 1.1484166178621282,
721
- "learning_rate": 9.308059078109078e-06,
722
- "loss": 0.9431,
723
- "step": 505
724
- },
725
- {
726
- "epoch": 0.54,
727
- "grad_norm": 1.0928763685485705,
728
- "learning_rate": 9.29380016668201e-06,
729
- "loss": 0.9368,
730
- "step": 510
731
- },
732
- {
733
- "epoch": 0.54,
734
- "grad_norm": 1.0470334802413224,
735
- "learning_rate": 9.279406985878367e-06,
736
- "loss": 0.9529,
737
- "step": 515
738
- },
739
- {
740
- "epoch": 0.55,
741
- "grad_norm": 1.055693577627048,
742
- "learning_rate": 9.264879985780436e-06,
743
- "loss": 0.9237,
744
- "step": 520
745
- },
746
- {
747
- "epoch": 0.55,
748
- "grad_norm": 1.0582407523485609,
749
- "learning_rate": 9.250219620655112e-06,
750
- "loss": 0.9455,
751
- "step": 525
752
- },
753
- {
754
- "epoch": 0.56,
755
- "grad_norm": 1.0392740863841614,
756
- "learning_rate": 9.235426348939674e-06,
757
- "loss": 0.9866,
758
- "step": 530
759
- },
760
- {
761
- "epoch": 0.57,
762
- "grad_norm": 1.087021743413759,
763
- "learning_rate": 9.220500633227467e-06,
764
- "loss": 0.9797,
765
- "step": 535
766
- },
767
- {
768
- "epoch": 0.57,
769
- "grad_norm": 1.0905659766649087,
770
- "learning_rate": 9.205442940253426e-06,
771
- "loss": 0.9231,
772
- "step": 540
773
- },
774
- {
775
- "epoch": 0.58,
776
- "grad_norm": 1.0838061353931883,
777
- "learning_rate": 9.190253740879484e-06,
778
- "loss": 0.9155,
779
- "step": 545
780
- },
781
- {
782
- "epoch": 0.58,
783
- "grad_norm": 1.1721559515157844,
784
- "learning_rate": 9.174933510079847e-06,
785
- "loss": 0.9132,
786
- "step": 550
787
- },
788
- {
789
- "epoch": 0.59,
790
- "grad_norm": 1.0711291424853389,
791
- "learning_rate": 9.159482726926147e-06,
792
- "loss": 0.9368,
793
- "step": 555
794
- },
795
- {
796
- "epoch": 0.59,
797
- "grad_norm": 1.0906836737125443,
798
- "learning_rate": 9.14390187457245e-06,
799
- "loss": 0.9652,
800
- "step": 560
801
- },
802
- {
803
- "epoch": 0.6,
804
- "grad_norm": 1.2147816750505283,
805
- "learning_rate": 9.128191440240159e-06,
806
- "loss": 0.922,
807
- "step": 565
808
- },
809
- {
810
- "epoch": 0.6,
811
- "grad_norm": 1.0745698856829782,
812
- "learning_rate": 9.11235191520277e-06,
813
- "loss": 0.9267,
814
- "step": 570
815
- },
816
- {
817
- "epoch": 0.61,
818
- "grad_norm": 1.1107563079565528,
819
- "learning_rate": 9.096383794770513e-06,
820
- "loss": 0.9403,
821
- "step": 575
822
- },
823
- {
824
- "epoch": 0.61,
825
- "grad_norm": 1.0645734678937102,
826
- "learning_rate": 9.080287578274866e-06,
827
- "loss": 0.9149,
828
- "step": 580
829
- },
830
- {
831
- "epoch": 0.62,
832
- "grad_norm": 1.1729380707889032,
833
- "learning_rate": 9.064063769052933e-06,
834
- "loss": 0.9236,
835
- "step": 585
836
- },
837
- {
838
- "epoch": 0.62,
839
- "grad_norm": 1.0634029251400858,
840
- "learning_rate": 9.047712874431716e-06,
841
- "loss": 0.9264,
842
- "step": 590
843
- },
844
- {
845
- "epoch": 0.63,
846
- "grad_norm": 1.185148731024843,
847
- "learning_rate": 9.031235405712239e-06,
848
- "loss": 0.9632,
849
- "step": 595
850
- },
851
- {
852
- "epoch": 0.63,
853
- "grad_norm": 1.1238661801404854,
854
- "learning_rate": 9.014631878153564e-06,
855
- "loss": 0.9364,
856
- "step": 600
857
- },
858
- {
859
- "epoch": 0.64,
860
- "grad_norm": 1.1101591200426506,
861
- "learning_rate": 8.997902810956682e-06,
862
- "loss": 0.9121,
863
- "step": 605
864
- },
865
- {
866
- "epoch": 0.64,
867
- "grad_norm": 1.1328306862765927,
868
- "learning_rate": 8.98104872724827e-06,
869
- "loss": 0.9637,
870
- "step": 610
871
- },
872
- {
873
- "epoch": 0.65,
874
- "grad_norm": 1.1182389860600772,
875
- "learning_rate": 8.964070154064343e-06,
876
- "loss": 0.9431,
877
- "step": 615
878
- },
879
- {
880
- "epoch": 0.66,
881
- "grad_norm": 1.2315329373588069,
882
- "learning_rate": 8.94696762233376e-06,
883
- "loss": 0.9261,
884
- "step": 620
885
- },
886
- {
887
- "epoch": 0.66,
888
- "grad_norm": 1.0785263989248792,
889
- "learning_rate": 8.92974166686163e-06,
890
- "loss": 0.9218,
891
- "step": 625
892
- },
893
- {
894
- "epoch": 0.67,
895
- "grad_norm": 1.0293877329539916,
896
- "learning_rate": 8.912392826312595e-06,
897
- "loss": 0.9516,
898
- "step": 630
899
- },
900
- {
901
- "epoch": 0.67,
902
- "grad_norm": 1.0797961930582287,
903
- "learning_rate": 8.894921643193966e-06,
904
- "loss": 0.94,
905
- "step": 635
906
- },
907
- {
908
- "epoch": 0.68,
909
- "grad_norm": 1.0052477432214972,
910
- "learning_rate": 8.877328663838776e-06,
911
- "loss": 0.9207,
912
- "step": 640
913
- },
914
- {
915
- "epoch": 0.68,
916
- "grad_norm": 1.0126272743426095,
917
- "learning_rate": 8.85961443838869e-06,
918
- "loss": 0.9292,
919
- "step": 645
920
- },
921
- {
922
- "epoch": 0.69,
923
- "grad_norm": 1.0166858946265631,
924
- "learning_rate": 8.841779520776803e-06,
925
- "loss": 0.9171,
926
- "step": 650
927
- },
928
- {
929
- "epoch": 0.69,
930
- "grad_norm": 1.0674058891203713,
931
- "learning_rate": 8.823824468710312e-06,
932
- "loss": 0.9238,
933
- "step": 655
934
- },
935
- {
936
- "epoch": 0.7,
937
- "grad_norm": 1.0826543746678357,
938
- "learning_rate": 8.805749843653086e-06,
939
- "loss": 0.8903,
940
- "step": 660
941
- },
942
- {
943
- "epoch": 0.7,
944
- "grad_norm": 1.0474293060948185,
945
- "learning_rate": 8.787556210808101e-06,
946
- "loss": 0.8952,
947
- "step": 665
948
- },
949
- {
950
- "epoch": 0.71,
951
- "grad_norm": 1.1092322508696293,
952
- "learning_rate": 8.769244139099774e-06,
953
- "loss": 0.9191,
954
- "step": 670
955
- },
956
- {
957
- "epoch": 0.71,
958
- "grad_norm": 1.0453618423472522,
959
- "learning_rate": 8.750814201156157e-06,
960
- "loss": 0.9287,
961
- "step": 675
962
- },
963
- {
964
- "epoch": 0.72,
965
- "grad_norm": 1.0150902528617922,
966
- "learning_rate": 8.732266973291053e-06,
967
- "loss": 0.9005,
968
- "step": 680
969
- },
970
- {
971
- "epoch": 0.72,
972
- "grad_norm": 1.111573072134849,
973
- "learning_rate": 8.713603035485972e-06,
974
- "loss": 0.9061,
975
- "step": 685
976
- },
977
- {
978
- "epoch": 0.73,
979
- "grad_norm": 1.0266552996471214,
980
- "learning_rate": 8.694822971372012e-06,
981
- "loss": 0.8981,
982
- "step": 690
983
- },
984
- {
985
- "epoch": 0.73,
986
- "grad_norm": 1.026959416886306,
987
- "learning_rate": 8.675927368211599e-06,
988
- "loss": 0.9119,
989
- "step": 695
990
- },
991
- {
992
- "epoch": 0.74,
993
- "grad_norm": 0.990879098356618,
994
- "learning_rate": 8.656916816880122e-06,
995
- "loss": 0.934,
996
- "step": 700
997
- },
998
- {
999
- "epoch": 0.75,
1000
- "grad_norm": 1.016936193517629,
1001
- "learning_rate": 8.637791911847462e-06,
1002
- "loss": 0.9031,
1003
- "step": 705
1004
- },
1005
- {
1006
- "epoch": 0.75,
1007
- "grad_norm": 1.0105346034407392,
1008
- "learning_rate": 8.618553251159405e-06,
1009
- "loss": 0.8918,
1010
- "step": 710
1011
- },
1012
- {
1013
- "epoch": 0.76,
1014
- "grad_norm": 1.0219526658502593,
1015
- "learning_rate": 8.599201436418927e-06,
1016
- "loss": 0.9202,
1017
- "step": 715
1018
- },
1019
- {
1020
- "epoch": 0.76,
1021
- "grad_norm": 1.0611008297726183,
1022
- "learning_rate": 8.579737072767396e-06,
1023
- "loss": 0.8956,
1024
- "step": 720
1025
- },
1026
- {
1027
- "epoch": 0.77,
1028
- "grad_norm": 1.0532525094762688,
1029
- "learning_rate": 8.560160768865642e-06,
1030
- "loss": 0.8782,
1031
- "step": 725
1032
- },
1033
- {
1034
- "epoch": 0.77,
1035
- "grad_norm": 1.0472370063073,
1036
- "learning_rate": 8.540473136874926e-06,
1037
- "loss": 0.9215,
1038
- "step": 730
1039
- },
1040
- {
1041
- "epoch": 0.78,
1042
- "grad_norm": 1.0503901600633805,
1043
- "learning_rate": 8.520674792437793e-06,
1044
- "loss": 0.905,
1045
- "step": 735
1046
- },
1047
- {
1048
- "epoch": 0.78,
1049
- "grad_norm": 1.0699401745712223,
1050
- "learning_rate": 8.50076635465883e-06,
1051
- "loss": 0.8914,
1052
- "step": 740
1053
- },
1054
- {
1055
- "epoch": 0.79,
1056
- "grad_norm": 1.1604934245734189,
1057
- "learning_rate": 8.480748446085293e-06,
1058
- "loss": 0.923,
1059
- "step": 745
1060
- },
1061
- {
1062
- "epoch": 0.79,
1063
- "grad_norm": 1.0575469862405844,
1064
- "learning_rate": 8.460621692687656e-06,
1065
- "loss": 0.91,
1066
- "step": 750
1067
- },
1068
- {
1069
- "epoch": 0.8,
1070
- "grad_norm": 1.1861862918344839,
1071
- "learning_rate": 8.44038672384002e-06,
1072
- "loss": 0.9183,
1073
- "step": 755
1074
- },
1075
- {
1076
- "epoch": 0.8,
1077
- "grad_norm": 1.0866238920331526,
1078
- "learning_rate": 8.420044172300443e-06,
1079
- "loss": 0.9012,
1080
- "step": 760
1081
- },
1082
- {
1083
- "epoch": 0.81,
1084
- "grad_norm": 1.0963030089254635,
1085
- "learning_rate": 8.399594674191147e-06,
1086
- "loss": 0.8867,
1087
- "step": 765
1088
- },
1089
- {
1090
- "epoch": 0.81,
1091
- "grad_norm": 1.0516263694748806,
1092
- "learning_rate": 8.379038868978635e-06,
1093
- "loss": 0.9204,
1094
- "step": 770
1095
- },
1096
- {
1097
- "epoch": 0.82,
1098
- "grad_norm": 1.0602404388082067,
1099
- "learning_rate": 8.358377399453684e-06,
1100
- "loss": 0.8975,
1101
- "step": 775
1102
- },
1103
- {
1104
- "epoch": 0.82,
1105
- "grad_norm": 1.0524212623827451,
1106
- "learning_rate": 8.337610911711248e-06,
1107
- "loss": 0.9182,
1108
- "step": 780
1109
- },
1110
- {
1111
- "epoch": 0.83,
1112
- "grad_norm": 1.0486851629524967,
1113
- "learning_rate": 8.316740055130263e-06,
1114
- "loss": 0.8996,
1115
- "step": 785
1116
- },
1117
- {
1118
- "epoch": 0.83,
1119
- "grad_norm": 1.0382393662171674,
1120
- "learning_rate": 8.295765482353326e-06,
1121
- "loss": 0.8898,
1122
- "step": 790
1123
- },
1124
- {
1125
- "epoch": 0.84,
1126
- "grad_norm": 1.0801053233779676,
1127
- "learning_rate": 8.274687849266295e-06,
1128
- "loss": 0.8942,
1129
- "step": 795
1130
- },
1131
- {
1132
- "epoch": 0.85,
1133
- "grad_norm": 1.082914632918619,
1134
- "learning_rate": 8.253507814977779e-06,
1135
- "loss": 0.9335,
1136
- "step": 800
1137
- },
1138
- {
1139
- "epoch": 0.85,
1140
- "grad_norm": 1.115797305584172,
1141
- "learning_rate": 8.232226041798528e-06,
1142
- "loss": 0.8733,
1143
- "step": 805
1144
- },
1145
- {
1146
- "epoch": 0.86,
1147
- "grad_norm": 1.0758274816242523,
1148
- "learning_rate": 8.210843195220717e-06,
1149
- "loss": 0.9121,
1150
- "step": 810
1151
- },
1152
- {
1153
- "epoch": 0.86,
1154
- "grad_norm": 0.9966437564306923,
1155
- "learning_rate": 8.189359943897137e-06,
1156
- "loss": 0.9126,
1157
- "step": 815
1158
- },
1159
- {
1160
- "epoch": 0.87,
1161
- "grad_norm": 1.1254388184304862,
1162
- "learning_rate": 8.167776959620298e-06,
1163
- "loss": 0.9113,
1164
- "step": 820
1165
- },
1166
- {
1167
- "epoch": 0.87,
1168
- "grad_norm": 1.033615919920944,
1169
- "learning_rate": 8.1460949173014e-06,
1170
- "loss": 0.8863,
1171
- "step": 825
1172
- },
1173
- {
1174
- "epoch": 0.88,
1175
- "grad_norm": 1.0126421627367477,
1176
- "learning_rate": 8.124314494949247e-06,
1177
- "loss": 0.9044,
1178
- "step": 830
1179
- },
1180
- {
1181
- "epoch": 0.88,
1182
- "grad_norm": 1.0545539629522227,
1183
- "learning_rate": 8.102436373649029e-06,
1184
- "loss": 0.8942,
1185
- "step": 835
1186
- },
1187
- {
1188
- "epoch": 0.89,
1189
- "grad_norm": 1.004956283976033,
1190
- "learning_rate": 8.080461237541049e-06,
1191
- "loss": 0.9255,
1192
- "step": 840
1193
- },
1194
- {
1195
- "epoch": 0.89,
1196
- "grad_norm": 1.0862660155528163,
1197
- "learning_rate": 8.0583897737993e-06,
1198
- "loss": 0.9275,
1199
- "step": 845
1200
- },
1201
- {
1202
- "epoch": 0.9,
1203
- "grad_norm": 1.0697124134441602,
1204
- "learning_rate": 8.036222672609994e-06,
1205
- "loss": 0.9161,
1206
- "step": 850
1207
- },
1208
- {
1209
- "epoch": 0.9,
1210
- "grad_norm": 1.0639070724236763,
1211
- "learning_rate": 8.013960627149981e-06,
1212
- "loss": 0.8874,
1213
- "step": 855
1214
- },
1215
- {
1216
- "epoch": 0.91,
1217
- "grad_norm": 1.166900094582672,
1218
- "learning_rate": 7.991604333565062e-06,
1219
- "loss": 0.8897,
1220
- "step": 860
1221
- },
1222
- {
1223
- "epoch": 0.91,
1224
- "grad_norm": 1.1335592965754175,
1225
- "learning_rate": 7.969154490948225e-06,
1226
- "loss": 0.8964,
1227
- "step": 865
1228
- },
1229
- {
1230
- "epoch": 0.92,
1231
- "grad_norm": 1.0520381511921073,
1232
- "learning_rate": 7.946611801317794e-06,
1233
- "loss": 0.8736,
1234
- "step": 870
1235
- },
1236
- {
1237
- "epoch": 0.92,
1238
- "grad_norm": 1.16753848747216,
1239
- "learning_rate": 7.923976969595459e-06,
1240
- "loss": 0.9112,
1241
- "step": 875
1242
- },
1243
- {
1244
- "epoch": 0.93,
1245
- "grad_norm": 1.0772133099773151,
1246
- "learning_rate": 7.901250703584245e-06,
1247
- "loss": 0.9155,
1248
- "step": 880
1249
- },
1250
- {
1251
- "epoch": 0.94,
1252
- "grad_norm": 1.1464686627860388,
1253
- "learning_rate": 7.878433713946373e-06,
1254
- "loss": 0.8962,
1255
- "step": 885
1256
- },
1257
- {
1258
- "epoch": 0.94,
1259
- "grad_norm": 1.0835779136854178,
1260
- "learning_rate": 7.855526714181041e-06,
1261
- "loss": 0.9058,
1262
- "step": 890
1263
- },
1264
- {
1265
- "epoch": 0.95,
1266
- "grad_norm": 1.171366478493349,
1267
- "learning_rate": 7.832530420602113e-06,
1268
- "loss": 0.8756,
1269
- "step": 895
1270
- },
1271
- {
1272
- "epoch": 0.95,
1273
- "grad_norm": 1.040168900901505,
1274
- "learning_rate": 7.809445552315714e-06,
1275
- "loss": 0.8594,
1276
- "step": 900
1277
- },
1278
- {
1279
- "epoch": 0.96,
1280
- "grad_norm": 1.02166560480321,
1281
- "learning_rate": 7.786272831197745e-06,
1282
- "loss": 0.8935,
1283
- "step": 905
1284
- },
1285
- {
1286
- "epoch": 0.96,
1287
- "grad_norm": 1.1107392454183416,
1288
- "learning_rate": 7.763012981871314e-06,
1289
- "loss": 0.904,
1290
- "step": 910
1291
- },
1292
- {
1293
- "epoch": 0.97,
1294
- "grad_norm": 0.9896358057101541,
1295
- "learning_rate": 7.739666731684073e-06,
1296
- "loss": 0.9068,
1297
- "step": 915
1298
- },
1299
- {
1300
- "epoch": 0.97,
1301
- "grad_norm": 0.9788741930391702,
1302
- "learning_rate": 7.716234810685476e-06,
1303
- "loss": 0.8846,
1304
- "step": 920
1305
- },
1306
- {
1307
- "epoch": 0.98,
1308
- "grad_norm": 0.9931045191442167,
1309
- "learning_rate": 7.692717951603942e-06,
1310
- "loss": 0.8584,
1311
- "step": 925
1312
- },
1313
- {
1314
- "epoch": 0.98,
1315
- "grad_norm": 1.0645481368236074,
1316
- "learning_rate": 7.669116889823955e-06,
1317
- "loss": 0.8992,
1318
- "step": 930
1319
- },
1320
- {
1321
- "epoch": 0.99,
1322
- "grad_norm": 0.9816731950451545,
1323
- "learning_rate": 7.645432363363057e-06,
1324
- "loss": 0.8851,
1325
- "step": 935
1326
- },
1327
- {
1328
- "epoch": 0.99,
1329
- "grad_norm": 0.9899142833993008,
1330
- "learning_rate": 7.621665112848776e-06,
1331
- "loss": 0.8845,
1332
- "step": 940
1333
- },
1334
- {
1335
- "epoch": 1.0,
1336
- "grad_norm": 1.0638888300871174,
1337
- "learning_rate": 7.597815881495465e-06,
1338
- "loss": 0.8773,
1339
- "step": 945
1340
- },
1341
- {
1342
- "epoch": 1.0,
1343
- "grad_norm": 1.031662431521578,
1344
- "learning_rate": 7.573885415081059e-06,
1345
- "loss": 0.8258,
1346
- "step": 950
1347
- },
1348
- {
1349
- "epoch": 1.01,
1350
- "grad_norm": 1.040426497974828,
1351
- "learning_rate": 7.54987446192376e-06,
1352
- "loss": 0.7907,
1353
- "step": 955
1354
- },
1355
- {
1356
- "epoch": 1.01,
1357
- "grad_norm": 0.9887566903005512,
1358
- "learning_rate": 7.525783772858624e-06,
1359
- "loss": 0.8091,
1360
- "step": 960
1361
- },
1362
- {
1363
- "epoch": 1.02,
1364
- "grad_norm": 1.0542179478307365,
1365
- "learning_rate": 7.5016141012141e-06,
1366
- "loss": 0.7815,
1367
- "step": 965
1368
- },
1369
- {
1370
- "epoch": 1.03,
1371
- "grad_norm": 1.0738731959256824,
1372
- "learning_rate": 7.477366202788456e-06,
1373
- "loss": 0.7734,
1374
- "step": 970
1375
- },
1376
- {
1377
- "epoch": 1.03,
1378
- "grad_norm": 0.9975806760235982,
1379
- "learning_rate": 7.45304083582616e-06,
1380
- "loss": 0.7824,
1381
- "step": 975
1382
- },
1383
- {
1384
- "epoch": 1.04,
1385
- "grad_norm": 1.005274019925314,
1386
- "learning_rate": 7.4286387609941544e-06,
1387
- "loss": 0.769,
1388
- "step": 980
1389
- },
1390
- {
1391
- "epoch": 1.04,
1392
- "grad_norm": 1.0937329481520819,
1393
- "learning_rate": 7.40416074135808e-06,
1394
- "loss": 0.791,
1395
- "step": 985
1396
- },
1397
- {
1398
- "epoch": 1.05,
1399
- "grad_norm": 0.9987999174071854,
1400
- "learning_rate": 7.379607542358414e-06,
1401
- "loss": 0.7983,
1402
- "step": 990
1403
- },
1404
- {
1405
- "epoch": 1.05,
1406
- "grad_norm": 1.074721973505265,
1407
- "learning_rate": 7.3549799317865235e-06,
1408
- "loss": 0.8264,
1409
- "step": 995
1410
- },
1411
- {
1412
- "epoch": 1.06,
1413
- "grad_norm": 1.0023766389640552,
1414
- "learning_rate": 7.330278679760673e-06,
1415
- "loss": 0.8166,
1416
- "step": 1000
1417
- },
1418
- {
1419
- "epoch": 1.06,
1420
- "grad_norm": 1.0263488491446793,
1421
- "learning_rate": 7.3055045587019315e-06,
1422
- "loss": 0.7756,
1423
- "step": 1005
1424
- },
1425
- {
1426
- "epoch": 1.07,
1427
- "grad_norm": 1.222252310199244,
1428
- "learning_rate": 7.280658343310016e-06,
1429
- "loss": 0.8113,
1430
- "step": 1010
1431
- },
1432
- {
1433
- "epoch": 1.07,
1434
- "grad_norm": 1.0803171037496995,
1435
- "learning_rate": 7.255740810539078e-06,
1436
- "loss": 0.7773,
1437
- "step": 1015
1438
- },
1439
- {
1440
- "epoch": 1.08,
1441
- "grad_norm": 1.0429385720996782,
1442
- "learning_rate": 7.230752739573398e-06,
1443
- "loss": 0.7959,
1444
- "step": 1020
1445
- },
1446
- {
1447
- "epoch": 1.08,
1448
- "grad_norm": 1.0525788357504489,
1449
- "learning_rate": 7.205694911803019e-06,
1450
- "loss": 0.7962,
1451
- "step": 1025
1452
- },
1453
- {
1454
- "epoch": 1.09,
1455
- "grad_norm": 0.986228023483833,
1456
- "learning_rate": 7.18056811079932e-06,
1457
- "loss": 0.79,
1458
- "step": 1030
1459
- },
1460
- {
1461
- "epoch": 1.09,
1462
- "grad_norm": 1.031179895714868,
1463
- "learning_rate": 7.155373122290508e-06,
1464
- "loss": 0.8101,
1465
- "step": 1035
1466
- },
1467
- {
1468
- "epoch": 1.1,
1469
- "grad_norm": 1.0379629517770603,
1470
- "learning_rate": 7.13011073413705e-06,
1471
- "loss": 0.781,
1472
- "step": 1040
1473
- },
1474
- {
1475
- "epoch": 1.1,
1476
- "grad_norm": 1.033153108919124,
1477
- "learning_rate": 7.1047817363070325e-06,
1478
- "loss": 0.8418,
1479
- "step": 1045
1480
- },
1481
- {
1482
- "epoch": 1.11,
1483
- "grad_norm": 1.0357203376239867,
1484
- "learning_rate": 7.079386920851466e-06,
1485
- "loss": 0.8065,
1486
- "step": 1050
1487
- },
1488
- {
1489
- "epoch": 1.11,
1490
- "grad_norm": 1.0540192082846203,
1491
- "learning_rate": 7.053927081879505e-06,
1492
- "loss": 0.7956,
1493
- "step": 1055
1494
- },
1495
- {
1496
- "epoch": 1.12,
1497
- "grad_norm": 1.0552828635725824,
1498
- "learning_rate": 7.0284030155336315e-06,
1499
- "loss": 0.7945,
1500
- "step": 1060
1501
- },
1502
- {
1503
- "epoch": 1.13,
1504
- "grad_norm": 0.9810627289945896,
1505
- "learning_rate": 7.002815519964745e-06,
1506
- "loss": 0.7965,
1507
- "step": 1065
1508
- },
1509
- {
1510
- "epoch": 1.13,
1511
- "grad_norm": 1.0916102744452092,
1512
- "learning_rate": 6.977165395307215e-06,
1513
- "loss": 0.7991,
1514
- "step": 1070
1515
- },
1516
- {
1517
- "epoch": 1.14,
1518
- "grad_norm": 1.1543690326062077,
1519
- "learning_rate": 6.951453443653852e-06,
1520
- "loss": 0.7896,
1521
- "step": 1075
1522
- },
1523
- {
1524
- "epoch": 1.14,
1525
- "grad_norm": 1.1170103600405488,
1526
- "learning_rate": 6.9256804690308276e-06,
1527
- "loss": 0.7828,
1528
- "step": 1080
1529
- },
1530
- {
1531
- "epoch": 1.15,
1532
- "grad_norm": 1.0526733296614392,
1533
- "learning_rate": 6.899847277372538e-06,
1534
- "loss": 0.7923,
1535
- "step": 1085
1536
- },
1537
- {
1538
- "epoch": 1.15,
1539
- "grad_norm": 1.0770254342023697,
1540
- "learning_rate": 6.873954676496395e-06,
1541
- "loss": 0.8128,
1542
- "step": 1090
1543
- },
1544
- {
1545
- "epoch": 1.16,
1546
- "grad_norm": 1.037705594081886,
1547
- "learning_rate": 6.848003476077567e-06,
1548
- "loss": 0.7856,
1549
- "step": 1095
1550
- },
1551
- {
1552
- "epoch": 1.16,
1553
- "grad_norm": 1.0319807068181204,
1554
- "learning_rate": 6.8219944876236645e-06,
1555
- "loss": 0.7949,
1556
- "step": 1100
1557
- },
1558
- {
1559
- "epoch": 1.17,
1560
- "grad_norm": 1.0927555007584646,
1561
- "learning_rate": 6.795928524449354e-06,
1562
- "loss": 0.7941,
1563
- "step": 1105
1564
- },
1565
- {
1566
- "epoch": 1.17,
1567
- "grad_norm": 0.9869897993273156,
1568
- "learning_rate": 6.769806401650936e-06,
1569
- "loss": 0.7667,
1570
- "step": 1110
1571
- },
1572
- {
1573
- "epoch": 1.18,
1574
- "grad_norm": 1.0055956062759406,
1575
- "learning_rate": 6.743628936080852e-06,
1576
- "loss": 0.7855,
1577
- "step": 1115
1578
- },
1579
- {
1580
- "epoch": 1.18,
1581
- "grad_norm": 1.0283367881989096,
1582
- "learning_rate": 6.717396946322137e-06,
1583
- "loss": 0.7745,
1584
- "step": 1120
1585
- },
1586
- {
1587
- "epoch": 1.19,
1588
- "grad_norm": 1.0345829389670045,
1589
- "learning_rate": 6.6911112526628295e-06,
1590
- "loss": 0.7842,
1591
- "step": 1125
1592
- },
1593
- {
1594
- "epoch": 1.19,
1595
- "grad_norm": 1.0711135328845822,
1596
- "learning_rate": 6.664772677070316e-06,
1597
- "loss": 0.7558,
1598
- "step": 1130
1599
- },
1600
- {
1601
- "epoch": 1.2,
1602
- "grad_norm": 0.9877769296594265,
1603
- "learning_rate": 6.638382043165628e-06,
1604
- "loss": 0.7788,
1605
- "step": 1135
1606
- },
1607
- {
1608
- "epoch": 1.2,
1609
- "grad_norm": 1.131836138091609,
1610
- "learning_rate": 6.611940176197688e-06,
1611
- "loss": 0.7901,
1612
- "step": 1140
1613
- },
1614
- {
1615
- "epoch": 1.21,
1616
- "grad_norm": 1.058249641590972,
1617
- "learning_rate": 6.585447903017506e-06,
1618
- "loss": 0.7936,
1619
- "step": 1145
1620
- },
1621
- {
1622
- "epoch": 1.22,
1623
- "grad_norm": 1.073971008814511,
1624
- "learning_rate": 6.558906052052314e-06,
1625
- "loss": 0.7835,
1626
- "step": 1150
1627
- },
1628
- {
1629
- "epoch": 1.22,
1630
- "grad_norm": 1.0491301969369466,
1631
- "learning_rate": 6.532315453279673e-06,
1632
- "loss": 0.7902,
1633
- "step": 1155
1634
- },
1635
- {
1636
- "epoch": 1.23,
1637
- "grad_norm": 1.046297097483487,
1638
- "learning_rate": 6.505676938201512e-06,
1639
- "loss": 0.7767,
1640
- "step": 1160
1641
- },
1642
- {
1643
- "epoch": 1.23,
1644
- "grad_norm": 1.046022517875942,
1645
- "learning_rate": 6.478991339818128e-06,
1646
- "loss": 0.8091,
1647
- "step": 1165
1648
- },
1649
- {
1650
- "epoch": 1.24,
1651
- "grad_norm": 1.0086633248074561,
1652
- "learning_rate": 6.4522594926021355e-06,
1653
- "loss": 0.7797,
1654
- "step": 1170
1655
- },
1656
- {
1657
- "epoch": 1.24,
1658
- "grad_norm": 1.0965955454651117,
1659
- "learning_rate": 6.425482232472377e-06,
1660
- "loss": 0.7702,
1661
- "step": 1175
1662
- },
1663
- {
1664
- "epoch": 1.25,
1665
- "grad_norm": 1.0362189192150881,
1666
- "learning_rate": 6.3986603967677805e-06,
1667
- "loss": 0.7931,
1668
- "step": 1180
1669
- },
1670
- {
1671
- "epoch": 1.25,
1672
- "grad_norm": 1.110468197330772,
1673
- "learning_rate": 6.371794824221173e-06,
1674
- "loss": 0.7917,
1675
- "step": 1185
1676
- },
1677
- {
1678
- "epoch": 1.26,
1679
- "grad_norm": 1.0163659020071605,
1680
- "learning_rate": 6.344886354933058e-06,
1681
- "loss": 0.7886,
1682
- "step": 1190
1683
- },
1684
- {
1685
- "epoch": 1.26,
1686
- "grad_norm": 1.0115549227695064,
1687
- "learning_rate": 6.3179358303453386e-06,
1688
- "loss": 0.7511,
1689
- "step": 1195
1690
- },
1691
- {
1692
- "epoch": 1.27,
1693
- "grad_norm": 1.0872016119161863,
1694
- "learning_rate": 6.290944093215016e-06,
1695
- "loss": 0.8036,
1696
- "step": 1200
1697
- },
1698
- {
1699
- "epoch": 1.27,
1700
- "grad_norm": 1.0553500518484338,
1701
- "learning_rate": 6.263911987587822e-06,
1702
- "loss": 0.7938,
1703
- "step": 1205
1704
- },
1705
- {
1706
- "epoch": 1.28,
1707
- "grad_norm": 0.993815270148442,
1708
- "learning_rate": 6.236840358771837e-06,
1709
- "loss": 0.7788,
1710
- "step": 1210
1711
- },
1712
- {
1713
- "epoch": 1.28,
1714
- "grad_norm": 1.0605675582324252,
1715
- "learning_rate": 6.20973005331105e-06,
1716
- "loss": 0.7781,
1717
- "step": 1215
1718
- },
1719
- {
1720
- "epoch": 1.29,
1721
- "grad_norm": 1.0965085071552372,
1722
- "learning_rate": 6.1825819189588885e-06,
1723
- "loss": 0.7872,
1724
- "step": 1220
1725
- },
1726
- {
1727
- "epoch": 1.29,
1728
- "grad_norm": 1.040866195350916,
1729
- "learning_rate": 6.155396804651714e-06,
1730
- "loss": 0.7966,
1731
- "step": 1225
1732
- },
1733
- {
1734
- "epoch": 1.3,
1735
- "grad_norm": 1.0593376609536802,
1736
- "learning_rate": 6.128175560482264e-06,
1737
- "loss": 0.7832,
1738
- "step": 1230
1739
- },
1740
- {
1741
- "epoch": 1.31,
1742
- "grad_norm": 1.0081718313330637,
1743
- "learning_rate": 6.1009190376730785e-06,
1744
- "loss": 0.7772,
1745
- "step": 1235
1746
- },
1747
- {
1748
- "epoch": 1.31,
1749
- "grad_norm": 0.9892554397828908,
1750
- "learning_rate": 6.07362808854988e-06,
1751
- "loss": 0.7856,
1752
- "step": 1240
1753
- },
1754
- {
1755
- "epoch": 1.32,
1756
- "grad_norm": 1.0515874983049542,
1757
- "learning_rate": 6.046303566514919e-06,
1758
- "loss": 0.7812,
1759
- "step": 1245
1760
- },
1761
- {
1762
- "epoch": 1.32,
1763
- "grad_norm": 1.01738547568124,
1764
- "learning_rate": 6.018946326020287e-06,
1765
- "loss": 0.7824,
1766
- "step": 1250
1767
- },
1768
- {
1769
- "epoch": 1.33,
1770
- "grad_norm": 0.992994982201507,
1771
- "learning_rate": 5.991557222541201e-06,
1772
- "loss": 0.7842,
1773
- "step": 1255
1774
- },
1775
- {
1776
- "epoch": 1.33,
1777
- "grad_norm": 0.9928822859609259,
1778
- "learning_rate": 5.964137112549251e-06,
1779
- "loss": 0.7906,
1780
- "step": 1260
1781
- },
1782
- {
1783
- "epoch": 1.34,
1784
- "grad_norm": 1.0673862770846931,
1785
- "learning_rate": 5.9366868534856115e-06,
1786
- "loss": 0.7896,
1787
- "step": 1265
1788
- },
1789
- {
1790
- "epoch": 1.34,
1791
- "grad_norm": 1.0627251705995355,
1792
- "learning_rate": 5.909207303734241e-06,
1793
- "loss": 0.7965,
1794
- "step": 1270
1795
- },
1796
- {
1797
- "epoch": 1.35,
1798
- "grad_norm": 1.0050051635503012,
1799
- "learning_rate": 5.881699322595031e-06,
1800
- "loss": 0.7775,
1801
- "step": 1275
1802
- },
1803
- {
1804
- "epoch": 1.35,
1805
- "grad_norm": 1.0049258262531797,
1806
- "learning_rate": 5.854163770256934e-06,
1807
- "loss": 0.7659,
1808
- "step": 1280
1809
- },
1810
- {
1811
- "epoch": 1.36,
1812
- "grad_norm": 1.1097225296353777,
1813
- "learning_rate": 5.826601507771073e-06,
1814
- "loss": 0.7699,
1815
- "step": 1285
1816
- },
1817
- {
1818
- "epoch": 1.36,
1819
- "grad_norm": 1.0610730723756006,
1820
- "learning_rate": 5.799013397023806e-06,
1821
- "loss": 0.7996,
1822
- "step": 1290
1823
- },
1824
- {
1825
- "epoch": 1.37,
1826
- "grad_norm": 1.0285633823079718,
1827
- "learning_rate": 5.771400300709785e-06,
1828
- "loss": 0.7829,
1829
- "step": 1295
1830
- },
1831
- {
1832
- "epoch": 1.37,
1833
- "grad_norm": 1.0484599021027985,
1834
- "learning_rate": 5.743763082304973e-06,
1835
- "loss": 0.7619,
1836
- "step": 1300
1837
- },
1838
- {
1839
- "epoch": 1.38,
1840
- "grad_norm": 1.0137701786577156,
1841
- "learning_rate": 5.7161026060396375e-06,
1842
- "loss": 0.798,
1843
- "step": 1305
1844
- },
1845
- {
1846
- "epoch": 1.38,
1847
- "grad_norm": 1.0289414598602742,
1848
- "learning_rate": 5.688419736871341e-06,
1849
- "loss": 0.7827,
1850
- "step": 1310
1851
- },
1852
- {
1853
- "epoch": 1.39,
1854
- "grad_norm": 1.058376335913828,
1855
- "learning_rate": 5.660715340457874e-06,
1856
- "loss": 0.7921,
1857
- "step": 1315
1858
- },
1859
- {
1860
- "epoch": 1.39,
1861
- "grad_norm": 1.0011219088912342,
1862
- "learning_rate": 5.632990283130204e-06,
1863
- "loss": 0.781,
1864
- "step": 1320
1865
- },
1866
- {
1867
- "epoch": 1.4,
1868
- "grad_norm": 0.984264955084216,
1869
- "learning_rate": 5.605245431865368e-06,
1870
- "loss": 0.7772,
1871
- "step": 1325
1872
- },
1873
- {
1874
- "epoch": 1.41,
1875
- "grad_norm": 1.0151072044919451,
1876
- "learning_rate": 5.577481654259377e-06,
1877
- "loss": 0.7735,
1878
- "step": 1330
1879
- },
1880
- {
1881
- "epoch": 1.41,
1882
- "grad_norm": 1.063533843295668,
1883
- "learning_rate": 5.549699818500074e-06,
1884
- "loss": 0.7682,
1885
- "step": 1335
1886
- },
1887
- {
1888
- "epoch": 1.42,
1889
- "grad_norm": 1.0434635789190496,
1890
- "learning_rate": 5.521900793339989e-06,
1891
- "loss": 0.7915,
1892
- "step": 1340
1893
- },
1894
- {
1895
- "epoch": 1.42,
1896
- "grad_norm": 1.0587561050751115,
1897
- "learning_rate": 5.494085448069181e-06,
1898
- "loss": 0.7997,
1899
- "step": 1345
1900
- },
1901
- {
1902
- "epoch": 1.43,
1903
- "grad_norm": 1.0758864296233028,
1904
- "learning_rate": 5.466254652488036e-06,
1905
- "loss": 0.7964,
1906
- "step": 1350
1907
- },
1908
- {
1909
- "epoch": 1.43,
1910
- "grad_norm": 1.0556751372323996,
1911
- "learning_rate": 5.438409276880089e-06,
1912
- "loss": 0.8062,
1913
- "step": 1355
1914
- },
1915
- {
1916
- "epoch": 1.44,
1917
- "grad_norm": 0.9792859835280993,
1918
- "learning_rate": 5.410550191984798e-06,
1919
- "loss": 0.787,
1920
- "step": 1360
1921
- },
1922
- {
1923
- "epoch": 1.44,
1924
- "grad_norm": 1.0231438624972786,
1925
- "learning_rate": 5.3826782689703115e-06,
1926
- "loss": 0.7803,
1927
- "step": 1365
1928
- },
1929
- {
1930
- "epoch": 1.45,
1931
- "grad_norm": 1.0660534726358564,
1932
- "learning_rate": 5.354794379406242e-06,
1933
- "loss": 0.78,
1934
- "step": 1370
1935
- },
1936
- {
1937
- "epoch": 1.45,
1938
- "grad_norm": 0.9527414539128428,
1939
- "learning_rate": 5.3268993952363936e-06,
1940
- "loss": 0.796,
1941
- "step": 1375
1942
- },
1943
- {
1944
- "epoch": 1.46,
1945
- "grad_norm": 0.9870931434726852,
1946
- "learning_rate": 5.29899418875151e-06,
1947
- "loss": 0.7652,
1948
- "step": 1380
1949
- },
1950
- {
1951
- "epoch": 1.46,
1952
- "grad_norm": 1.0537299945885146,
1953
- "learning_rate": 5.271079632561992e-06,
1954
- "loss": 0.7854,
1955
- "step": 1385
1956
- },
1957
- {
1958
- "epoch": 1.47,
1959
- "grad_norm": 1.1396368040574916,
1960
- "learning_rate": 5.243156599570606e-06,
1961
- "loss": 0.7617,
1962
- "step": 1390
1963
- },
1964
- {
1965
- "epoch": 1.47,
1966
- "grad_norm": 1.0924704024745873,
1967
- "learning_rate": 5.2152259629451986e-06,
1968
- "loss": 0.7713,
1969
- "step": 1395
1970
- },
1971
- {
1972
- "epoch": 1.48,
1973
- "grad_norm": 1.021493417245078,
1974
- "learning_rate": 5.18728859609138e-06,
1975
- "loss": 0.7609,
1976
- "step": 1400
1977
- },
1978
- {
1979
- "epoch": 1.48,
1980
- "grad_norm": 1.0148194958691719,
1981
- "learning_rate": 5.159345372625223e-06,
1982
- "loss": 0.7788,
1983
- "step": 1405
1984
- },
1985
- {
1986
- "epoch": 1.49,
1987
- "grad_norm": 1.0402765811164951,
1988
- "learning_rate": 5.131397166345938e-06,
1989
- "loss": 0.7599,
1990
- "step": 1410
1991
- },
1992
- {
1993
- "epoch": 1.5,
1994
- "grad_norm": 0.9966250584272072,
1995
- "learning_rate": 5.103444851208549e-06,
1996
- "loss": 0.7874,
1997
- "step": 1415
1998
- },
1999
- {
2000
- "epoch": 1.5,
2001
- "grad_norm": 0.9871275158697829,
2002
- "learning_rate": 5.075489301296567e-06,
2003
- "loss": 0.7566,
2004
- "step": 1420
2005
- },
2006
- {
2007
- "epoch": 1.51,
2008
- "grad_norm": 1.0896451679213162,
2009
- "learning_rate": 5.047531390794661e-06,
2010
- "loss": 0.7699,
2011
- "step": 1425
2012
- },
2013
- {
2014
- "epoch": 1.51,
2015
- "grad_norm": 1.1203863877988638,
2016
- "learning_rate": 5.019571993961307e-06,
2017
- "loss": 0.8088,
2018
- "step": 1430
2019
- },
2020
- {
2021
- "epoch": 1.52,
2022
- "grad_norm": 1.03311513179617,
2023
- "learning_rate": 4.9916119851014664e-06,
2024
- "loss": 0.7739,
2025
- "step": 1435
2026
- },
2027
- {
2028
- "epoch": 1.52,
2029
- "grad_norm": 1.0389351009988612,
2030
- "learning_rate": 4.96365223853924e-06,
2031
- "loss": 0.7816,
2032
- "step": 1440
2033
- },
2034
- {
2035
- "epoch": 1.53,
2036
- "grad_norm": 0.9960641498632878,
2037
- "learning_rate": 4.93569362859052e-06,
2038
- "loss": 0.775,
2039
- "step": 1445
2040
- },
2041
- {
2042
- "epoch": 1.53,
2043
- "grad_norm": 0.9388823495229471,
2044
- "learning_rate": 4.907737029535664e-06,
2045
- "loss": 0.756,
2046
- "step": 1450
2047
- },
2048
- {
2049
- "epoch": 1.54,
2050
- "grad_norm": 1.0662538022442485,
2051
- "learning_rate": 4.8797833155921396e-06,
2052
- "loss": 0.7992,
2053
- "step": 1455
2054
- },
2055
- {
2056
- "epoch": 1.54,
2057
- "grad_norm": 1.0350212904727674,
2058
- "learning_rate": 4.8518333608872015e-06,
2059
- "loss": 0.7595,
2060
- "step": 1460
2061
- },
2062
- {
2063
- "epoch": 1.55,
2064
- "grad_norm": 0.9967538128228846,
2065
- "learning_rate": 4.823888039430551e-06,
2066
- "loss": 0.7582,
2067
- "step": 1465
2068
- },
2069
- {
2070
- "epoch": 1.55,
2071
- "grad_norm": 1.0139079612075497,
2072
- "learning_rate": 4.795948225087001e-06,
2073
- "loss": 0.7709,
2074
- "step": 1470
2075
- },
2076
- {
2077
- "epoch": 1.56,
2078
- "grad_norm": 1.0510044388149635,
2079
- "learning_rate": 4.7680147915491585e-06,
2080
- "loss": 0.7692,
2081
- "step": 1475
2082
- },
2083
- {
2084
- "epoch": 1.56,
2085
- "grad_norm": 1.0641353890612333,
2086
- "learning_rate": 4.740088612310096e-06,
2087
- "loss": 0.7847,
2088
- "step": 1480
2089
- },
2090
- {
2091
- "epoch": 1.57,
2092
- "grad_norm": 1.0192435995305715,
2093
- "learning_rate": 4.7121705606360424e-06,
2094
- "loss": 0.7732,
2095
- "step": 1485
2096
- },
2097
- {
2098
- "epoch": 1.57,
2099
- "grad_norm": 1.0076325415256413,
2100
- "learning_rate": 4.684261509539072e-06,
2101
- "loss": 0.7701,
2102
- "step": 1490
2103
- },
2104
- {
2105
- "epoch": 1.58,
2106
- "grad_norm": 0.9707102286396411,
2107
- "learning_rate": 4.65636233174981e-06,
2108
- "loss": 0.77,
2109
- "step": 1495
2110
- },
2111
- {
2112
- "epoch": 1.59,
2113
- "grad_norm": 1.0835636202474823,
2114
- "learning_rate": 4.628473899690133e-06,
2115
- "loss": 0.7849,
2116
- "step": 1500
2117
- },
2118
- {
2119
- "epoch": 1.59,
2120
- "grad_norm": 1.0157410126136626,
2121
- "learning_rate": 4.600597085445894e-06,
2122
- "loss": 0.784,
2123
- "step": 1505
2124
- },
2125
- {
2126
- "epoch": 1.6,
2127
- "grad_norm": 1.0616186913926178,
2128
- "learning_rate": 4.572732760739653e-06,
2129
- "loss": 0.7785,
2130
- "step": 1510
2131
- },
2132
- {
2133
- "epoch": 1.6,
2134
- "grad_norm": 1.006516145178769,
2135
- "learning_rate": 4.5448817969034165e-06,
2136
- "loss": 0.7753,
2137
- "step": 1515
2138
- },
2139
- {
2140
- "epoch": 1.61,
2141
- "grad_norm": 1.0480529823653495,
2142
- "learning_rate": 4.517045064851386e-06,
2143
- "loss": 0.7989,
2144
- "step": 1520
2145
- },
2146
- {
2147
- "epoch": 1.61,
2148
- "grad_norm": 1.0432567441250045,
2149
- "learning_rate": 4.489223435052732e-06,
2150
- "loss": 0.7946,
2151
- "step": 1525
2152
- },
2153
- {
2154
- "epoch": 1.62,
2155
- "grad_norm": 1.0461342178531015,
2156
- "learning_rate": 4.461417777504363e-06,
2157
- "loss": 0.7676,
2158
- "step": 1530
2159
- },
2160
- {
2161
- "epoch": 1.62,
2162
- "grad_norm": 1.0045382622138492,
2163
- "learning_rate": 4.433628961703733e-06,
2164
- "loss": 0.7651,
2165
- "step": 1535
2166
- },
2167
- {
2168
- "epoch": 1.63,
2169
- "grad_norm": 0.9890094489435823,
2170
- "learning_rate": 4.405857856621644e-06,
2171
- "loss": 0.7943,
2172
- "step": 1540
2173
- },
2174
- {
2175
- "epoch": 1.63,
2176
- "grad_norm": 1.0127639919495397,
2177
- "learning_rate": 4.378105330675074e-06,
2178
- "loss": 0.7895,
2179
- "step": 1545
2180
- },
2181
- {
2182
- "epoch": 1.64,
2183
- "grad_norm": 1.0398544121817734,
2184
- "learning_rate": 4.350372251700025e-06,
2185
- "loss": 0.8004,
2186
- "step": 1550
2187
- },
2188
- {
2189
- "epoch": 1.64,
2190
- "grad_norm": 1.037857459368961,
2191
- "learning_rate": 4.322659486924373e-06,
2192
- "loss": 0.7963,
2193
- "step": 1555
2194
- },
2195
- {
2196
- "epoch": 1.65,
2197
- "grad_norm": 1.106103919813531,
2198
- "learning_rate": 4.294967902940768e-06,
2199
- "loss": 0.787,
2200
- "step": 1560
2201
- },
2202
- {
2203
- "epoch": 1.65,
2204
- "grad_norm": 1.0865617469424886,
2205
- "learning_rate": 4.267298365679522e-06,
2206
- "loss": 0.788,
2207
- "step": 1565
2208
- },
2209
- {
2210
- "epoch": 1.66,
2211
- "grad_norm": 1.0303226290700802,
2212
- "learning_rate": 4.239651740381534e-06,
2213
- "loss": 0.7642,
2214
- "step": 1570
2215
- },
2216
- {
2217
- "epoch": 1.66,
2218
- "grad_norm": 1.0512505166055992,
2219
- "learning_rate": 4.212028891571237e-06,
2220
- "loss": 0.7832,
2221
- "step": 1575
2222
- },
2223
- {
2224
- "epoch": 1.67,
2225
- "grad_norm": 1.0750316874597787,
2226
- "learning_rate": 4.184430683029552e-06,
2227
- "loss": 0.7599,
2228
- "step": 1580
2229
- },
2230
- {
2231
- "epoch": 1.68,
2232
- "grad_norm": 1.0622608820174235,
2233
- "learning_rate": 4.156857977766896e-06,
2234
- "loss": 0.7841,
2235
- "step": 1585
2236
- },
2237
- {
2238
- "epoch": 1.68,
2239
- "grad_norm": 1.0023528643121005,
2240
- "learning_rate": 4.129311637996182e-06,
2241
- "loss": 0.7845,
2242
- "step": 1590
2243
- },
2244
- {
2245
- "epoch": 1.69,
2246
- "grad_norm": 1.0597451506484419,
2247
- "learning_rate": 4.101792525105857e-06,
2248
- "loss": 0.7802,
2249
- "step": 1595
2250
- },
2251
- {
2252
- "epoch": 1.69,
2253
- "grad_norm": 0.9622973096022323,
2254
- "learning_rate": 4.0743014996329764e-06,
2255
- "loss": 0.7678,
2256
- "step": 1600
2257
- },
2258
- {
2259
- "epoch": 1.7,
2260
- "grad_norm": 1.051095411122212,
2261
- "learning_rate": 4.046839421236276e-06,
2262
- "loss": 0.7972,
2263
- "step": 1605
2264
- },
2265
- {
2266
- "epoch": 1.7,
2267
- "grad_norm": 1.0082128589578265,
2268
- "learning_rate": 4.019407148669312e-06,
2269
- "loss": 0.7948,
2270
- "step": 1610
2271
- },
2272
- {
2273
- "epoch": 1.71,
2274
- "grad_norm": 1.0901759578931909,
2275
- "learning_rate": 3.992005539753592e-06,
2276
- "loss": 0.7914,
2277
- "step": 1615
2278
- },
2279
- {
2280
- "epoch": 1.71,
2281
- "grad_norm": 1.0584302499373435,
2282
- "learning_rate": 3.964635451351758e-06,
2283
- "loss": 0.7821,
2284
- "step": 1620
2285
- },
2286
- {
2287
- "epoch": 1.72,
2288
- "grad_norm": 1.043189384648134,
2289
- "learning_rate": 3.937297739340783e-06,
2290
- "loss": 0.778,
2291
- "step": 1625
2292
- },
2293
- {
2294
- "epoch": 1.72,
2295
- "grad_norm": 1.0245392793145456,
2296
- "learning_rate": 3.909993258585219e-06,
2297
- "loss": 0.7908,
2298
- "step": 1630
2299
- },
2300
- {
2301
- "epoch": 1.73,
2302
- "grad_norm": 1.0082519645854728,
2303
- "learning_rate": 3.882722862910458e-06,
2304
- "loss": 0.7793,
2305
- "step": 1635
2306
- },
2307
- {
2308
- "epoch": 1.73,
2309
- "grad_norm": 1.0211341337802105,
2310
- "learning_rate": 3.8554874050760345e-06,
2311
- "loss": 0.8042,
2312
- "step": 1640
2313
- },
2314
- {
2315
- "epoch": 1.74,
2316
- "grad_norm": 0.9920127978660441,
2317
- "learning_rate": 3.828287736748957e-06,
2318
- "loss": 0.758,
2319
- "step": 1645
2320
- },
2321
- {
2322
- "epoch": 1.74,
2323
- "grad_norm": 1.0187229111502758,
2324
- "learning_rate": 3.8011247084770754e-06,
2325
- "loss": 0.7986,
2326
- "step": 1650
2327
- },
2328
- {
2329
- "epoch": 1.75,
2330
- "grad_norm": 0.9982295207578855,
2331
- "learning_rate": 3.773999169662489e-06,
2332
- "loss": 0.7623,
2333
- "step": 1655
2334
- },
2335
- {
2336
- "epoch": 1.75,
2337
- "grad_norm": 1.025180441312379,
2338
- "learning_rate": 3.746911968534982e-06,
2339
- "loss": 0.7454,
2340
- "step": 1660
2341
- },
2342
- {
2343
- "epoch": 1.76,
2344
- "grad_norm": 0.9884338430346545,
2345
- "learning_rate": 3.7198639521254988e-06,
2346
- "loss": 0.7671,
2347
- "step": 1665
2348
- },
2349
- {
2350
- "epoch": 1.76,
2351
- "grad_norm": 0.9685352318412103,
2352
- "learning_rate": 3.6928559662396574e-06,
2353
- "loss": 0.7583,
2354
- "step": 1670
2355
- },
2356
- {
2357
- "epoch": 1.77,
2358
- "grad_norm": 1.029404957630594,
2359
- "learning_rate": 3.6658888554312967e-06,
2360
- "loss": 0.7868,
2361
- "step": 1675
2362
- },
2363
- {
2364
- "epoch": 1.78,
2365
- "grad_norm": 0.9921023940146521,
2366
- "learning_rate": 3.6389634629760763e-06,
2367
- "loss": 0.7555,
2368
- "step": 1680
2369
- },
2370
- {
2371
- "epoch": 1.78,
2372
- "grad_norm": 1.017350986680598,
2373
- "learning_rate": 3.612080630845096e-06,
2374
- "loss": 0.7905,
2375
- "step": 1685
2376
- },
2377
- {
2378
- "epoch": 1.79,
2379
- "grad_norm": 1.0430603602540587,
2380
- "learning_rate": 3.5852411996785776e-06,
2381
- "loss": 0.7947,
2382
- "step": 1690
2383
- },
2384
- {
2385
- "epoch": 1.79,
2386
- "grad_norm": 0.9737056004061376,
2387
- "learning_rate": 3.558446008759569e-06,
2388
- "loss": 0.7789,
2389
- "step": 1695
2390
- },
2391
- {
2392
- "epoch": 1.8,
2393
- "grad_norm": 1.0212119960635129,
2394
- "learning_rate": 3.5316958959876985e-06,
2395
- "loss": 0.7671,
2396
- "step": 1700
2397
- },
2398
- {
2399
- "epoch": 1.8,
2400
- "grad_norm": 1.0072141418910243,
2401
- "learning_rate": 3.504991697852983e-06,
2402
- "loss": 0.7844,
2403
- "step": 1705
2404
- },
2405
- {
2406
- "epoch": 1.81,
2407
- "grad_norm": 1.059809521658242,
2408
- "learning_rate": 3.4783342494096627e-06,
2409
- "loss": 0.7845,
2410
- "step": 1710
2411
- },
2412
- {
2413
- "epoch": 1.81,
2414
- "grad_norm": 1.032182317108509,
2415
- "learning_rate": 3.451724384250091e-06,
2416
- "loss": 0.7792,
2417
- "step": 1715
2418
- },
2419
- {
2420
- "epoch": 1.82,
2421
- "grad_norm": 0.9779053888998924,
2422
- "learning_rate": 3.4251629344786675e-06,
2423
- "loss": 0.7591,
2424
- "step": 1720
2425
- },
2426
- {
2427
- "epoch": 1.82,
2428
- "grad_norm": 1.0116163318504925,
2429
- "learning_rate": 3.398650730685813e-06,
2430
- "loss": 0.7556,
2431
- "step": 1725
2432
- },
2433
- {
2434
- "epoch": 1.83,
2435
- "grad_norm": 1.0511489470052602,
2436
- "learning_rate": 3.372188601922006e-06,
2437
- "loss": 0.7637,
2438
- "step": 1730
2439
- },
2440
- {
2441
- "epoch": 1.83,
2442
- "grad_norm": 1.0172930500825146,
2443
- "learning_rate": 3.3457773756718513e-06,
2444
- "loss": 0.7696,
2445
- "step": 1735
2446
- },
2447
- {
2448
- "epoch": 1.84,
2449
- "grad_norm": 1.039493994412079,
2450
- "learning_rate": 3.3194178778282046e-06,
2451
- "loss": 0.7931,
2452
- "step": 1740
2453
- },
2454
- {
2455
- "epoch": 1.84,
2456
- "grad_norm": 1.033662637919394,
2457
- "learning_rate": 3.293110932666349e-06,
2458
- "loss": 0.7692,
2459
- "step": 1745
2460
- },
2461
- {
2462
- "epoch": 1.85,
2463
- "grad_norm": 1.0584694868797393,
2464
- "learning_rate": 3.2668573628182145e-06,
2465
- "loss": 0.7792,
2466
- "step": 1750
2467
- },
2468
- {
2469
- "epoch": 1.85,
2470
- "grad_norm": 0.994626270021195,
2471
- "learning_rate": 3.2406579892466582e-06,
2472
- "loss": 0.7682,
2473
- "step": 1755
2474
- },
2475
- {
2476
- "epoch": 1.86,
2477
- "grad_norm": 0.9270237802993908,
2478
- "learning_rate": 3.2145136312197943e-06,
2479
- "loss": 0.7552,
2480
- "step": 1760
2481
- },
2482
- {
2483
- "epoch": 1.87,
2484
- "grad_norm": 2.0595234604236357,
2485
- "learning_rate": 3.18842510628537e-06,
2486
- "loss": 0.7749,
2487
- "step": 1765
2488
- },
2489
- {
2490
- "epoch": 1.87,
2491
- "grad_norm": 1.0396319816767299,
2492
- "learning_rate": 3.162393230245203e-06,
2493
- "loss": 0.804,
2494
- "step": 1770
2495
- },
2496
- {
2497
- "epoch": 1.88,
2498
- "grad_norm": 1.0214462086054552,
2499
- "learning_rate": 3.1364188171296677e-06,
2500
- "loss": 0.7744,
2501
- "step": 1775
2502
- },
2503
- {
2504
- "epoch": 1.88,
2505
- "grad_norm": 1.0145502545771508,
2506
- "learning_rate": 3.110502679172246e-06,
2507
- "loss": 0.7824,
2508
- "step": 1780
2509
- },
2510
- {
2511
- "epoch": 1.89,
2512
- "grad_norm": 1.0196641711891408,
2513
- "learning_rate": 3.084645626784124e-06,
2514
- "loss": 0.7745,
2515
- "step": 1785
2516
- },
2517
- {
2518
- "epoch": 1.89,
2519
- "grad_norm": 1.0197064636159427,
2520
- "learning_rate": 3.058848468528852e-06,
2521
- "loss": 0.8031,
2522
- "step": 1790
2523
- },
2524
- {
2525
- "epoch": 1.9,
2526
- "grad_norm": 0.9907125667454302,
2527
- "learning_rate": 3.03311201109706e-06,
2528
- "loss": 0.7919,
2529
- "step": 1795
2530
- },
2531
- {
2532
- "epoch": 1.9,
2533
- "grad_norm": 1.017942513059757,
2534
- "learning_rate": 3.0074370592812286e-06,
2535
- "loss": 0.7907,
2536
- "step": 1800
2537
- },
2538
- {
2539
- "epoch": 1.91,
2540
- "grad_norm": 1.0821499695866912,
2541
- "learning_rate": 2.9818244159505265e-06,
2542
- "loss": 0.7901,
2543
- "step": 1805
2544
- },
2545
- {
2546
- "epoch": 1.91,
2547
- "grad_norm": 0.9934394662674368,
2548
- "learning_rate": 2.956274882025706e-06,
2549
- "loss": 0.7638,
2550
- "step": 1810
2551
- },
2552
- {
2553
- "epoch": 1.92,
2554
- "grad_norm": 1.0313411208961847,
2555
- "learning_rate": 2.930789256454052e-06,
2556
- "loss": 0.7553,
2557
- "step": 1815
2558
- },
2559
- {
2560
- "epoch": 1.92,
2561
- "grad_norm": 0.9950833531614097,
2562
- "learning_rate": 2.905368336184406e-06,
2563
- "loss": 0.7576,
2564
- "step": 1820
2565
- },
2566
- {
2567
- "epoch": 1.93,
2568
- "grad_norm": 0.9936896686220547,
2569
- "learning_rate": 2.8800129161422365e-06,
2570
- "loss": 0.7671,
2571
- "step": 1825
2572
- },
2573
- {
2574
- "epoch": 1.93,
2575
- "grad_norm": 0.9909860465997411,
2576
- "learning_rate": 2.8547237892047852e-06,
2577
- "loss": 0.74,
2578
- "step": 1830
2579
- },
2580
- {
2581
- "epoch": 1.94,
2582
- "grad_norm": 0.9788752840880554,
2583
- "learning_rate": 2.8295017461762806e-06,
2584
- "loss": 0.767,
2585
- "step": 1835
2586
- },
2587
- {
2588
- "epoch": 1.94,
2589
- "grad_norm": 0.9764110020200104,
2590
- "learning_rate": 2.804347575763193e-06,
2591
- "loss": 0.7668,
2592
- "step": 1840
2593
- },
2594
- {
2595
- "epoch": 1.95,
2596
- "grad_norm": 0.9772254707929505,
2597
- "learning_rate": 2.7792620645495917e-06,
2598
- "loss": 0.7425,
2599
- "step": 1845
2600
- },
2601
- {
2602
- "epoch": 1.96,
2603
- "grad_norm": 1.0000854462976456,
2604
- "learning_rate": 2.7542459969725215e-06,
2605
- "loss": 0.7466,
2606
- "step": 1850
2607
- },
2608
- {
2609
- "epoch": 1.96,
2610
- "grad_norm": 1.0352323998365711,
2611
- "learning_rate": 2.729300155297504e-06,
2612
- "loss": 0.771,
2613
- "step": 1855
2614
- },
2615
- {
2616
- "epoch": 1.97,
2617
- "grad_norm": 0.9811051893834364,
2618
- "learning_rate": 2.704425319594049e-06,
2619
- "loss": 0.7778,
2620
- "step": 1860
2621
- },
2622
- {
2623
- "epoch": 1.97,
2624
- "grad_norm": 1.0284677234046133,
2625
- "learning_rate": 2.6796222677112825e-06,
2626
- "loss": 0.7796,
2627
- "step": 1865
2628
- },
2629
- {
2630
- "epoch": 1.98,
2631
- "grad_norm": 0.9664217044137716,
2632
- "learning_rate": 2.6548917752535997e-06,
2633
- "loss": 0.771,
2634
- "step": 1870
2635
- },
2636
- {
2637
- "epoch": 1.98,
2638
- "grad_norm": 1.0008524753186703,
2639
- "learning_rate": 2.6302346155564385e-06,
2640
- "loss": 0.7963,
2641
- "step": 1875
2642
- },
2643
- {
2644
- "epoch": 1.99,
2645
- "grad_norm": 1.0088045948631796,
2646
- "learning_rate": 2.6056515596620715e-06,
2647
- "loss": 0.7571,
2648
- "step": 1880
2649
- },
2650
- {
2651
- "epoch": 1.99,
2652
- "grad_norm": 0.9727997698934588,
2653
- "learning_rate": 2.581143376295516e-06,
2654
- "loss": 0.7968,
2655
- "step": 1885
2656
- },
2657
- {
2658
- "epoch": 2.0,
2659
- "grad_norm": 0.9760428822299934,
2660
- "learning_rate": 2.556710831840481e-06,
2661
- "loss": 0.7829,
2662
- "step": 1890
2663
- },
2664
- {
2665
- "epoch": 2.0,
2666
- "grad_norm": 1.1893585643467264,
2667
- "learning_rate": 2.5323546903154074e-06,
2668
- "loss": 0.7363,
2669
- "step": 1895
2670
- },
2671
- {
2672
- "epoch": 2.01,
2673
- "grad_norm": 1.0408498899558132,
2674
- "learning_rate": 2.508075713349575e-06,
2675
- "loss": 0.683,
2676
- "step": 1900
2677
- },
2678
- {
2679
- "epoch": 2.01,
2680
- "grad_norm": 1.0852218097728863,
2681
- "learning_rate": 2.483874660159294e-06,
2682
- "loss": 0.6388,
2683
- "step": 1905
2684
- },
2685
- {
2686
- "epoch": 2.02,
2687
- "grad_norm": 1.0636193658435114,
2688
- "learning_rate": 2.45975228752415e-06,
2689
- "loss": 0.6785,
2690
- "step": 1910
2691
- },
2692
- {
2693
- "epoch": 2.02,
2694
- "grad_norm": 1.05164052954354,
2695
- "learning_rate": 2.435709349763354e-06,
2696
- "loss": 0.7024,
2697
- "step": 1915
2698
- },
2699
- {
2700
- "epoch": 2.03,
2701
- "grad_norm": 1.0744751292672923,
2702
- "learning_rate": 2.4117465987121357e-06,
2703
- "loss": 0.6714,
2704
- "step": 1920
2705
- },
2706
- {
2707
- "epoch": 2.03,
2708
- "grad_norm": 1.0221167769747221,
2709
- "learning_rate": 2.387864783698258e-06,
2710
- "loss": 0.6441,
2711
- "step": 1925
2712
- },
2713
- {
2714
- "epoch": 2.04,
2715
- "grad_norm": 1.0453109653021675,
2716
- "learning_rate": 2.3640646515185596e-06,
2717
- "loss": 0.6668,
2718
- "step": 1930
2719
- },
2720
- {
2721
- "epoch": 2.04,
2722
- "grad_norm": 1.0035196656143317,
2723
- "learning_rate": 2.3403469464156235e-06,
2724
- "loss": 0.6711,
2725
- "step": 1935
2726
- },
2727
- {
2728
- "epoch": 2.05,
2729
- "grad_norm": 1.0614923887712562,
2730
- "learning_rate": 2.31671241005449e-06,
2731
- "loss": 0.6801,
2732
- "step": 1940
2733
- },
2734
- {
2735
- "epoch": 2.06,
2736
- "grad_norm": 1.0457688195463548,
2737
- "learning_rate": 2.2931617814994704e-06,
2738
- "loss": 0.6676,
2739
- "step": 1945
2740
- },
2741
- {
2742
- "epoch": 2.06,
2743
- "grad_norm": 1.094973586743587,
2744
- "learning_rate": 2.269695797191032e-06,
2745
- "loss": 0.6467,
2746
- "step": 1950
2747
- },
2748
- {
2749
- "epoch": 2.07,
2750
- "grad_norm": 1.0312304548353073,
2751
- "learning_rate": 2.2463151909227804e-06,
2752
- "loss": 0.6626,
2753
- "step": 1955
2754
- },
2755
- {
2756
- "epoch": 2.07,
2757
- "grad_norm": 1.0435526510546405,
2758
- "learning_rate": 2.223020693818495e-06,
2759
- "loss": 0.6565,
2760
- "step": 1960
2761
- },
2762
- {
2763
- "epoch": 2.08,
2764
- "grad_norm": 1.0361388218534178,
2765
- "learning_rate": 2.1998130343092866e-06,
2766
- "loss": 0.655,
2767
- "step": 1965
2768
- },
2769
- {
2770
- "epoch": 2.08,
2771
- "grad_norm": 1.071971382261616,
2772
- "learning_rate": 2.176692938110801e-06,
2773
- "loss": 0.6628,
2774
- "step": 1970
2775
- },
2776
- {
2777
- "epoch": 2.09,
2778
- "grad_norm": 1.0449189624346316,
2779
- "learning_rate": 2.1536611282005374e-06,
2780
- "loss": 0.6742,
2781
- "step": 1975
2782
- },
2783
- {
2784
- "epoch": 2.09,
2785
- "grad_norm": 1.0076278447431801,
2786
- "learning_rate": 2.130718324795234e-06,
2787
- "loss": 0.6615,
2788
- "step": 1980
2789
- },
2790
- {
2791
- "epoch": 2.1,
2792
- "grad_norm": 1.044357139317297,
2793
- "learning_rate": 2.107865245328354e-06,
2794
- "loss": 0.6707,
2795
- "step": 1985
2796
- },
2797
- {
2798
- "epoch": 2.1,
2799
- "grad_norm": 1.0155250644507565,
2800
- "learning_rate": 2.0851026044276405e-06,
2801
- "loss": 0.6701,
2802
- "step": 1990
2803
- },
2804
- {
2805
- "epoch": 2.11,
2806
- "grad_norm": 1.012020172763002,
2807
- "learning_rate": 2.0624311138927795e-06,
2808
- "loss": 0.6531,
2809
- "step": 1995
2810
- },
2811
- {
2812
- "epoch": 2.11,
2813
- "grad_norm": 1.0209851165233697,
2814
- "learning_rate": 2.0398514826731326e-06,
2815
- "loss": 0.6685,
2816
- "step": 2000
2817
- },
2818
- {
2819
- "epoch": 2.12,
2820
- "grad_norm": 1.0147123852944229,
2821
- "learning_rate": 2.017364416845579e-06,
2822
- "loss": 0.6506,
2823
- "step": 2005
2824
- },
2825
- {
2826
- "epoch": 2.12,
2827
- "grad_norm": 1.06994559921509,
2828
- "learning_rate": 1.9949706195924235e-06,
2829
- "loss": 0.6743,
2830
- "step": 2010
2831
- },
2832
- {
2833
- "epoch": 2.13,
2834
- "grad_norm": 0.9930487524595831,
2835
- "learning_rate": 1.97267079117942e-06,
2836
- "loss": 0.6596,
2837
- "step": 2015
2838
- },
2839
- {
2840
- "epoch": 2.13,
2841
- "grad_norm": 1.0334858708046972,
2842
- "learning_rate": 1.950465628933863e-06,
2843
- "loss": 0.6679,
2844
- "step": 2020
2845
- },
2846
- {
2847
- "epoch": 2.14,
2848
- "grad_norm": 1.060064879245556,
2849
- "learning_rate": 1.9283558272227866e-06,
2850
- "loss": 0.6749,
2851
- "step": 2025
2852
- },
2853
- {
2854
- "epoch": 2.15,
2855
- "grad_norm": 1.0171368650427,
2856
- "learning_rate": 1.9063420774312509e-06,
2857
- "loss": 0.6703,
2858
- "step": 2030
2859
- },
2860
- {
2861
- "epoch": 2.15,
2862
- "grad_norm": 0.9646165360014197,
2863
- "learning_rate": 1.8844250679407272e-06,
2864
- "loss": 0.6878,
2865
- "step": 2035
2866
- },
2867
- {
2868
- "epoch": 2.16,
2869
- "grad_norm": 1.0209055430674492,
2870
- "learning_rate": 1.862605484107562e-06,
2871
- "loss": 0.7052,
2872
- "step": 2040
2873
- },
2874
- {
2875
- "epoch": 2.16,
2876
- "grad_norm": 1.0216869737250995,
2877
- "learning_rate": 1.840884008241549e-06,
2878
- "loss": 0.6778,
2879
- "step": 2045
2880
- },
2881
- {
2882
- "epoch": 2.17,
2883
- "grad_norm": 0.990030094537176,
2884
- "learning_rate": 1.819261319584602e-06,
2885
- "loss": 0.675,
2886
- "step": 2050
2887
- },
2888
- {
2889
- "epoch": 2.17,
2890
- "grad_norm": 0.9972968188321764,
2891
- "learning_rate": 1.7977380942895007e-06,
2892
- "loss": 0.6832,
2893
- "step": 2055
2894
- },
2895
- {
2896
- "epoch": 2.18,
2897
- "grad_norm": 1.002919858574642,
2898
- "learning_rate": 1.7763150053987532e-06,
2899
- "loss": 0.6669,
2900
- "step": 2060
2901
- },
2902
- {
2903
- "epoch": 2.18,
2904
- "grad_norm": 1.040641077805689,
2905
- "learning_rate": 1.7549927228235547e-06,
2906
- "loss": 0.6874,
2907
- "step": 2065
2908
- },
2909
- {
2910
- "epoch": 2.19,
2911
- "grad_norm": 1.0136593089712416,
2912
- "learning_rate": 1.7337719133228308e-06,
2913
- "loss": 0.6662,
2914
- "step": 2070
2915
- },
2916
- {
2917
- "epoch": 2.19,
2918
- "grad_norm": 1.0032381970613455,
2919
- "learning_rate": 1.7126532404823898e-06,
2920
- "loss": 0.657,
2921
- "step": 2075
2922
- },
2923
- {
2924
- "epoch": 2.2,
2925
- "grad_norm": 1.0107311218156156,
2926
- "learning_rate": 1.6916373646941774e-06,
2927
- "loss": 0.6706,
2928
- "step": 2080
2929
- },
2930
- {
2931
- "epoch": 2.2,
2932
- "grad_norm": 1.0313882769598175,
2933
- "learning_rate": 1.6707249431356188e-06,
2934
- "loss": 0.6803,
2935
- "step": 2085
2936
- },
2937
- {
2938
- "epoch": 2.21,
2939
- "grad_norm": 1.0013867402651844,
2940
- "learning_rate": 1.6499166297490716e-06,
2941
- "loss": 0.6896,
2942
- "step": 2090
2943
- },
2944
- {
2945
- "epoch": 2.21,
2946
- "grad_norm": 0.9974367112606389,
2947
- "learning_rate": 1.6292130752213747e-06,
2948
- "loss": 0.6773,
2949
- "step": 2095
2950
- },
2951
- {
2952
- "epoch": 2.22,
2953
- "grad_norm": 1.0457782650116,
2954
- "learning_rate": 1.6086149269635081e-06,
2955
- "loss": 0.668,
2956
- "step": 2100
2957
- },
2958
- {
2959
- "epoch": 2.22,
2960
- "grad_norm": 0.9930241935385495,
2961
- "learning_rate": 1.5881228290903367e-06,
2962
- "loss": 0.6508,
2963
- "step": 2105
2964
- },
2965
- {
2966
- "epoch": 2.23,
2967
- "grad_norm": 1.0059354322817335,
2968
- "learning_rate": 1.5677374224004793e-06,
2969
- "loss": 0.6529,
2970
- "step": 2110
2971
- },
2972
- {
2973
- "epoch": 2.24,
2974
- "grad_norm": 1.0338579100235163,
2975
- "learning_rate": 1.547459344356262e-06,
2976
- "loss": 0.6614,
2977
- "step": 2115
2978
- },
2979
- {
2980
- "epoch": 2.24,
2981
- "grad_norm": 1.0203126239591027,
2982
- "learning_rate": 1.5272892290637892e-06,
2983
- "loss": 0.6749,
2984
- "step": 2120
2985
- },
2986
- {
2987
- "epoch": 2.25,
2988
- "grad_norm": 0.983643586611109,
2989
- "learning_rate": 1.5072277072531127e-06,
2990
- "loss": 0.6517,
2991
- "step": 2125
2992
- },
2993
- {
2994
- "epoch": 2.25,
2995
- "grad_norm": 1.0203957676102433,
2996
- "learning_rate": 1.4872754062585126e-06,
2997
- "loss": 0.6716,
2998
- "step": 2130
2999
- },
3000
- {
3001
- "epoch": 2.26,
3002
- "grad_norm": 1.036201909144992,
3003
- "learning_rate": 1.4674329499988737e-06,
3004
- "loss": 0.6574,
3005
- "step": 2135
3006
- },
3007
- {
3008
- "epoch": 2.26,
3009
- "grad_norm": 1.0277085537623492,
3010
- "learning_rate": 1.4477009589581787e-06,
3011
- "loss": 0.6593,
3012
- "step": 2140
3013
- },
3014
- {
3015
- "epoch": 2.27,
3016
- "grad_norm": 0.9713425669443266,
3017
- "learning_rate": 1.4280800501661057e-06,
3018
- "loss": 0.6621,
3019
- "step": 2145
3020
- },
3021
- {
3022
- "epoch": 2.27,
3023
- "grad_norm": 1.028497947768737,
3024
- "learning_rate": 1.408570837178735e-06,
3025
- "loss": 0.6656,
3026
- "step": 2150
3027
- },
3028
- {
3029
- "epoch": 2.28,
3030
- "grad_norm": 1.0565632370972053,
3031
- "learning_rate": 1.3891739300593559e-06,
3032
- "loss": 0.6644,
3033
- "step": 2155
3034
- },
3035
- {
3036
- "epoch": 2.28,
3037
- "grad_norm": 1.0043346444991121,
3038
- "learning_rate": 1.369889935359402e-06,
3039
- "loss": 0.6539,
3040
- "step": 2160
3041
- },
3042
- {
3043
- "epoch": 2.29,
3044
- "grad_norm": 1.0294689299797029,
3045
- "learning_rate": 1.3507194560994657e-06,
3046
- "loss": 0.6666,
3047
- "step": 2165
3048
- },
3049
- {
3050
- "epoch": 2.29,
3051
- "grad_norm": 1.0123495429792864,
3052
- "learning_rate": 1.331663091750463e-06,
3053
- "loss": 0.6928,
3054
- "step": 2170
3055
- },
3056
- {
3057
- "epoch": 2.3,
3058
- "grad_norm": 0.9951164224382856,
3059
- "learning_rate": 1.312721438214869e-06,
3060
- "loss": 0.6501,
3061
- "step": 2175
3062
- },
3063
- {
3064
- "epoch": 2.3,
3065
- "grad_norm": 1.025832661356824,
3066
- "learning_rate": 1.293895087808098e-06,
3067
- "loss": 0.6658,
3068
- "step": 2180
3069
- },
3070
- {
3071
- "epoch": 2.31,
3072
- "grad_norm": 0.9888366700648139,
3073
- "learning_rate": 1.2751846292399705e-06,
3074
- "loss": 0.6592,
3075
- "step": 2185
3076
- },
3077
- {
3078
- "epoch": 2.31,
3079
- "grad_norm": 1.0208359350524125,
3080
- "learning_rate": 1.2565906475963102e-06,
3081
- "loss": 0.6483,
3082
- "step": 2190
3083
- },
3084
- {
3085
- "epoch": 2.32,
3086
- "grad_norm": 1.0568986951058392,
3087
- "learning_rate": 1.2381137243206455e-06,
3088
- "loss": 0.6557,
3089
- "step": 2195
3090
- },
3091
- {
3092
- "epoch": 2.32,
3093
- "grad_norm": 0.9849389521844061,
3094
- "learning_rate": 1.2197544371960317e-06,
3095
- "loss": 0.6488,
3096
- "step": 2200
3097
- },
3098
- {
3099
- "epoch": 2.33,
3100
- "grad_norm": 1.0466426799607875,
3101
- "learning_rate": 1.2015133603269753e-06,
3102
- "loss": 0.6596,
3103
- "step": 2205
3104
- },
3105
- {
3106
- "epoch": 2.34,
3107
- "grad_norm": 0.9985742048846067,
3108
- "learning_rate": 1.183391064121493e-06,
3109
- "loss": 0.6572,
3110
- "step": 2210
3111
- },
3112
- {
3113
- "epoch": 2.34,
3114
- "grad_norm": 0.9661312369342807,
3115
- "learning_rate": 1.1653881152732582e-06,
3116
- "loss": 0.6439,
3117
- "step": 2215
3118
- },
3119
- {
3120
- "epoch": 2.35,
3121
- "grad_norm": 1.0327058718249167,
3122
- "learning_rate": 1.1475050767439e-06,
3123
- "loss": 0.6811,
3124
- "step": 2220
3125
- },
3126
- {
3127
- "epoch": 2.35,
3128
- "grad_norm": 1.0365200638536969,
3129
- "learning_rate": 1.129742507745382e-06,
3130
- "loss": 0.6588,
3131
- "step": 2225
3132
- },
3133
- {
3134
- "epoch": 2.36,
3135
- "grad_norm": 0.9804079029045045,
3136
- "learning_rate": 1.1121009637225283e-06,
3137
- "loss": 0.6783,
3138
- "step": 2230
3139
- },
3140
- {
3141
- "epoch": 2.36,
3142
- "grad_norm": 1.0326866018136251,
3143
- "learning_rate": 1.0945809963356442e-06,
3144
- "loss": 0.6705,
3145
- "step": 2235
3146
- },
3147
- {
3148
- "epoch": 2.37,
3149
- "grad_norm": 1.0314679157662048,
3150
- "learning_rate": 1.0771831534432714e-06,
3151
- "loss": 0.6353,
3152
- "step": 2240
3153
- },
3154
- {
3155
- "epoch": 2.37,
3156
- "grad_norm": 0.9589889108924486,
3157
- "learning_rate": 1.0599079790850542e-06,
3158
- "loss": 0.655,
3159
- "step": 2245
3160
- },
3161
- {
3162
- "epoch": 2.38,
3163
- "grad_norm": 0.9894914192305704,
3164
- "learning_rate": 1.0427560134647308e-06,
3165
- "loss": 0.643,
3166
- "step": 2250
3167
- },
3168
- {
3169
- "epoch": 2.38,
3170
- "grad_norm": 1.0693419775513076,
3171
- "learning_rate": 1.0257277929332332e-06,
3172
- "loss": 0.6611,
3173
- "step": 2255
3174
- },
3175
- {
3176
- "epoch": 2.39,
3177
- "grad_norm": 0.9951590219864285,
3178
- "learning_rate": 1.0088238499719254e-06,
3179
- "loss": 0.6403,
3180
- "step": 2260
3181
- },
3182
- {
3183
- "epoch": 2.39,
3184
- "grad_norm": 1.0105626202971048,
3185
- "learning_rate": 9.920447131759392e-07,
3186
- "loss": 0.6707,
3187
- "step": 2265
3188
- },
3189
- {
3190
- "epoch": 2.4,
3191
- "grad_norm": 1.0186289750333066,
3192
- "learning_rate": 9.753909072376594e-07,
3193
- "loss": 0.6809,
3194
- "step": 2270
3195
- },
3196
- {
3197
- "epoch": 2.4,
3198
- "grad_norm": 1.0267980845318398,
3199
- "learning_rate": 9.58862952930304e-07,
3200
- "loss": 0.6642,
3201
- "step": 2275
3202
- },
3203
- {
3204
- "epoch": 2.41,
3205
- "grad_norm": 1.0314667402705489,
3206
- "learning_rate": 9.424613670916499e-07,
3207
- "loss": 0.6815,
3208
- "step": 2280
3209
- },
3210
- {
3211
- "epoch": 2.41,
3212
- "grad_norm": 0.9818510396592551,
3213
- "learning_rate": 9.261866626078625e-07,
3214
- "loss": 0.6579,
3215
- "step": 2285
3216
- },
3217
- {
3218
- "epoch": 2.42,
3219
- "grad_norm": 0.998040916561116,
3220
- "learning_rate": 9.100393483974612e-07,
3221
- "loss": 0.6815,
3222
- "step": 2290
3223
- },
3224
- {
3225
- "epoch": 2.43,
3226
- "grad_norm": 1.007529165875462,
3227
- "learning_rate": 8.940199293954033e-07,
3228
- "loss": 0.6609,
3229
- "step": 2295
3230
- },
3231
- {
3232
- "epoch": 2.43,
3233
- "grad_norm": 1.0489165413908048,
3234
- "learning_rate": 8.781289065373016e-07,
3235
- "loss": 0.6661,
3236
- "step": 2300
3237
- },
3238
- {
3239
- "epoch": 2.44,
3240
- "grad_norm": 1.0586483881635766,
3241
- "learning_rate": 8.623667767437483e-07,
3242
- "loss": 0.6494,
3243
- "step": 2305
3244
- },
3245
- {
3246
- "epoch": 2.44,
3247
- "grad_norm": 0.970861929985865,
3248
- "learning_rate": 8.467340329047874e-07,
3249
- "loss": 0.6403,
3250
- "step": 2310
3251
- },
3252
- {
3253
- "epoch": 2.45,
3254
- "grad_norm": 1.0315170437890622,
3255
- "learning_rate": 8.312311638644888e-07,
3256
- "loss": 0.6802,
3257
- "step": 2315
3258
- },
3259
- {
3260
- "epoch": 2.45,
3261
- "grad_norm": 1.018615901485097,
3262
- "learning_rate": 8.158586544056791e-07,
3263
- "loss": 0.6813,
3264
- "step": 2320
3265
- },
3266
- {
3267
- "epoch": 2.46,
3268
- "grad_norm": 0.9991739019084611,
3269
- "learning_rate": 8.00616985234764e-07,
3270
- "loss": 0.6757,
3271
- "step": 2325
3272
- },
3273
- {
3274
- "epoch": 2.46,
3275
- "grad_norm": 1.039226698329409,
3276
- "learning_rate": 7.855066329667121e-07,
3277
- "loss": 0.6421,
3278
- "step": 2330
3279
- },
3280
- {
3281
- "epoch": 2.47,
3282
- "grad_norm": 1.0505394427255816,
3283
- "learning_rate": 7.705280701101392e-07,
3284
- "loss": 0.6655,
3285
- "step": 2335
3286
- },
3287
- {
3288
- "epoch": 2.47,
3289
- "grad_norm": 0.9750027460632938,
3290
- "learning_rate": 7.556817650525383e-07,
3291
- "loss": 0.6526,
3292
- "step": 2340
3293
- },
3294
- {
3295
- "epoch": 2.48,
3296
- "grad_norm": 0.989246982143368,
3297
- "learning_rate": 7.409681820456315e-07,
3298
- "loss": 0.667,
3299
- "step": 2345
3300
- },
3301
- {
3302
- "epoch": 2.48,
3303
- "grad_norm": 0.9977414734019189,
3304
- "learning_rate": 7.263877811908553e-07,
3305
- "loss": 0.6647,
3306
- "step": 2350
3307
- },
3308
- {
3309
- "epoch": 2.49,
3310
- "grad_norm": 0.9875292562685886,
3311
- "learning_rate": 7.11941018424967e-07,
3312
- "loss": 0.667,
3313
- "step": 2355
3314
- },
3315
- {
3316
- "epoch": 2.49,
3317
- "grad_norm": 0.9932801930288735,
3318
- "learning_rate": 6.97628345505797e-07,
3319
- "loss": 0.6511,
3320
- "step": 2360
3321
- },
3322
- {
3323
- "epoch": 2.5,
3324
- "grad_norm": 1.0199295886729471,
3325
- "learning_rate": 6.83450209998106e-07,
3326
- "loss": 0.6556,
3327
- "step": 2365
3328
- },
3329
- {
3330
- "epoch": 2.5,
3331
- "grad_norm": 1.0279710885988984,
3332
- "learning_rate": 6.694070552596105e-07,
3333
- "loss": 0.6676,
3334
- "step": 2370
3335
- },
3336
- {
3337
- "epoch": 2.51,
3338
- "grad_norm": 1.0221845787587531,
3339
- "learning_rate": 6.554993204270993e-07,
3340
- "loss": 0.6512,
3341
- "step": 2375
3342
- },
3343
- {
3344
- "epoch": 2.52,
3345
- "grad_norm": 0.9597530531552908,
3346
- "learning_rate": 6.417274404027163e-07,
3347
- "loss": 0.6482,
3348
- "step": 2380
3349
- },
3350
- {
3351
- "epoch": 2.52,
3352
- "grad_norm": 1.0201542647464452,
3353
- "learning_rate": 6.280918458403506e-07,
3354
- "loss": 0.6623,
3355
- "step": 2385
3356
- },
3357
- {
3358
- "epoch": 2.53,
3359
- "grad_norm": 0.9818765108255797,
3360
- "learning_rate": 6.14592963132174e-07,
3361
- "loss": 0.6599,
3362
- "step": 2390
3363
- },
3364
- {
3365
- "epoch": 2.53,
3366
- "grad_norm": 1.0020031777534095,
3367
- "learning_rate": 6.012312143953075e-07,
3368
- "loss": 0.6818,
3369
- "step": 2395
3370
- },
3371
- {
3372
- "epoch": 2.54,
3373
- "grad_norm": 1.020601700800406,
3374
- "learning_rate": 5.880070174586228e-07,
3375
- "loss": 0.6794,
3376
- "step": 2400
3377
- },
3378
- {
3379
- "epoch": 2.54,
3380
- "grad_norm": 0.9781529112263975,
3381
- "learning_rate": 5.74920785849673e-07,
3382
- "loss": 0.6612,
3383
- "step": 2405
3384
- },
3385
- {
3386
- "epoch": 2.55,
3387
- "grad_norm": 1.020456830272749,
3388
- "learning_rate": 5.619729287817621e-07,
3389
- "loss": 0.6638,
3390
- "step": 2410
3391
- },
3392
- {
3393
- "epoch": 2.55,
3394
- "grad_norm": 1.0134058298180835,
3395
- "learning_rate": 5.49163851141154e-07,
3396
- "loss": 0.6468,
3397
- "step": 2415
3398
- },
3399
- {
3400
- "epoch": 2.56,
3401
- "grad_norm": 1.0051724307379968,
3402
- "learning_rate": 5.36493953474404e-07,
3403
- "loss": 0.6411,
3404
- "step": 2420
3405
- },
3406
- {
3407
- "epoch": 2.56,
3408
- "grad_norm": 0.9963926377815217,
3409
- "learning_rate": 5.239636319758356e-07,
3410
- "loss": 0.668,
3411
- "step": 2425
3412
- },
3413
- {
3414
- "epoch": 2.57,
3415
- "grad_norm": 0.9731428272925532,
3416
- "learning_rate": 5.115732784751576e-07,
3417
- "loss": 0.6444,
3418
- "step": 2430
3419
- },
3420
- {
3421
- "epoch": 2.57,
3422
- "grad_norm": 1.0185774017291327,
3423
- "learning_rate": 4.993232804252018e-07,
3424
- "loss": 0.6529,
3425
- "step": 2435
3426
- },
3427
- {
3428
- "epoch": 2.58,
3429
- "grad_norm": 1.00711656230006,
3430
- "learning_rate": 4.872140208898118e-07,
3431
- "loss": 0.6539,
3432
- "step": 2440
3433
- },
3434
- {
3435
- "epoch": 2.58,
3436
- "grad_norm": 1.0045164786035452,
3437
- "learning_rate": 4.7524587853186866e-07,
3438
- "loss": 0.6629,
3439
- "step": 2445
3440
- },
3441
- {
3442
- "epoch": 2.59,
3443
- "grad_norm": 0.9961645157673277,
3444
- "learning_rate": 4.634192276014399e-07,
3445
- "loss": 0.6738,
3446
- "step": 2450
3447
- },
3448
- {
3449
- "epoch": 2.59,
3450
- "grad_norm": 1.0214318273829783,
3451
- "learning_rate": 4.5173443792408625e-07,
3452
- "loss": 0.6552,
3453
- "step": 2455
3454
- },
3455
- {
3456
- "epoch": 2.6,
3457
- "grad_norm": 1.0163355618069994,
3458
- "learning_rate": 4.4019187488928914e-07,
3459
- "loss": 0.6638,
3460
- "step": 2460
3461
- },
3462
- {
3463
- "epoch": 2.61,
3464
- "grad_norm": 1.032574771687925,
3465
- "learning_rate": 4.2879189943903335e-07,
3466
- "loss": 0.6877,
3467
- "step": 2465
3468
- },
3469
- {
3470
- "epoch": 2.61,
3471
- "grad_norm": 0.9930486578442914,
3472
- "learning_rate": 4.1753486805651e-07,
3473
- "loss": 0.6832,
3474
- "step": 2470
3475
- },
3476
- {
3477
- "epoch": 2.62,
3478
- "grad_norm": 0.969259241462703,
3479
- "learning_rate": 4.064211327549794e-07,
3480
- "loss": 0.6738,
3481
- "step": 2475
3482
- },
3483
- {
3484
- "epoch": 2.62,
3485
- "grad_norm": 1.018380412495952,
3486
- "learning_rate": 3.95451041066755e-07,
3487
- "loss": 0.671,
3488
- "step": 2480
3489
- },
3490
- {
3491
- "epoch": 2.63,
3492
- "grad_norm": 0.9735720562840744,
3493
- "learning_rate": 3.8462493603234064e-07,
3494
- "loss": 0.6433,
3495
- "step": 2485
3496
- },
3497
- {
3498
- "epoch": 2.63,
3499
- "grad_norm": 1.023935871901339,
3500
- "learning_rate": 3.739431561897011e-07,
3501
- "loss": 0.6593,
3502
- "step": 2490
3503
- },
3504
- {
3505
- "epoch": 2.64,
3506
- "grad_norm": 0.9931869209408388,
3507
- "learning_rate": 3.634060355636798e-07,
3508
- "loss": 0.6647,
3509
- "step": 2495
3510
- },
3511
- {
3512
- "epoch": 2.64,
3513
- "grad_norm": 1.0007736035504975,
3514
- "learning_rate": 3.53013903655548e-07,
3515
- "loss": 0.6683,
3516
- "step": 2500
3517
- },
3518
- {
3519
- "epoch": 2.65,
3520
- "grad_norm": 0.9926593135266999,
3521
- "learning_rate": 3.427670854327042e-07,
3522
- "loss": 0.6668,
3523
- "step": 2505
3524
- },
3525
- {
3526
- "epoch": 2.65,
3527
- "grad_norm": 0.9870259704326787,
3528
- "learning_rate": 3.3266590131851296e-07,
3529
- "loss": 0.6583,
3530
- "step": 2510
3531
- },
3532
- {
3533
- "epoch": 2.66,
3534
- "grad_norm": 1.0298553599069395,
3535
- "learning_rate": 3.227106671822849e-07,
3536
- "loss": 0.6835,
3537
- "step": 2515
3538
- },
3539
- {
3540
- "epoch": 2.66,
3541
- "grad_norm": 0.9915918166378904,
3542
- "learning_rate": 3.1290169432939556e-07,
3543
- "loss": 0.6428,
3544
- "step": 2520
3545
- },
3546
- {
3547
- "epoch": 2.67,
3548
- "grad_norm": 1.060474012796049,
3549
- "learning_rate": 3.03239289491557e-07,
3550
- "loss": 0.6571,
3551
- "step": 2525
3552
- },
3553
- {
3554
- "epoch": 2.67,
3555
- "grad_norm": 1.0203183687136719,
3556
- "learning_rate": 2.937237548172206e-07,
3557
- "loss": 0.6511,
3558
- "step": 2530
3559
- },
3560
- {
3561
- "epoch": 2.68,
3562
- "grad_norm": 0.989507237700814,
3563
- "learning_rate": 2.8435538786213134e-07,
3564
- "loss": 0.6746,
3565
- "step": 2535
3566
- },
3567
- {
3568
- "epoch": 2.68,
3569
- "grad_norm": 0.9853274639882493,
3570
- "learning_rate": 2.7513448158002334e-07,
3571
- "loss": 0.6657,
3572
- "step": 2540
3573
- },
3574
- {
3575
- "epoch": 2.69,
3576
- "grad_norm": 0.9957797339050202,
3577
- "learning_rate": 2.66061324313458e-07,
3578
- "loss": 0.6496,
3579
- "step": 2545
3580
- },
3581
- {
3582
- "epoch": 2.69,
3583
- "grad_norm": 1.0073836211394178,
3584
- "learning_rate": 2.5713619978480653e-07,
3585
- "loss": 0.6596,
3586
- "step": 2550
3587
- },
3588
- {
3589
- "epoch": 2.7,
3590
- "grad_norm": 0.9798969178233458,
3591
- "learning_rate": 2.483593870873829e-07,
3592
- "loss": 0.654,
3593
- "step": 2555
3594
- },
3595
- {
3596
- "epoch": 2.71,
3597
- "grad_norm": 0.9936847658098146,
3598
- "learning_rate": 2.3973116067670665e-07,
3599
- "loss": 0.6457,
3600
- "step": 2560
3601
- },
3602
- {
3603
- "epoch": 2.71,
3604
- "grad_norm": 1.0224466038654803,
3605
- "learning_rate": 2.3125179036193214e-07,
3606
- "loss": 0.6572,
3607
- "step": 2565
3608
- },
3609
- {
3610
- "epoch": 2.72,
3611
- "grad_norm": 1.0378183041017084,
3612
- "learning_rate": 2.2292154129740117e-07,
3613
- "loss": 0.6554,
3614
- "step": 2570
3615
- },
3616
- {
3617
- "epoch": 2.72,
3618
- "grad_norm": 0.9787357607930246,
3619
- "learning_rate": 2.147406739743596e-07,
3620
- "loss": 0.6689,
3621
- "step": 2575
3622
- },
3623
- {
3624
- "epoch": 2.73,
3625
- "grad_norm": 1.003947207260689,
3626
- "learning_rate": 2.0670944421280646e-07,
3627
- "loss": 0.6458,
3628
- "step": 2580
3629
- },
3630
- {
3631
- "epoch": 2.73,
3632
- "grad_norm": 1.0063190015667964,
3633
- "learning_rate": 1.9882810315349554e-07,
3634
- "loss": 0.6648,
3635
- "step": 2585
3636
- },
3637
- {
3638
- "epoch": 2.74,
3639
- "grad_norm": 1.0148103533053272,
3640
- "learning_rate": 1.9109689725008317e-07,
3641
- "loss": 0.6738,
3642
- "step": 2590
3643
- },
3644
- {
3645
- "epoch": 2.74,
3646
- "grad_norm": 1.0122729219524842,
3647
- "learning_rate": 1.8351606826142176e-07,
3648
- "loss": 0.6796,
3649
- "step": 2595
3650
- },
3651
- {
3652
- "epoch": 2.75,
3653
- "grad_norm": 1.0170129872933447,
3654
- "learning_rate": 1.7608585324399684e-07,
3655
- "loss": 0.6798,
3656
- "step": 2600
3657
- },
3658
- {
3659
- "epoch": 2.75,
3660
- "grad_norm": 0.992464215850126,
3661
- "learning_rate": 1.688064845445192e-07,
3662
- "loss": 0.6695,
3663
- "step": 2605
3664
- },
3665
- {
3666
- "epoch": 2.76,
3667
- "grad_norm": 0.9778375876093532,
3668
- "learning_rate": 1.6167818979265282e-07,
3669
- "loss": 0.6563,
3670
- "step": 2610
3671
- },
3672
- {
3673
- "epoch": 2.76,
3674
- "grad_norm": 1.0165595693382412,
3675
- "learning_rate": 1.5470119189390342e-07,
3676
- "loss": 0.6709,
3677
- "step": 2615
3678
- },
3679
- {
3680
- "epoch": 2.77,
3681
- "grad_norm": 0.9846854115443192,
3682
- "learning_rate": 1.4787570902264293e-07,
3683
- "loss": 0.6468,
3684
- "step": 2620
3685
- },
3686
- {
3687
- "epoch": 2.77,
3688
- "grad_norm": 1.0226129803358943,
3689
- "learning_rate": 1.4120195461529097e-07,
3690
- "loss": 0.6699,
3691
- "step": 2625
3692
- },
3693
- {
3694
- "epoch": 2.78,
3695
- "grad_norm": 1.0082916511837874,
3696
- "learning_rate": 1.3468013736363694e-07,
3697
- "loss": 0.6516,
3698
- "step": 2630
3699
- },
3700
- {
3701
- "epoch": 2.78,
3702
- "grad_norm": 1.0086534086914538,
3703
- "learning_rate": 1.2831046120831692e-07,
3704
- "loss": 0.6483,
3705
- "step": 2635
3706
- },
3707
- {
3708
- "epoch": 2.79,
3709
- "grad_norm": 0.9957571698657345,
3710
- "learning_rate": 1.2209312533243535e-07,
3711
- "loss": 0.6632,
3712
- "step": 2640
3713
- },
3714
- {
3715
- "epoch": 2.8,
3716
- "grad_norm": 1.0298383480420663,
3717
- "learning_rate": 1.1602832415533616e-07,
3718
- "loss": 0.6645,
3719
- "step": 2645
3720
- },
3721
- {
3722
- "epoch": 2.8,
3723
- "grad_norm": 1.0188314052602203,
3724
- "learning_rate": 1.1011624732652437e-07,
3725
- "loss": 0.6752,
3726
- "step": 2650
3727
- },
3728
- {
3729
- "epoch": 2.81,
3730
- "grad_norm": 1.0019681746822835,
3731
- "learning_rate": 1.0435707971973297e-07,
3732
- "loss": 0.6573,
3733
- "step": 2655
3734
- },
3735
- {
3736
- "epoch": 2.81,
3737
- "grad_norm": 0.9926022445477827,
3738
- "learning_rate": 9.875100142714478e-08,
3739
- "loss": 0.6396,
3740
- "step": 2660
3741
- },
3742
- {
3743
- "epoch": 2.82,
3744
- "grad_norm": 0.9847567872289796,
3745
- "learning_rate": 9.329818775376088e-08,
3746
- "loss": 0.672,
3747
- "step": 2665
3748
- },
3749
- {
3750
- "epoch": 2.82,
3751
- "grad_norm": 1.0103069579844817,
3752
- "learning_rate": 8.79988092119144e-08,
3753
- "loss": 0.678,
3754
- "step": 2670
3755
- },
3756
- {
3757
- "epoch": 2.83,
3758
- "grad_norm": 1.0092463732513441,
3759
- "learning_rate": 8.285303151594537e-08,
3760
- "loss": 0.6837,
3761
- "step": 2675
3762
- },
3763
- {
3764
- "epoch": 2.83,
3765
- "grad_norm": 1.0032753352403014,
3766
- "learning_rate": 7.786101557701209e-08,
3767
- "loss": 0.6494,
3768
- "step": 2680
3769
- },
3770
- {
3771
- "epoch": 2.84,
3772
- "grad_norm": 1.0278927407365124,
3773
- "learning_rate": 7.302291749806345e-08,
3774
- "loss": 0.6597,
3775
- "step": 2685
3776
- },
3777
- {
3778
- "epoch": 2.84,
3779
- "grad_norm": 0.9985234255556347,
3780
- "learning_rate": 6.833888856895676e-08,
3781
- "loss": 0.6672,
3782
- "step": 2690
3783
- },
3784
- {
3785
- "epoch": 2.85,
3786
- "grad_norm": 1.0086435046290338,
3787
- "learning_rate": 6.380907526172597e-08,
3788
- "loss": 0.6768,
3789
- "step": 2695
3790
- },
3791
- {
3792
- "epoch": 2.85,
3793
- "grad_norm": 0.9639413787477988,
3794
- "learning_rate": 5.943361922600255e-08,
3795
- "loss": 0.6346,
3796
- "step": 2700
3797
- },
3798
- {
3799
- "epoch": 2.86,
3800
- "grad_norm": 0.9898392259409212,
3801
- "learning_rate": 5.521265728458347e-08,
3802
- "loss": 0.6655,
3803
- "step": 2705
3804
- },
3805
- {
3806
- "epoch": 2.86,
3807
- "grad_norm": 1.0000733408715612,
3808
- "learning_rate": 5.114632142915687e-08,
3809
- "loss": 0.638,
3810
- "step": 2710
3811
- },
3812
- {
3813
- "epoch": 2.87,
3814
- "grad_norm": 0.990452054352071,
3815
- "learning_rate": 4.723473881617147e-08,
3816
- "loss": 0.6583,
3817
- "step": 2715
3818
- },
3819
- {
3820
- "epoch": 2.87,
3821
- "grad_norm": 0.988717000145255,
3822
- "learning_rate": 4.347803176286025e-08,
3823
- "loss": 0.6708,
3824
- "step": 2720
3825
- },
3826
- {
3827
- "epoch": 2.88,
3828
- "grad_norm": 0.9868081897157113,
3829
- "learning_rate": 3.98763177434186e-08,
3830
- "loss": 0.6583,
3831
- "step": 2725
3832
- },
3833
- {
3834
- "epoch": 2.89,
3835
- "grad_norm": 1.001603936622736,
3836
- "learning_rate": 3.642970938532553e-08,
3837
- "loss": 0.6754,
3838
- "step": 2730
3839
- },
3840
- {
3841
- "epoch": 2.89,
3842
- "grad_norm": 1.0028854813842756,
3843
- "learning_rate": 3.313831446582816e-08,
3844
- "loss": 0.6784,
3845
- "step": 2735
3846
- },
3847
- {
3848
- "epoch": 2.9,
3849
- "grad_norm": 0.9840591494137083,
3850
- "learning_rate": 3.000223590856666e-08,
3851
- "loss": 0.6651,
3852
- "step": 2740
3853
- },
3854
- {
3855
- "epoch": 2.9,
3856
- "grad_norm": 1.0425902900408417,
3857
- "learning_rate": 2.7021571780356804e-08,
3858
- "loss": 0.6489,
3859
- "step": 2745
3860
- },
3861
- {
3862
- "epoch": 2.91,
3863
- "grad_norm": 1.0016271763738829,
3864
- "learning_rate": 2.419641528812522e-08,
3865
- "loss": 0.6501,
3866
- "step": 2750
3867
- },
3868
- {
3869
- "epoch": 2.91,
3870
- "grad_norm": 0.9875844742537229,
3871
- "learning_rate": 2.1526854775992255e-08,
3872
- "loss": 0.667,
3873
- "step": 2755
3874
- },
3875
- {
3876
- "epoch": 2.92,
3877
- "grad_norm": 0.9909068409835267,
3878
- "learning_rate": 1.901297372251143e-08,
3879
- "loss": 0.6649,
3880
- "step": 2760
3881
- },
3882
- {
3883
- "epoch": 2.92,
3884
- "grad_norm": 1.0200770120528766,
3885
- "learning_rate": 1.665485073805817e-08,
3886
- "loss": 0.6542,
3887
- "step": 2765
3888
- },
3889
- {
3890
- "epoch": 2.93,
3891
- "grad_norm": 0.9699214260408161,
3892
- "learning_rate": 1.4452559562370683e-08,
3893
- "loss": 0.6644,
3894
- "step": 2770
3895
- },
3896
- {
3897
- "epoch": 2.93,
3898
- "grad_norm": 0.9962874170809767,
3899
- "learning_rate": 1.2406169062246232e-08,
3900
- "loss": 0.6502,
3901
- "step": 2775
3902
- },
3903
- {
3904
- "epoch": 2.94,
3905
- "grad_norm": 1.0264867036759864,
3906
- "learning_rate": 1.0515743229385645e-08,
3907
- "loss": 0.6698,
3908
- "step": 2780
3909
- },
3910
- {
3911
- "epoch": 2.94,
3912
- "grad_norm": 1.0133222133442825,
3913
- "learning_rate": 8.781341178393244e-09,
3914
- "loss": 0.6723,
3915
- "step": 2785
3916
- },
3917
- {
3918
- "epoch": 2.95,
3919
- "grad_norm": 1.0159129157737807,
3920
- "learning_rate": 7.203017144927771e-09,
3921
- "loss": 0.6561,
3922
- "step": 2790
3923
- },
3924
- {
3925
- "epoch": 2.95,
3926
- "grad_norm": 0.9931795490054022,
3927
- "learning_rate": 5.780820484007632e-09,
3928
- "loss": 0.6563,
3929
- "step": 2795
3930
- },
3931
- {
3932
- "epoch": 2.96,
3933
- "grad_norm": 1.0195254872888724,
3934
- "learning_rate": 4.514795668466576e-09,
3935
- "loss": 0.6808,
3936
- "step": 2800
3937
- },
3938
- {
3939
- "epoch": 2.96,
3940
- "grad_norm": 1.0210108366337896,
3941
- "learning_rate": 3.4049822875614757e-09,
3942
- "loss": 0.6723,
3943
- "step": 2805
3944
- },
3945
- {
3946
- "epoch": 2.97,
3947
- "grad_norm": 0.9891130306027911,
3948
- "learning_rate": 2.4514150457377594e-09,
3949
- "loss": 0.6763,
3950
- "step": 2810
3951
- },
3952
- {
3953
- "epoch": 2.97,
3954
- "grad_norm": 0.9876265686294937,
3955
- "learning_rate": 1.654123761541393e-09,
3956
- "loss": 0.6652,
3957
- "step": 2815
3958
- },
3959
- {
3960
- "epoch": 2.98,
3961
- "grad_norm": 0.9719073327336301,
3962
- "learning_rate": 1.0131333666885124e-09,
3963
- "loss": 0.6793,
3964
- "step": 2820
3965
- },
3966
- {
3967
- "epoch": 2.99,
3968
- "grad_norm": 1.004648101535836,
3969
- "learning_rate": 5.284639052832718e-10,
3970
- "loss": 0.6643,
3971
- "step": 2825
3972
- },
3973
- {
3974
- "epoch": 2.99,
3975
- "grad_norm": 1.0172517540637482,
3976
- "learning_rate": 2.0013053319334341e-10,
3977
- "loss": 0.6768,
3978
- "step": 2830
3979
- },
3980
- {
3981
- "epoch": 3.0,
3982
- "grad_norm": 0.9650966122076953,
3983
- "learning_rate": 2.814351757529643e-11,
3984
- "loss": 0.6356,
3985
- "step": 2835
3986
- },
3987
- {
3988
- "epoch": 3.0,
3989
- "step": 2838,
3990
- "total_flos": 1471706245890048.0,
3991
- "train_loss": 0.8058284866381398,
3992
- "train_runtime": 31310.8966,
3993
- "train_samples_per_second": 5.802,
3994
- "train_steps_per_second": 0.091
3995
- }
3996
- ],
3997
- "logging_steps": 5,
3998
- "max_steps": 2838,
3999
- "num_input_tokens_seen": 0,
4000
- "num_train_epochs": 3,
4001
- "save_steps": 500,
4002
- "total_flos": 1471706245890048.0,
4003
- "train_batch_size": 4,
4004
- "trial_name": null,
4005
- "trial_params": null
4006
- }