{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 14.899028305872413,
"eval_steps": 500,
"global_step": 2205,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03379805661174482,
"grad_norm": 1.3233964443206787,
"learning_rate": 4.5248868778280546e-07,
"loss": 3.0504,
"step": 5
},
{
"epoch": 0.06759611322348964,
"grad_norm": 1.3710697889328003,
"learning_rate": 9.049773755656109e-07,
"loss": 3.1346,
"step": 10
},
{
"epoch": 0.10139416983523447,
"grad_norm": 1.6499178409576416,
"learning_rate": 1.3574660633484164e-06,
"loss": 3.0897,
"step": 15
},
{
"epoch": 0.13519222644697929,
"grad_norm": 1.1821593046188354,
"learning_rate": 1.8099547511312218e-06,
"loss": 3.0632,
"step": 20
},
{
"epoch": 0.16899028305872413,
"grad_norm": 1.2189937829971313,
"learning_rate": 2.2624434389140273e-06,
"loss": 3.1182,
"step": 25
},
{
"epoch": 0.20278833967046894,
"grad_norm": 1.5012513399124146,
"learning_rate": 2.7149321266968327e-06,
"loss": 2.9984,
"step": 30
},
{
"epoch": 0.23658639628221378,
"grad_norm": 1.459802508354187,
"learning_rate": 3.167420814479638e-06,
"loss": 3.141,
"step": 35
},
{
"epoch": 0.27038445289395857,
"grad_norm": 1.2144436836242676,
"learning_rate": 3.6199095022624436e-06,
"loss": 2.8889,
"step": 40
},
{
"epoch": 0.3041825095057034,
"grad_norm": 1.1169312000274658,
"learning_rate": 4.072398190045249e-06,
"loss": 2.8316,
"step": 45
},
{
"epoch": 0.33798056611744826,
"grad_norm": 2.0920190811157227,
"learning_rate": 4.5248868778280546e-06,
"loss": 2.8712,
"step": 50
},
{
"epoch": 0.3717786227291931,
"grad_norm": 2.2176926136016846,
"learning_rate": 4.97737556561086e-06,
"loss": 2.7896,
"step": 55
},
{
"epoch": 0.4055766793409379,
"grad_norm": 3.200904369354248,
"learning_rate": 5.4298642533936655e-06,
"loss": 2.7888,
"step": 60
},
{
"epoch": 0.4393747359526827,
"grad_norm": 1.8033865690231323,
"learning_rate": 5.882352941176471e-06,
"loss": 2.4676,
"step": 65
},
{
"epoch": 0.47317279256442757,
"grad_norm": 1.055401086807251,
"learning_rate": 6.334841628959276e-06,
"loss": 2.4417,
"step": 70
},
{
"epoch": 0.5069708491761724,
"grad_norm": 1.2879326343536377,
"learning_rate": 6.787330316742083e-06,
"loss": 2.4093,
"step": 75
},
{
"epoch": 0.5407689057879171,
"grad_norm": 1.1458905935287476,
"learning_rate": 7.239819004524887e-06,
"loss": 2.3892,
"step": 80
},
{
"epoch": 0.574566962399662,
"grad_norm": 1.0109455585479736,
"learning_rate": 7.692307692307694e-06,
"loss": 2.2264,
"step": 85
},
{
"epoch": 0.6083650190114068,
"grad_norm": 1.0762925148010254,
"learning_rate": 8.144796380090498e-06,
"loss": 2.1744,
"step": 90
},
{
"epoch": 0.6421630756231517,
"grad_norm": 1.1063696146011353,
"learning_rate": 8.597285067873304e-06,
"loss": 2.1335,
"step": 95
},
{
"epoch": 0.6759611322348965,
"grad_norm": 1.0435007810592651,
"learning_rate": 9.049773755656109e-06,
"loss": 2.0953,
"step": 100
},
{
"epoch": 0.7097591888466414,
"grad_norm": 1.2886987924575806,
"learning_rate": 9.502262443438914e-06,
"loss": 2.0159,
"step": 105
},
{
"epoch": 0.7435572454583862,
"grad_norm": 1.2186506986618042,
"learning_rate": 9.95475113122172e-06,
"loss": 1.9618,
"step": 110
},
{
"epoch": 0.7773553020701309,
"grad_norm": 1.1026384830474854,
"learning_rate": 1.0407239819004526e-05,
"loss": 1.9636,
"step": 115
},
{
"epoch": 0.8111533586818758,
"grad_norm": 1.2348583936691284,
"learning_rate": 1.0859728506787331e-05,
"loss": 1.9061,
"step": 120
},
{
"epoch": 0.8449514152936206,
"grad_norm": 1.2891064882278442,
"learning_rate": 1.1312217194570137e-05,
"loss": 1.8485,
"step": 125
},
{
"epoch": 0.8787494719053655,
"grad_norm": 1.1830450296401978,
"learning_rate": 1.1764705882352942e-05,
"loss": 1.8878,
"step": 130
},
{
"epoch": 0.9125475285171103,
"grad_norm": 1.3732470273971558,
"learning_rate": 1.2217194570135748e-05,
"loss": 1.8483,
"step": 135
},
{
"epoch": 0.9463455851288551,
"grad_norm": 1.3864206075668335,
"learning_rate": 1.2669683257918553e-05,
"loss": 1.6419,
"step": 140
},
{
"epoch": 0.9801436417405999,
"grad_norm": 1.529963493347168,
"learning_rate": 1.3122171945701359e-05,
"loss": 1.6966,
"step": 145
},
{
"epoch": 1.0135192226446978,
"grad_norm": 1.531701683998108,
"learning_rate": 1.3574660633484165e-05,
"loss": 1.7206,
"step": 150
},
{
"epoch": 1.0473172792564427,
"grad_norm": 1.2663397789001465,
"learning_rate": 1.4027149321266968e-05,
"loss": 1.5889,
"step": 155
},
{
"epoch": 1.0811153358681875,
"grad_norm": 1.1123305559158325,
"learning_rate": 1.4479638009049775e-05,
"loss": 1.5019,
"step": 160
},
{
"epoch": 1.1149133924799324,
"grad_norm": 1.5772109031677246,
"learning_rate": 1.4932126696832581e-05,
"loss": 1.5866,
"step": 165
},
{
"epoch": 1.1487114490916772,
"grad_norm": 1.6601964235305786,
"learning_rate": 1.5384615384615387e-05,
"loss": 1.4743,
"step": 170
},
{
"epoch": 1.182509505703422,
"grad_norm": 1.7369259595870972,
"learning_rate": 1.5837104072398192e-05,
"loss": 1.4523,
"step": 175
},
{
"epoch": 1.216307562315167,
"grad_norm": 1.5128813982009888,
"learning_rate": 1.6289592760180996e-05,
"loss": 1.5253,
"step": 180
},
{
"epoch": 1.2501056189269117,
"grad_norm": 1.5346794128417969,
"learning_rate": 1.6742081447963804e-05,
"loss": 1.5044,
"step": 185
},
{
"epoch": 1.2839036755386566,
"grad_norm": 1.4628633260726929,
"learning_rate": 1.719457013574661e-05,
"loss": 1.2012,
"step": 190
},
{
"epoch": 1.3177017321504014,
"grad_norm": 1.670690655708313,
"learning_rate": 1.7647058823529414e-05,
"loss": 1.3808,
"step": 195
},
{
"epoch": 1.3514997887621463,
"grad_norm": 1.7677953243255615,
"learning_rate": 1.8099547511312218e-05,
"loss": 1.2181,
"step": 200
},
{
"epoch": 1.385297845373891,
"grad_norm": 1.5957576036453247,
"learning_rate": 1.8552036199095026e-05,
"loss": 1.246,
"step": 205
},
{
"epoch": 1.419095901985636,
"grad_norm": 2.146883010864258,
"learning_rate": 1.9004524886877827e-05,
"loss": 1.247,
"step": 210
},
{
"epoch": 1.4528939585973806,
"grad_norm": 1.9484453201293945,
"learning_rate": 1.9457013574660635e-05,
"loss": 1.2372,
"step": 215
},
{
"epoch": 1.4866920152091254,
"grad_norm": 2.067981243133545,
"learning_rate": 1.990950226244344e-05,
"loss": 1.0892,
"step": 220
},
{
"epoch": 1.5204900718208703,
"grad_norm": 2.743396043777466,
"learning_rate": 1.9999799412001547e-05,
"loss": 1.0867,
"step": 225
},
{
"epoch": 1.554288128432615,
"grad_norm": 1.6774858236312866,
"learning_rate": 1.9998984537049476e-05,
"loss": 0.9716,
"step": 230
},
{
"epoch": 1.58808618504436,
"grad_norm": 2.1075453758239746,
"learning_rate": 1.9997542889433917e-05,
"loss": 1.0524,
"step": 235
},
{
"epoch": 1.6218842416561048,
"grad_norm": 2.1486105918884277,
"learning_rate": 1.9995474559522576e-05,
"loss": 0.996,
"step": 240
},
{
"epoch": 1.6556822982678496,
"grad_norm": 1.7104390859603882,
"learning_rate": 1.9992779676965884e-05,
"loss": 1.0356,
"step": 245
},
{
"epoch": 1.6894803548795945,
"grad_norm": 2.395637035369873,
"learning_rate": 1.9989458410688865e-05,
"loss": 1.0114,
"step": 250
},
{
"epoch": 1.7232784114913393,
"grad_norm": 2.096191644668579,
"learning_rate": 1.9985510968880555e-05,
"loss": 1.0029,
"step": 255
},
{
"epoch": 1.757076468103084,
"grad_norm": 3.1734085083007812,
"learning_rate": 1.9980937598980943e-05,
"loss": 0.9794,
"step": 260
},
{
"epoch": 1.790874524714829,
"grad_norm": 2.1142361164093018,
"learning_rate": 1.9975738587665455e-05,
"loss": 1.0681,
"step": 265
},
{
"epoch": 1.8246725813265736,
"grad_norm": 2.51534104347229,
"learning_rate": 1.996991426082701e-05,
"loss": 0.964,
"step": 270
},
{
"epoch": 1.8584706379383187,
"grad_norm": 2.2418782711029053,
"learning_rate": 1.9963464983555557e-05,
"loss": 0.9054,
"step": 275
},
{
"epoch": 1.8922686945500633,
"grad_norm": 2.073915958404541,
"learning_rate": 1.9956391160115224e-05,
"loss": 0.8698,
"step": 280
},
{
"epoch": 1.9260667511618081,
"grad_norm": 2.290095806121826,
"learning_rate": 1.994869323391895e-05,
"loss": 0.9558,
"step": 285
},
{
"epoch": 1.959864807773553,
"grad_norm": 2.205028533935547,
"learning_rate": 1.9940371687500713e-05,
"loss": 0.8184,
"step": 290
},
{
"epoch": 1.9936628643852978,
"grad_norm": 3.1502742767333984,
"learning_rate": 1.9931427042485252e-05,
"loss": 1.11,
"step": 295
},
{
"epoch": 2.0270384452893957,
"grad_norm": 2.171036958694458,
"learning_rate": 1.992185985955541e-05,
"loss": 0.7225,
"step": 300
},
{
"epoch": 2.0608365019011408,
"grad_norm": 2.4927501678466797,
"learning_rate": 1.991167073841695e-05,
"loss": 1.0667,
"step": 305
},
{
"epoch": 2.0946345585128854,
"grad_norm": 2.1560094356536865,
"learning_rate": 1.990086031776099e-05,
"loss": 0.7699,
"step": 310
},
{
"epoch": 2.1284326151246304,
"grad_norm": 2.2326228618621826,
"learning_rate": 1.9889429275223958e-05,
"loss": 0.8313,
"step": 315
},
{
"epoch": 2.162230671736375,
"grad_norm": 2.2837958335876465,
"learning_rate": 1.9877378327345115e-05,
"loss": 0.8124,
"step": 320
},
{
"epoch": 2.19602872834812,
"grad_norm": 2.033587694168091,
"learning_rate": 1.9864708229521637e-05,
"loss": 0.7758,
"step": 325
},
{
"epoch": 2.2298267849598647,
"grad_norm": 2.212913990020752,
"learning_rate": 1.9851419775961265e-05,
"loss": 0.6772,
"step": 330
},
{
"epoch": 2.26362484157161,
"grad_norm": 2.4761927127838135,
"learning_rate": 1.9837513799632536e-05,
"loss": 0.6488,
"step": 335
},
{
"epoch": 2.2974228981833544,
"grad_norm": 2.0889394283294678,
"learning_rate": 1.982299117221254e-05,
"loss": 0.6567,
"step": 340
},
{
"epoch": 2.331220954795099,
"grad_norm": 1.7731271982192993,
"learning_rate": 1.9807852804032306e-05,
"loss": 0.5775,
"step": 345
},
{
"epoch": 2.365019011406844,
"grad_norm": 2.9344234466552734,
"learning_rate": 1.979209964401973e-05,
"loss": 0.711,
"step": 350
},
{
"epoch": 2.3988170680185887,
"grad_norm": 2.7177066802978516,
"learning_rate": 1.9775732679640093e-05,
"loss": 0.5417,
"step": 355
},
{
"epoch": 2.432615124630334,
"grad_norm": 1.9662398099899292,
"learning_rate": 1.975875293683416e-05,
"loss": 0.7523,
"step": 360
},
{
"epoch": 2.4664131812420784,
"grad_norm": 2.3415379524230957,
"learning_rate": 1.9741161479953872e-05,
"loss": 0.5889,
"step": 365
},
{
"epoch": 2.5002112378538235,
"grad_norm": 2.181759834289551,
"learning_rate": 1.9722959411695636e-05,
"loss": 0.6306,
"step": 370
},
{
"epoch": 2.534009294465568,
"grad_norm": 2.549531936645508,
"learning_rate": 1.970414787303119e-05,
"loss": 0.7365,
"step": 375
},
{
"epoch": 2.567807351077313,
"grad_norm": 2.4441099166870117,
"learning_rate": 1.9684728043136093e-05,
"loss": 0.6466,
"step": 380
},
{
"epoch": 2.601605407689058,
"grad_norm": 2.6284027099609375,
"learning_rate": 1.966470113931582e-05,
"loss": 0.6229,
"step": 385
},
{
"epoch": 2.635403464300803,
"grad_norm": 2.808634042739868,
"learning_rate": 1.9644068416929417e-05,
"loss": 0.6366,
"step": 390
},
{
"epoch": 2.6692015209125475,
"grad_norm": 2.9602878093719482,
"learning_rate": 1.9622831169310864e-05,
"loss": 0.6766,
"step": 395
},
{
"epoch": 2.7029995775242925,
"grad_norm": 3.1014065742492676,
"learning_rate": 1.9600990727687964e-05,
"loss": 0.7399,
"step": 400
},
{
"epoch": 2.736797634136037,
"grad_norm": 1.8795526027679443,
"learning_rate": 1.9578548461098912e-05,
"loss": 0.635,
"step": 405
},
{
"epoch": 2.770595690747782,
"grad_norm": 2.7618279457092285,
"learning_rate": 1.9555505776306492e-05,
"loss": 0.6349,
"step": 410
},
{
"epoch": 2.804393747359527,
"grad_norm": 2.2114012241363525,
"learning_rate": 1.9531864117709855e-05,
"loss": 0.5364,
"step": 415
},
{
"epoch": 2.838191803971272,
"grad_norm": 2.5741758346557617,
"learning_rate": 1.950762496725403e-05,
"loss": 0.6202,
"step": 420
},
{
"epoch": 2.8719898605830165,
"grad_norm": 2.607922077178955,
"learning_rate": 1.948278984433699e-05,
"loss": 0.703,
"step": 425
},
{
"epoch": 2.905787917194761,
"grad_norm": 2.70082688331604,
"learning_rate": 1.945736030571443e-05,
"loss": 0.5793,
"step": 430
},
{
"epoch": 2.939585973806506,
"grad_norm": 2.543623447418213,
"learning_rate": 1.9431337945402186e-05,
"loss": 0.488,
"step": 435
},
{
"epoch": 2.973384030418251,
"grad_norm": 2.2461986541748047,
"learning_rate": 1.9404724394576305e-05,
"loss": 0.561,
"step": 440
},
{
"epoch": 3.006759611322349,
"grad_norm": 2.3420913219451904,
"learning_rate": 1.9377521321470806e-05,
"loss": 0.5422,
"step": 445
},
{
"epoch": 3.040557667934094,
"grad_norm": 2.485952854156494,
"learning_rate": 1.93497304312731e-05,
"loss": 0.4076,
"step": 450
},
{
"epoch": 3.0743557245458386,
"grad_norm": 2.928043842315674,
"learning_rate": 1.932135346601711e-05,
"loss": 0.4619,
"step": 455
},
{
"epoch": 3.1081537811575832,
"grad_norm": 3.641951084136963,
"learning_rate": 1.9292392204474075e-05,
"loss": 0.649,
"step": 460
},
{
"epoch": 3.1419518377693283,
"grad_norm": 2.47162127494812,
"learning_rate": 1.9262848462041046e-05,
"loss": 0.4297,
"step": 465
},
{
"epoch": 3.175749894381073,
"grad_norm": 2.943067789077759,
"learning_rate": 1.923272409062709e-05,
"loss": 0.5152,
"step": 470
},
{
"epoch": 3.209547950992818,
"grad_norm": 3.1264185905456543,
"learning_rate": 1.920202097853721e-05,
"loss": 0.5389,
"step": 475
},
{
"epoch": 3.2433460076045626,
"grad_norm": 2.739868402481079,
"learning_rate": 1.917074105035397e-05,
"loss": 0.5507,
"step": 480
},
{
"epoch": 3.2771440642163077,
"grad_norm": 2.510500907897949,
"learning_rate": 1.9138886266816868e-05,
"loss": 0.4332,
"step": 485
},
{
"epoch": 3.3109421208280523,
"grad_norm": 2.6104397773742676,
"learning_rate": 1.9106458624699425e-05,
"loss": 0.6674,
"step": 490
},
{
"epoch": 3.3447401774397973,
"grad_norm": 2.4239916801452637,
"learning_rate": 1.907346015668401e-05,
"loss": 0.4281,
"step": 495
},
{
"epoch": 3.378538234051542,
"grad_norm": 3.4318349361419678,
"learning_rate": 1.9039892931234434e-05,
"loss": 0.499,
"step": 500
},
{
"epoch": 3.412336290663287,
"grad_norm": 2.174170970916748,
"learning_rate": 1.9005759052466303e-05,
"loss": 0.415,
"step": 505
},
{
"epoch": 3.4461343472750317,
"grad_norm": 2.9727699756622314,
"learning_rate": 1.897106066001509e-05,
"loss": 0.5141,
"step": 510
},
{
"epoch": 3.4799324038867763,
"grad_norm": 3.2721505165100098,
"learning_rate": 1.8935799928902046e-05,
"loss": 0.5301,
"step": 515
},
{
"epoch": 3.5137304604985213,
"grad_norm": 2.574387311935425,
"learning_rate": 1.8899979069397858e-05,
"loss": 0.4762,
"step": 520
},
{
"epoch": 3.5475285171102664,
"grad_norm": 3.4737517833709717,
"learning_rate": 1.8863600326884085e-05,
"loss": 0.3734,
"step": 525
},
{
"epoch": 3.581326573722011,
"grad_norm": 3.07442045211792,
"learning_rate": 1.882666598171242e-05,
"loss": 0.6237,
"step": 530
},
{
"epoch": 3.6151246303337556,
"grad_norm": 2.256251573562622,
"learning_rate": 1.8789178349061755e-05,
"loss": 0.4447,
"step": 535
},
{
"epoch": 3.6489226869455007,
"grad_norm": 2.587085008621216,
"learning_rate": 1.8751139778793043e-05,
"loss": 0.4351,
"step": 540
},
{
"epoch": 3.6827207435572453,
"grad_norm": 2.929131507873535,
"learning_rate": 1.871255265530201e-05,
"loss": 0.4799,
"step": 545
},
{
"epoch": 3.7165188001689904,
"grad_norm": 2.4406521320343018,
"learning_rate": 1.8673419397369693e-05,
"loss": 0.3568,
"step": 550
},
{
"epoch": 3.750316856780735,
"grad_norm": 3.0470123291015625,
"learning_rate": 1.863374245801082e-05,
"loss": 0.535,
"step": 555
},
{
"epoch": 3.78411491339248,
"grad_norm": 1.8549753427505493,
"learning_rate": 1.8593524324320035e-05,
"loss": 0.3995,
"step": 560
},
{
"epoch": 3.8179129700042247,
"grad_norm": 3.1754539012908936,
"learning_rate": 1.855276751731602e-05,
"loss": 0.4495,
"step": 565
},
{
"epoch": 3.8517110266159698,
"grad_norm": 2.436633825302124,
"learning_rate": 1.8511474591783454e-05,
"loss": 0.4472,
"step": 570
},
{
"epoch": 3.8855090832277144,
"grad_norm": 2.15982985496521,
"learning_rate": 1.8469648136112867e-05,
"loss": 0.5069,
"step": 575
},
{
"epoch": 3.919307139839459,
"grad_norm": 2.9572179317474365,
"learning_rate": 1.8427290772138397e-05,
"loss": 0.4933,
"step": 580
},
{
"epoch": 3.953105196451204,
"grad_norm": 2.8051276206970215,
"learning_rate": 1.838440515497345e-05,
"loss": 0.39,
"step": 585
},
{
"epoch": 3.986903253062949,
"grad_norm": 3.173710823059082,
"learning_rate": 1.8340993972844252e-05,
"loss": 0.4061,
"step": 590
},
{
"epoch": 4.020278833967047,
"grad_norm": 2.855424404144287,
"learning_rate": 1.8297059946921357e-05,
"loss": 0.3861,
"step": 595
},
{
"epoch": 4.054076890578791,
"grad_norm": 2.7455055713653564,
"learning_rate": 1.8252605831149052e-05,
"loss": 0.3595,
"step": 600
},
{
"epoch": 4.087874947190537,
"grad_norm": 3.2334115505218506,
"learning_rate": 1.8207634412072765e-05,
"loss": 0.346,
"step": 605
},
{
"epoch": 4.1216730038022815,
"grad_norm": 2.810620069503784,
"learning_rate": 1.816214850866436e-05,
"loss": 0.4259,
"step": 610
},
{
"epoch": 4.155471060414026,
"grad_norm": 2.4240875244140625,
"learning_rate": 1.811615097214545e-05,
"loss": 0.4007,
"step": 615
},
{
"epoch": 4.189269117025771,
"grad_norm": 3.068871021270752,
"learning_rate": 1.8069644685808673e-05,
"loss": 0.2978,
"step": 620
},
{
"epoch": 4.223067173637516,
"grad_norm": 2.1824235916137695,
"learning_rate": 1.8022632564836948e-05,
"loss": 0.3693,
"step": 625
},
{
"epoch": 4.256865230249261,
"grad_norm": 3.2515642642974854,
"learning_rate": 1.797511755612075e-05,
"loss": 0.4717,
"step": 630
},
{
"epoch": 4.2906632868610055,
"grad_norm": 3.5332624912261963,
"learning_rate": 1.7927102638073384e-05,
"loss": 0.4488,
"step": 635
},
{
"epoch": 4.32446134347275,
"grad_norm": 2.8152706623077393,
"learning_rate": 1.7878590820444283e-05,
"loss": 0.3908,
"step": 640
},
{
"epoch": 4.358259400084495,
"grad_norm": 3.03226375579834,
"learning_rate": 1.7829585144130356e-05,
"loss": 0.3771,
"step": 645
},
{
"epoch": 4.39205745669624,
"grad_norm": 3.0809173583984375,
"learning_rate": 1.7780088680985365e-05,
"loss": 0.3708,
"step": 650
},
{
"epoch": 4.425855513307985,
"grad_norm": 3.259047269821167,
"learning_rate": 1.773010453362737e-05,
"loss": 0.4393,
"step": 655
},
{
"epoch": 4.4596535699197295,
"grad_norm": 2.542726993560791,
"learning_rate": 1.7679635835244256e-05,
"loss": 0.4462,
"step": 660
},
{
"epoch": 4.493451626531474,
"grad_norm": 2.5668067932128906,
"learning_rate": 1.762868574939732e-05,
"loss": 0.3585,
"step": 665
},
{
"epoch": 4.52724968314322,
"grad_norm": 2.9174365997314453,
"learning_rate": 1.7577257469822976e-05,
"loss": 0.3732,
"step": 670
},
{
"epoch": 4.561047739754964,
"grad_norm": 2.1858620643615723,
"learning_rate": 1.7525354220232558e-05,
"loss": 0.4202,
"step": 675
},
{
"epoch": 4.594845796366709,
"grad_norm": 3.092898368835449,
"learning_rate": 1.747297925411024e-05,
"loss": 0.4174,
"step": 680
},
{
"epoch": 4.6286438529784535,
"grad_norm": 2.1292641162872314,
"learning_rate": 1.742013585450911e-05,
"loss": 0.2891,
"step": 685
},
{
"epoch": 4.662441909590198,
"grad_norm": 3.4500226974487305,
"learning_rate": 1.736682733384536e-05,
"loss": 0.3446,
"step": 690
},
{
"epoch": 4.696239966201944,
"grad_norm": 2.490712881088257,
"learning_rate": 1.7313057033690662e-05,
"loss": 0.273,
"step": 695
},
{
"epoch": 4.730038022813688,
"grad_norm": 3.1903836727142334,
"learning_rate": 1.7258828324562705e-05,
"loss": 0.3976,
"step": 700
},
{
"epoch": 4.763836079425433,
"grad_norm": 2.6504249572753906,
"learning_rate": 1.7204144605713922e-05,
"loss": 0.351,
"step": 705
},
{
"epoch": 4.7976341360371775,
"grad_norm": 2.951176643371582,
"learning_rate": 1.7149009304918392e-05,
"loss": 0.3601,
"step": 710
},
{
"epoch": 4.831432192648923,
"grad_norm": 4.028046131134033,
"learning_rate": 1.7093425878257007e-05,
"loss": 0.4412,
"step": 715
},
{
"epoch": 4.865230249260668,
"grad_norm": 3.4209461212158203,
"learning_rate": 1.7037397809900807e-05,
"loss": 0.4239,
"step": 720
},
{
"epoch": 4.899028305872412,
"grad_norm": 2.396829605102539,
"learning_rate": 1.698092861189259e-05,
"loss": 0.3325,
"step": 725
},
{
"epoch": 4.932826362484157,
"grad_norm": 2.638688564300537,
"learning_rate": 1.6924021823926766e-05,
"loss": 0.3053,
"step": 730
},
{
"epoch": 4.966624419095902,
"grad_norm": 3.0459437370300293,
"learning_rate": 1.6866681013127466e-05,
"loss": 0.2785,
"step": 735
},
{
"epoch": 5.0,
"grad_norm": 4.051104545593262,
"learning_rate": 1.6808909773824952e-05,
"loss": 0.2148,
"step": 740
},
{
"epoch": 5.033798056611745,
"grad_norm": 2.424513816833496,
"learning_rate": 1.675071172733031e-05,
"loss": 0.3102,
"step": 745
},
{
"epoch": 5.067596113223489,
"grad_norm": 2.9347620010375977,
"learning_rate": 1.669209052170845e-05,
"loss": 0.2635,
"step": 750
},
{
"epoch": 5.101394169835235,
"grad_norm": 2.5299954414367676,
"learning_rate": 1.6633049831549424e-05,
"loss": 0.2556,
"step": 755
},
{
"epoch": 5.135192226446979,
"grad_norm": 3.3548402786254883,
"learning_rate": 1.657359335773812e-05,
"loss": 0.3626,
"step": 760
},
{
"epoch": 5.168990283058724,
"grad_norm": 3.0583834648132324,
"learning_rate": 1.6513724827222225e-05,
"loss": 0.3778,
"step": 765
},
{
"epoch": 5.202788339670469,
"grad_norm": 2.3884308338165283,
"learning_rate": 1.645344799277866e-05,
"loss": 0.3429,
"step": 770
},
{
"epoch": 5.236586396282214,
"grad_norm": 3.5502490997314453,
"learning_rate": 1.639276663277831e-05,
"loss": 0.3531,
"step": 775
},
{
"epoch": 5.270384452893959,
"grad_norm": 2.881547212600708,
"learning_rate": 1.6331684550949197e-05,
"loss": 0.2784,
"step": 780
},
{
"epoch": 5.304182509505703,
"grad_norm": 2.110593795776367,
"learning_rate": 1.627020557613803e-05,
"loss": 0.3011,
"step": 785
},
{
"epoch": 5.337980566117448,
"grad_norm": 3.2138075828552246,
"learning_rate": 1.6208333562070232e-05,
"loss": 0.3218,
"step": 790
},
{
"epoch": 5.3717786227291935,
"grad_norm": 2.4348948001861572,
"learning_rate": 1.614607238710833e-05,
"loss": 0.2419,
"step": 795
},
{
"epoch": 5.405576679340938,
"grad_norm": 3.6023876667022705,
"learning_rate": 1.6083425954008883e-05,
"loss": 0.3198,
"step": 800
},
{
"epoch": 5.439374735952683,
"grad_norm": 3.171356201171875,
"learning_rate": 1.602039818967783e-05,
"loss": 0.3377,
"step": 805
},
{
"epoch": 5.473172792564427,
"grad_norm": 2.926022529602051,
"learning_rate": 1.5956993044924334e-05,
"loss": 0.2398,
"step": 810
},
{
"epoch": 5.506970849176172,
"grad_norm": 2.8738198280334473,
"learning_rate": 1.589321449421313e-05,
"loss": 0.2829,
"step": 815
},
{
"epoch": 5.5407689057879175,
"grad_norm": 3.6972992420196533,
"learning_rate": 1.5829066535415402e-05,
"loss": 0.3569,
"step": 820
},
{
"epoch": 5.574566962399662,
"grad_norm": 3.0152523517608643,
"learning_rate": 1.576455318955816e-05,
"loss": 0.2925,
"step": 825
},
{
"epoch": 5.608365019011407,
"grad_norm": 2.8930368423461914,
"learning_rate": 1.569967850057222e-05,
"loss": 0.3363,
"step": 830
},
{
"epoch": 5.642163075623151,
"grad_norm": 3.1284563541412354,
"learning_rate": 1.5634446535038688e-05,
"loss": 0.3218,
"step": 835
},
{
"epoch": 5.675961132234897,
"grad_norm": 1.6916499137878418,
"learning_rate": 1.556886138193406e-05,
"loss": 0.2436,
"step": 840
},
{
"epoch": 5.7097591888466415,
"grad_norm": 3.7334420680999756,
"learning_rate": 1.5502927152373913e-05,
"loss": 0.2874,
"step": 845
},
{
"epoch": 5.743557245458386,
"grad_norm": 3.914621591567993,
"learning_rate": 1.5436647979355214e-05,
"loss": 0.2329,
"step": 850
},
{
"epoch": 5.777355302070131,
"grad_norm": 3.38970685005188,
"learning_rate": 1.5370028017497217e-05,
"loss": 0.3232,
"step": 855
},
{
"epoch": 5.811153358681876,
"grad_norm": 2.7700934410095215,
"learning_rate": 1.5303071442781083e-05,
"loss": 0.2951,
"step": 860
},
{
"epoch": 5.844951415293621,
"grad_norm": 3.382173538208008,
"learning_rate": 1.5235782452288068e-05,
"loss": 0.2719,
"step": 865
},
{
"epoch": 5.8787494719053655,
"grad_norm": 3.8175547122955322,
"learning_rate": 1.5168165263936472e-05,
"loss": 0.3171,
"step": 870
},
{
"epoch": 5.91254752851711,
"grad_norm": 3.3271560668945312,
"learning_rate": 1.5100224116217217e-05,
"loss": 0.2364,
"step": 875
},
{
"epoch": 5.946345585128855,
"grad_norm": 2.9731876850128174,
"learning_rate": 1.5031963267928185e-05,
"loss": 0.2103,
"step": 880
},
{
"epoch": 5.9801436417406,
"grad_norm": 3.461787700653076,
"learning_rate": 1.4963386997907242e-05,
"loss": 0.341,
"step": 885
},
{
"epoch": 6.013519222644698,
"grad_norm": 3.172473669052124,
"learning_rate": 1.4894499604764035e-05,
"loss": 0.2618,
"step": 890
},
{
"epoch": 6.0473172792564425,
"grad_norm": 2.9784677028656006,
"learning_rate": 1.4825305406610547e-05,
"loss": 0.2903,
"step": 895
},
{
"epoch": 6.081115335868188,
"grad_norm": 3.697354555130005,
"learning_rate": 1.4755808740790403e-05,
"loss": 0.2625,
"step": 900
},
{
"epoch": 6.114913392479933,
"grad_norm": 3.192431926727295,
"learning_rate": 1.4686013963607e-05,
"loss": 0.233,
"step": 905
},
{
"epoch": 6.148711449091677,
"grad_norm": 2.8318302631378174,
"learning_rate": 1.4615925450050448e-05,
"loss": 0.1387,
"step": 910
},
{
"epoch": 6.182509505703422,
"grad_norm": 3.418325901031494,
"learning_rate": 1.4545547593523308e-05,
"loss": 0.3177,
"step": 915
},
{
"epoch": 6.2163075623151665,
"grad_norm": 3.188663959503174,
"learning_rate": 1.4474884805565217e-05,
"loss": 0.2066,
"step": 920
},
{
"epoch": 6.250105618926912,
"grad_norm": 2.2658884525299072,
"learning_rate": 1.4403941515576344e-05,
"loss": 0.2959,
"step": 925
},
{
"epoch": 6.283903675538657,
"grad_norm": 2.798861265182495,
"learning_rate": 1.4332722170539748e-05,
"loss": 0.2784,
"step": 930
},
{
"epoch": 6.317701732150401,
"grad_norm": 3.2030510902404785,
"learning_rate": 1.4261231234742618e-05,
"loss": 0.224,
"step": 935
},
{
"epoch": 6.351499788762146,
"grad_norm": 3.1087892055511475,
"learning_rate": 1.4189473189496437e-05,
"loss": 0.271,
"step": 940
},
{
"epoch": 6.385297845373891,
"grad_norm": 3.4298338890075684,
"learning_rate": 1.4117452532856084e-05,
"loss": 0.1972,
"step": 945
},
{
"epoch": 6.419095901985636,
"grad_norm": 2.693760633468628,
"learning_rate": 1.4045173779337866e-05,
"loss": 0.3036,
"step": 950
},
{
"epoch": 6.452893958597381,
"grad_norm": 3.6742842197418213,
"learning_rate": 1.3972641459636548e-05,
"loss": 0.276,
"step": 955
},
{
"epoch": 6.486692015209125,
"grad_norm": 2.9099996089935303,
"learning_rate": 1.3899860120341338e-05,
"loss": 0.2841,
"step": 960
},
{
"epoch": 6.52049007182087,
"grad_norm": 2.4859213829040527,
"learning_rate": 1.3826834323650899e-05,
"loss": 0.2752,
"step": 965
},
{
"epoch": 6.554288128432615,
"grad_norm": 2.6533761024475098,
"learning_rate": 1.3753568647087372e-05,
"loss": 0.212,
"step": 970
},
{
"epoch": 6.58808618504436,
"grad_norm": 2.8711912631988525,
"learning_rate": 1.3680067683209438e-05,
"loss": 0.2039,
"step": 975
},
{
"epoch": 6.621884241656105,
"grad_norm": 3.615388870239258,
"learning_rate": 1.3606336039324439e-05,
"loss": 0.1882,
"step": 980
},
{
"epoch": 6.65568229826785,
"grad_norm": 2.813685655593872,
"learning_rate": 1.353237833719958e-05,
"loss": 0.2237,
"step": 985
},
{
"epoch": 6.689480354879595,
"grad_norm": 3.288862466812134,
"learning_rate": 1.3458199212772227e-05,
"loss": 0.2177,
"step": 990
},
{
"epoch": 6.723278411491339,
"grad_norm": 3.3833813667297363,
"learning_rate": 1.3383803315859281e-05,
"loss": 0.2406,
"step": 995
},
{
"epoch": 6.757076468103084,
"grad_norm": 3.7307562828063965,
"learning_rate": 1.3309195309865746e-05,
"loss": 0.1924,
"step": 1000
},
{
"epoch": 6.7908745247148286,
"grad_norm": 3.9301440715789795,
"learning_rate": 1.3234379871492381e-05,
"loss": 0.2912,
"step": 1005
},
{
"epoch": 6.824672581326574,
"grad_norm": 1.9294644594192505,
"learning_rate": 1.315936169044257e-05,
"loss": 0.2257,
"step": 1010
},
{
"epoch": 6.858470637938319,
"grad_norm": 3.4223814010620117,
"learning_rate": 1.3084145469128343e-05,
"loss": 0.2205,
"step": 1015
},
{
"epoch": 6.892268694550063,
"grad_norm": 3.395117998123169,
"learning_rate": 1.3008735922375607e-05,
"loss": 0.2059,
"step": 1020
},
{
"epoch": 6.926066751161808,
"grad_norm": 3.7277326583862305,
"learning_rate": 1.2933137777128607e-05,
"loss": 0.2599,
"step": 1025
},
{
"epoch": 6.9598648077735525,
"grad_norm": 2.926193952560425,
"learning_rate": 1.2857355772153637e-05,
"loss": 0.2058,
"step": 1030
},
{
"epoch": 6.993662864385298,
"grad_norm": 2.551806926727295,
"learning_rate": 1.2781394657741988e-05,
"loss": 0.3004,
"step": 1035
},
{
"epoch": 7.027038445289396,
"grad_norm": 3.521486759185791,
"learning_rate": 1.2705259195412168e-05,
"loss": 0.1499,
"step": 1040
},
{
"epoch": 7.06083650190114,
"grad_norm": 3.246941089630127,
"learning_rate": 1.2628954157611449e-05,
"loss": 0.2174,
"step": 1045
},
{
"epoch": 7.094634558512886,
"grad_norm": 2.2454280853271484,
"learning_rate": 1.255248432741672e-05,
"loss": 0.1209,
"step": 1050
},
{
"epoch": 7.1284326151246304,
"grad_norm": 2.4737725257873535,
"learning_rate": 1.2475854498234647e-05,
"loss": 0.1727,
"step": 1055
},
{
"epoch": 7.162230671736375,
"grad_norm": 2.819976329803467,
"learning_rate": 1.239906947350121e-05,
"loss": 0.2555,
"step": 1060
},
{
"epoch": 7.19602872834812,
"grad_norm": 2.772263765335083,
"learning_rate": 1.2322134066380622e-05,
"loss": 0.2112,
"step": 1065
},
{
"epoch": 7.229826784959865,
"grad_norm": 3.721599817276001,
"learning_rate": 1.22450530994636e-05,
"loss": 0.3326,
"step": 1070
},
{
"epoch": 7.26362484157161,
"grad_norm": 2.8285434246063232,
"learning_rate": 1.2167831404465078e-05,
"loss": 0.2237,
"step": 1075
},
{
"epoch": 7.297422898183354,
"grad_norm": 3.2905073165893555,
"learning_rate": 1.2090473821921343e-05,
"loss": 0.1998,
"step": 1080
},
{
"epoch": 7.331220954795099,
"grad_norm": 2.5703885555267334,
"learning_rate": 1.2012985200886602e-05,
"loss": 0.2402,
"step": 1085
},
{
"epoch": 7.365019011406844,
"grad_norm": 3.2286860942840576,
"learning_rate": 1.1935370398629033e-05,
"loss": 0.1771,
"step": 1090
},
{
"epoch": 7.398817068018589,
"grad_norm": 3.355846881866455,
"learning_rate": 1.185763428032631e-05,
"loss": 0.2184,
"step": 1095
},
{
"epoch": 7.432615124630334,
"grad_norm": 2.6862475872039795,
"learning_rate": 1.1779781718760641e-05,
"loss": 0.212,
"step": 1100
},
{
"epoch": 7.466413181242078,
"grad_norm": 3.8962576389312744,
"learning_rate": 1.1701817594013312e-05,
"loss": 0.214,
"step": 1105
},
{
"epoch": 7.500211237853823,
"grad_norm": 3.2958405017852783,
"learning_rate": 1.1623746793158803e-05,
"loss": 0.2378,
"step": 1110
},
{
"epoch": 7.5340092944655686,
"grad_norm": 2.6480026245117188,
"learning_rate": 1.1545574209958433e-05,
"loss": 0.1399,
"step": 1115
},
{
"epoch": 7.567807351077313,
"grad_norm": 3.944840669631958,
"learning_rate": 1.1467304744553618e-05,
"loss": 0.2823,
"step": 1120
},
{
"epoch": 7.601605407689058,
"grad_norm": 4.2091498374938965,
"learning_rate": 1.1388943303158692e-05,
"loss": 0.1703,
"step": 1125
},
{
"epoch": 7.635403464300802,
"grad_norm": 4.504730701446533,
"learning_rate": 1.1310494797753382e-05,
"loss": 0.1969,
"step": 1130
},
{
"epoch": 7.669201520912548,
"grad_norm": 3.6243932247161865,
"learning_rate": 1.1231964145774906e-05,
"loss": 0.2886,
"step": 1135
},
{
"epoch": 7.7029995775242925,
"grad_norm": 3.16015887260437,
"learning_rate": 1.1153356269809721e-05,
"loss": 0.1156,
"step": 1140
},
{
"epoch": 7.736797634136037,
"grad_norm": 3.0954883098602295,
"learning_rate": 1.1074676097284973e-05,
"loss": 0.1634,
"step": 1145
},
{
"epoch": 7.770595690747782,
"grad_norm": 3.1873254776000977,
"learning_rate": 1.0995928560159608e-05,
"loss": 0.2507,
"step": 1150
},
{
"epoch": 7.804393747359526,
"grad_norm": 3.6099650859832764,
"learning_rate": 1.0917118594615237e-05,
"loss": 0.2474,
"step": 1155
},
{
"epoch": 7.838191803971272,
"grad_norm": 3.4526472091674805,
"learning_rate": 1.0838251140746717e-05,
"loss": 0.1501,
"step": 1160
},
{
"epoch": 7.8719898605830165,
"grad_norm": 2.2834644317626953,
"learning_rate": 1.0759331142252463e-05,
"loss": 0.1648,
"step": 1165
},
{
"epoch": 7.905787917194761,
"grad_norm": 3.0223686695098877,
"learning_rate": 1.0680363546124599e-05,
"loss": 0.1598,
"step": 1170
},
{
"epoch": 7.939585973806506,
"grad_norm": 3.2281494140625,
"learning_rate": 1.060135330233883e-05,
"loss": 0.1681,
"step": 1175
},
{
"epoch": 7.973384030418251,
"grad_norm": 3.3291306495666504,
"learning_rate": 1.0522305363544172e-05,
"loss": 0.1202,
"step": 1180
},
{
"epoch": 8.00675961132235,
"grad_norm": 2.6950342655181885,
"learning_rate": 1.04432246847525e-05,
"loss": 0.2243,
"step": 1185
},
{
"epoch": 8.040557667934094,
"grad_norm": 3.4718968868255615,
"learning_rate": 1.0364116223027956e-05,
"loss": 0.1996,
"step": 1190
},
{
"epoch": 8.074355724545839,
"grad_norm": 3.3445370197296143,
"learning_rate": 1.0284984937176213e-05,
"loss": 0.2244,
"step": 1195
},
{
"epoch": 8.108153781157583,
"grad_norm": 2.8722851276397705,
"learning_rate": 1.0205835787433645e-05,
"loss": 0.099,
"step": 1200
},
{
"epoch": 8.141951837769328,
"grad_norm": 2.5152461528778076,
"learning_rate": 1.0126673735156402e-05,
"loss": 0.1599,
"step": 1205
},
{
"epoch": 8.175749894381074,
"grad_norm": 3.2663590908050537,
"learning_rate": 1.0047503742509405e-05,
"loss": 0.2148,
"step": 1210
},
{
"epoch": 8.209547950992818,
"grad_norm": 2.693246603012085,
"learning_rate": 9.968330772155312e-06,
"loss": 0.219,
"step": 1215
},
{
"epoch": 8.243346007604563,
"grad_norm": 3.533890962600708,
"learning_rate": 9.889159786943428e-06,
"loss": 0.1133,
"step": 1220
},
{
"epoch": 8.277144064216307,
"grad_norm": 2.7618963718414307,
"learning_rate": 9.809995749598633e-06,
"loss": 0.1692,
"step": 1225
},
{
"epoch": 8.310942120828052,
"grad_norm": 2.682603120803833,
"learning_rate": 9.730843622410282e-06,
"loss": 0.2291,
"step": 1230
},
{
"epoch": 8.344740177439798,
"grad_norm": 2.9029886722564697,
"learning_rate": 9.651708366921152e-06,
"loss": 0.165,
"step": 1235
},
{
"epoch": 8.378538234051542,
"grad_norm": 2.932377576828003,
"learning_rate": 9.572594943616457e-06,
"loss": 0.1651,
"step": 1240
},
{
"epoch": 8.412336290663287,
"grad_norm": 3.0703186988830566,
"learning_rate": 9.493508311612874e-06,
"loss": 0.1969,
"step": 1245
},
{
"epoch": 8.446134347275033,
"grad_norm": 2.8268532752990723,
"learning_rate": 9.414453428347715e-06,
"loss": 0.1747,
"step": 1250
},
{
"epoch": 8.479932403886776,
"grad_norm": 2.9563803672790527,
"learning_rate": 9.335435249268165e-06,
"loss": 0.1082,
"step": 1255
},
{
"epoch": 8.513730460498522,
"grad_norm": 3.163346767425537,
"learning_rate": 9.256458727520648e-06,
"loss": 0.1776,
"step": 1260
},
{
"epoch": 8.547528517110266,
"grad_norm": 3.5345945358276367,
"learning_rate": 9.177528813640362e-06,
"loss": 0.1194,
"step": 1265
},
{
"epoch": 8.581326573722011,
"grad_norm": 3.074373722076416,
"learning_rate": 9.098650455240959e-06,
"loss": 0.197,
"step": 1270
},
{
"epoch": 8.615124630333757,
"grad_norm": 3.080812454223633,
"learning_rate": 9.019828596704394e-06,
"loss": 0.1218,
"step": 1275
},
{
"epoch": 8.6489226869455,
"grad_norm": 3.2213311195373535,
"learning_rate": 8.941068178871021e-06,
"loss": 0.1822,
"step": 1280
},
{
"epoch": 8.682720743557246,
"grad_norm": 2.857954740524292,
"learning_rate": 8.862374138729854e-06,
"loss": 0.1687,
"step": 1285
},
{
"epoch": 8.71651880016899,
"grad_norm": 2.9493982791900635,
"learning_rate": 8.783751409109116e-06,
"loss": 0.1393,
"step": 1290
},
{
"epoch": 8.750316856780735,
"grad_norm": 1.754936695098877,
"learning_rate": 8.705204918367032e-06,
"loss": 0.1846,
"step": 1295
},
{
"epoch": 8.78411491339248,
"grad_norm": 4.011746406555176,
"learning_rate": 8.626739590082897e-06,
"loss": 0.1897,
"step": 1300
},
{
"epoch": 8.817912970004224,
"grad_norm": 3.003286361694336,
"learning_rate": 8.54836034274844e-06,
"loss": 0.1873,
"step": 1305
},
{
"epoch": 8.85171102661597,
"grad_norm": 3.0416910648345947,
"learning_rate": 8.47007208945953e-06,
"loss": 0.1263,
"step": 1310
},
{
"epoch": 8.885509083227713,
"grad_norm": 3.4020864963531494,
"learning_rate": 8.391879737608202e-06,
"loss": 0.1536,
"step": 1315
},
{
"epoch": 8.919307139839459,
"grad_norm": 2.8439645767211914,
"learning_rate": 8.313788188575032e-06,
"loss": 0.1835,
"step": 1320
},
{
"epoch": 8.953105196451205,
"grad_norm": 2.475952386856079,
"learning_rate": 8.23580233742192e-06,
"loss": 0.1275,
"step": 1325
},
{
"epoch": 8.986903253062948,
"grad_norm": 3.099142551422119,
"learning_rate": 8.15792707258522e-06,
"loss": 0.1355,
"step": 1330
},
{
"epoch": 9.020278833967048,
"grad_norm": 2.5242106914520264,
"learning_rate": 8.08016727556936e-06,
"loss": 0.1135,
"step": 1335
},
{
"epoch": 9.054076890578791,
"grad_norm": 2.4750607013702393,
"learning_rate": 8.002527820640809e-06,
"loss": 0.1477,
"step": 1340
},
{
"epoch": 9.087874947190537,
"grad_norm": 2.5990304946899414,
"learning_rate": 7.925013574522556e-06,
"loss": 0.1125,
"step": 1345
},
{
"epoch": 9.12167300380228,
"grad_norm": 2.2538115978240967,
"learning_rate": 7.847629396089054e-06,
"loss": 0.1967,
"step": 1350
},
{
"epoch": 9.155471060414026,
"grad_norm": 2.93662691116333,
"learning_rate": 7.770380136061643e-06,
"loss": 0.1963,
"step": 1355
},
{
"epoch": 9.189269117025772,
"grad_norm": 3.2367334365844727,
"learning_rate": 7.693270636704476e-06,
"loss": 0.0882,
"step": 1360
},
{
"epoch": 9.223067173637515,
"grad_norm": 2.297624349594116,
"learning_rate": 7.616305731521009e-06,
"loss": 0.1547,
"step": 1365
},
{
"epoch": 9.256865230249261,
"grad_norm": 3.3643083572387695,
"learning_rate": 7.539490244951013e-06,
"loss": 0.1491,
"step": 1370
},
{
"epoch": 9.290663286861005,
"grad_norm": 2.270787477493286,
"learning_rate": 7.462828992068144e-06,
"loss": 0.1255,
"step": 1375
},
{
"epoch": 9.32446134347275,
"grad_norm": 2.6333799362182617,
"learning_rate": 7.386326778278142e-06,
"loss": 0.1117,
"step": 1380
},
{
"epoch": 9.358259400084496,
"grad_norm": 2.613737106323242,
"learning_rate": 7.3099883990176025e-06,
"loss": 0.1612,
"step": 1385
},
{
"epoch": 9.39205745669624,
"grad_norm": 1.7052559852600098,
"learning_rate": 7.233818639453358e-06,
"loss": 0.1471,
"step": 1390
},
{
"epoch": 9.425855513307985,
"grad_norm": 3.2761054039001465,
"learning_rate": 7.15782227418257e-06,
"loss": 0.122,
"step": 1395
},
{
"epoch": 9.45965356991973,
"grad_norm": 2.652831792831421,
"learning_rate": 7.0820040669333975e-06,
"loss": 0.1438,
"step": 1400
},
{
"epoch": 9.493451626531474,
"grad_norm": 3.1051905155181885,
"learning_rate": 7.006368770266421e-06,
"loss": 0.1396,
"step": 1405
},
{
"epoch": 9.52724968314322,
"grad_norm": 2.8987197875976562,
"learning_rate": 6.930921125276715e-06,
"loss": 0.1714,
"step": 1410
},
{
"epoch": 9.561047739754963,
"grad_norm": 3.5985753536224365,
"learning_rate": 6.855665861296662e-06,
"loss": 0.1221,
"step": 1415
},
{
"epoch": 9.594845796366709,
"grad_norm": 3.5496666431427,
"learning_rate": 6.78060769559951e-06,
"loss": 0.1261,
"step": 1420
},
{
"epoch": 9.628643852978454,
"grad_norm": 2.57647442817688,
"learning_rate": 6.705751333103676e-06,
"loss": 0.132,
"step": 1425
},
{
"epoch": 9.662441909590198,
"grad_norm": 2.8501367568969727,
"learning_rate": 6.631101466077801e-06,
"loss": 0.1463,
"step": 1430
},
{
"epoch": 9.696239966201944,
"grad_norm": 2.449470043182373,
"learning_rate": 6.556662773846658e-06,
"loss": 0.1387,
"step": 1435
},
{
"epoch": 9.730038022813687,
"grad_norm": 3.8504765033721924,
"learning_rate": 6.48243992249781e-06,
"loss": 0.1906,
"step": 1440
},
{
"epoch": 9.763836079425433,
"grad_norm": 2.4857442378997803,
"learning_rate": 6.40843756458913e-06,
"loss": 0.1024,
"step": 1445
},
{
"epoch": 9.797634136037178,
"grad_norm": 3.8078644275665283,
"learning_rate": 6.3346603388571605e-06,
"loss": 0.1211,
"step": 1450
},
{
"epoch": 9.831432192648922,
"grad_norm": 2.8603129386901855,
"learning_rate": 6.261112869926348e-06,
"loss": 0.0645,
"step": 1455
},
{
"epoch": 9.865230249260668,
"grad_norm": 2.669579267501831,
"learning_rate": 6.187799768019134e-06,
"loss": 0.194,
"step": 1460
},
{
"epoch": 9.899028305872413,
"grad_norm": 2.163553237915039,
"learning_rate": 6.114725628666997e-06,
"loss": 0.1371,
"step": 1465
},
{
"epoch": 9.932826362484157,
"grad_norm": 3.211575984954834,
"learning_rate": 6.041895032422377e-06,
"loss": 0.1427,
"step": 1470
},
{
"epoch": 9.966624419095902,
"grad_norm": 3.1249096393585205,
"learning_rate": 5.969312544571529e-06,
"loss": 0.1482,
"step": 1475
},
{
"epoch": 10.0,
"grad_norm": 2.95405912399292,
"learning_rate": 5.8969827148483935e-06,
"loss": 0.1493,
"step": 1480
},
{
"epoch": 10.033798056611746,
"grad_norm": 2.1418049335479736,
"learning_rate": 5.824910077149372e-06,
"loss": 0.1223,
"step": 1485
},
{
"epoch": 10.06759611322349,
"grad_norm": 2.2330262660980225,
"learning_rate": 5.753099149249133e-06,
"loss": 0.1569,
"step": 1490
},
{
"epoch": 10.101394169835235,
"grad_norm": 2.517437696456909,
"learning_rate": 5.681554432517435e-06,
"loss": 0.0826,
"step": 1495
},
{
"epoch": 10.135192226446978,
"grad_norm": 2.317457675933838,
"learning_rate": 5.610280411636941e-06,
"loss": 0.1024,
"step": 1500
},
{
"epoch": 10.168990283058724,
"grad_norm": 3.2839527130126953,
"learning_rate": 5.539281554322126e-06,
"loss": 0.1484,
"step": 1505
},
{
"epoch": 10.20278833967047,
"grad_norm": 3.0793209075927734,
"learning_rate": 5.468562311039205e-06,
"loss": 0.1529,
"step": 1510
},
{
"epoch": 10.236586396282213,
"grad_norm": 2.524780035018921,
"learning_rate": 5.3981271147271786e-06,
"loss": 0.09,
"step": 1515
},
{
"epoch": 10.270384452893959,
"grad_norm": 2.0456202030181885,
"learning_rate": 5.327980380519942e-06,
"loss": 0.1159,
"step": 1520
},
{
"epoch": 10.304182509505704,
"grad_norm": 2.448542356491089,
"learning_rate": 5.25812650546955e-06,
"loss": 0.1431,
"step": 1525
},
{
"epoch": 10.337980566117448,
"grad_norm": 1.669090986251831,
"learning_rate": 5.188569868270566e-06,
"loss": 0.1234,
"step": 1530
},
{
"epoch": 10.371778622729193,
"grad_norm": 3.0153934955596924,
"learning_rate": 5.11931482898562e-06,
"loss": 0.1086,
"step": 1535
},
{
"epoch": 10.405576679340937,
"grad_norm": 3.3632757663726807,
"learning_rate": 5.050365728772084e-06,
"loss": 0.1114,
"step": 1540
},
{
"epoch": 10.439374735952683,
"grad_norm": 2.883791208267212,
"learning_rate": 4.981726889609952e-06,
"loss": 0.1465,
"step": 1545
},
{
"epoch": 10.473172792564428,
"grad_norm": 1.6629996299743652,
"learning_rate": 4.913402614030944e-06,
"loss": 0.0823,
"step": 1550
},
{
"epoch": 10.506970849176172,
"grad_norm": 2.789846658706665,
"learning_rate": 4.84539718484877e-06,
"loss": 0.133,
"step": 1555
},
{
"epoch": 10.540768905787917,
"grad_norm": 2.095916509628296,
"learning_rate": 4.77771486489071e-06,
"loss": 0.0988,
"step": 1560
},
{
"epoch": 10.574566962399661,
"grad_norm": 2.670482635498047,
"learning_rate": 4.710359896730379e-06,
"loss": 0.1166,
"step": 1565
},
{
"epoch": 10.608365019011407,
"grad_norm": 1.432079553604126,
"learning_rate": 4.643336502421783e-06,
"loss": 0.1624,
"step": 1570
},
{
"epoch": 10.642163075623152,
"grad_norm": 2.3370885848999023,
"learning_rate": 4.576648883234686e-06,
"loss": 0.1007,
"step": 1575
},
{
"epoch": 10.675961132234896,
"grad_norm": 3.077364921569824,
"learning_rate": 4.510301219391245e-06,
"loss": 0.095,
"step": 1580
},
{
"epoch": 10.709759188846641,
"grad_norm": 3.081515312194824,
"learning_rate": 4.444297669803981e-06,
"loss": 0.1086,
"step": 1585
},
{
"epoch": 10.743557245458387,
"grad_norm": 3.574352502822876,
"learning_rate": 4.378642371815078e-06,
"loss": 0.1501,
"step": 1590
},
{
"epoch": 10.77735530207013,
"grad_norm": 2.738147735595703,
"learning_rate": 4.313339440937055e-06,
"loss": 0.1719,
"step": 1595
},
{
"epoch": 10.811153358681876,
"grad_norm": 2.235377073287964,
"learning_rate": 4.248392970594774e-06,
"loss": 0.1176,
"step": 1600
},
{
"epoch": 10.84495141529362,
"grad_norm": 2.95943021774292,
"learning_rate": 4.18380703186886e-06,
"loss": 0.1334,
"step": 1605
},
{
"epoch": 10.878749471905365,
"grad_norm": 1.9108000993728638,
"learning_rate": 4.1195856732405094e-06,
"loss": 0.113,
"step": 1610
},
{
"epoch": 10.912547528517111,
"grad_norm": 2.856457233428955,
"learning_rate": 4.055732920337699e-06,
"loss": 0.1027,
"step": 1615
},
{
"epoch": 10.946345585128855,
"grad_norm": 2.5498857498168945,
"learning_rate": 3.992252775682877e-06,
"loss": 0.0869,
"step": 1620
},
{
"epoch": 10.9801436417406,
"grad_norm": 2.5696861743927,
"learning_rate": 3.929149218442052e-06,
"loss": 0.1553,
"step": 1625
},
{
"epoch": 11.013519222644698,
"grad_norm": 1.5783302783966064,
"learning_rate": 3.866426204175353e-06,
"loss": 0.1055,
"step": 1630
},
{
"epoch": 11.047317279256443,
"grad_norm": 2.1971595287323,
"learning_rate": 3.804087664589108e-06,
"loss": 0.1169,
"step": 1635
},
{
"epoch": 11.081115335868187,
"grad_norm": 2.1792209148406982,
"learning_rate": 3.742137507289363e-06,
"loss": 0.1408,
"step": 1640
},
{
"epoch": 11.114913392479933,
"grad_norm": 2.117349147796631,
"learning_rate": 3.680579615536961e-06,
"loss": 0.0973,
"step": 1645
},
{
"epoch": 11.148711449091678,
"grad_norm": 2.348695755004883,
"learning_rate": 3.6194178480041174e-06,
"loss": 0.0879,
"step": 1650
},
{
"epoch": 11.182509505703422,
"grad_norm": 2.529822826385498,
"learning_rate": 3.558656038532532e-06,
"loss": 0.1049,
"step": 1655
},
{
"epoch": 11.216307562315167,
"grad_norm": 1.6536489725112915,
"learning_rate": 3.4982979958930896e-06,
"loss": 0.0713,
"step": 1660
},
{
"epoch": 11.250105618926911,
"grad_norm": 3.6709718704223633,
"learning_rate": 3.4383475035471026e-06,
"loss": 0.0843,
"step": 1665
},
{
"epoch": 11.283903675538657,
"grad_norm": 2.0067543983459473,
"learning_rate": 3.378808319409149e-06,
"loss": 0.1148,
"step": 1670
},
{
"epoch": 11.317701732150402,
"grad_norm": 2.263753890991211,
"learning_rate": 3.319684175611517e-06,
"loss": 0.1042,
"step": 1675
},
{
"epoch": 11.351499788762146,
"grad_norm": 2.691466808319092,
"learning_rate": 3.2609787782702595e-06,
"loss": 0.0902,
"step": 1680
},
{
"epoch": 11.385297845373891,
"grad_norm": 2.7062034606933594,
"learning_rate": 3.2026958072528715e-06,
"loss": 0.0978,
"step": 1685
},
{
"epoch": 11.419095901985635,
"grad_norm": 2.082036256790161,
"learning_rate": 3.1448389159476433e-06,
"loss": 0.1192,
"step": 1690
},
{
"epoch": 11.45289395859738,
"grad_norm": 1.7839562892913818,
"learning_rate": 3.087411731034641e-06,
"loss": 0.1098,
"step": 1695
},
{
"epoch": 11.486692015209126,
"grad_norm": 2.078550100326538,
"learning_rate": 3.0304178522583626e-06,
"loss": 0.0822,
"step": 1700
},
{
"epoch": 11.52049007182087,
"grad_norm": 1.636839747428894,
"learning_rate": 2.973860852202117e-06,
"loss": 0.0987,
"step": 1705
},
{
"epoch": 11.554288128432615,
"grad_norm": 2.2059497833251953,
"learning_rate": 2.917744276064056e-06,
"loss": 0.1176,
"step": 1710
},
{
"epoch": 11.58808618504436,
"grad_norm": 2.563145637512207,
"learning_rate": 2.8620716414349714e-06,
"loss": 0.1158,
"step": 1715
},
{
"epoch": 11.621884241656105,
"grad_norm": 2.6411447525024414,
"learning_rate": 2.806846438077787e-06,
"loss": 0.1471,
"step": 1720
},
{
"epoch": 11.65568229826785,
"grad_norm": 1.4738620519638062,
"learning_rate": 2.7520721277088023e-06,
"loss": 0.1833,
"step": 1725
},
{
"epoch": 11.689480354879594,
"grad_norm": 1.8081343173980713,
"learning_rate": 2.697752143780713e-06,
"loss": 0.1188,
"step": 1730
},
{
"epoch": 11.72327841149134,
"grad_norm": 1.6308510303497314,
"learning_rate": 2.643889891267386e-06,
"loss": 0.0962,
"step": 1735
},
{
"epoch": 11.757076468103085,
"grad_norm": 2.0612504482269287,
"learning_rate": 2.5904887464504115e-06,
"loss": 0.0656,
"step": 1740
},
{
"epoch": 11.790874524714829,
"grad_norm": 2.3865137100219727,
"learning_rate": 2.537552056707483e-06,
"loss": 0.1124,
"step": 1745
},
{
"epoch": 11.824672581326574,
"grad_norm": 2.3273239135742188,
"learning_rate": 2.4850831403025597e-06,
"loss": 0.0682,
"step": 1750
},
{
"epoch": 11.858470637938318,
"grad_norm": 2.371812105178833,
"learning_rate": 2.433085286177872e-06,
"loss": 0.0906,
"step": 1755
},
{
"epoch": 11.892268694550063,
"grad_norm": 4.104214191436768,
"learning_rate": 2.381561753747753e-06,
"loss": 0.1273,
"step": 1760
},
{
"epoch": 11.926066751161809,
"grad_norm": 2.1697592735290527,
"learning_rate": 2.330515772694333e-06,
"loss": 0.1251,
"step": 1765
},
{
"epoch": 11.959864807773553,
"grad_norm": 3.299699068069458,
"learning_rate": 2.279950542765078e-06,
"loss": 0.0756,
"step": 1770
},
{
"epoch": 11.993662864385298,
"grad_norm": 3.481651544570923,
"learning_rate": 2.2298692335722403e-06,
"loss": 0.1518,
"step": 1775
},
{
"epoch": 12.027038445289396,
"grad_norm": 1.655312418937683,
"learning_rate": 2.1802749843941583e-06,
"loss": 0.084,
"step": 1780
},
{
"epoch": 12.060836501901141,
"grad_norm": 1.347791075706482,
"learning_rate": 2.1311709039784734e-06,
"loss": 0.0561,
"step": 1785
},
{
"epoch": 12.094634558512885,
"grad_norm": 1.9169631004333496,
"learning_rate": 2.0825600703472814e-06,
"loss": 0.1018,
"step": 1790
},
{
"epoch": 12.12843261512463,
"grad_norm": 1.5145456790924072,
"learning_rate": 2.0344455306041633e-06,
"loss": 0.1338,
"step": 1795
},
{
"epoch": 12.162230671736376,
"grad_norm": 1.8022133111953735,
"learning_rate": 1.98683030074321e-06,
"loss": 0.1331,
"step": 1800
},
{
"epoch": 12.19602872834812,
"grad_norm": 2.502906084060669,
"learning_rate": 1.939717365459952e-06,
"loss": 0.0758,
"step": 1805
},
{
"epoch": 12.229826784959865,
"grad_norm": 2.3254261016845703,
"learning_rate": 1.8931096779642644e-06,
"loss": 0.1571,
"step": 1810
},
{
"epoch": 12.263624841571609,
"grad_norm": 1.831810474395752,
"learning_rate": 1.847010159795265e-06,
"loss": 0.1052,
"step": 1815
},
{
"epoch": 12.297422898183354,
"grad_norm": 2.395282745361328,
"learning_rate": 1.8014217006381728e-06,
"loss": 0.057,
"step": 1820
},
{
"epoch": 12.3312209547951,
"grad_norm": 2.0812599658966064,
"learning_rate": 1.7563471581431623e-06,
"loss": 0.0743,
"step": 1825
},
{
"epoch": 12.365019011406844,
"grad_norm": 1.5092777013778687,
"learning_rate": 1.7117893577462541e-06,
"loss": 0.0733,
"step": 1830
},
{
"epoch": 12.39881706801859,
"grad_norm": 2.1698033809661865,
"learning_rate": 1.6677510924921958e-06,
"loss": 0.099,
"step": 1835
},
{
"epoch": 12.432615124630333,
"grad_norm": 2.5433080196380615,
"learning_rate": 1.6242351228593833e-06,
"loss": 0.0944,
"step": 1840
},
{
"epoch": 12.466413181242078,
"grad_norm": 2.1289961338043213,
"learning_rate": 1.5812441765868292e-06,
"loss": 0.0881,
"step": 1845
},
{
"epoch": 12.500211237853824,
"grad_norm": 1.7505559921264648,
"learning_rate": 1.5387809485031745e-06,
"loss": 0.065,
"step": 1850
},
{
"epoch": 12.534009294465568,
"grad_norm": 2.4067695140838623,
"learning_rate": 1.4968481003577628e-06,
"loss": 0.0476,
"step": 1855
},
{
"epoch": 12.567807351077313,
"grad_norm": 2.417130708694458,
"learning_rate": 1.4554482606538044e-06,
"loss": 0.1166,
"step": 1860
},
{
"epoch": 12.601605407689059,
"grad_norm": 1.8488808870315552,
"learning_rate": 1.4145840244835985e-06,
"loss": 0.1015,
"step": 1865
},
{
"epoch": 12.635403464300802,
"grad_norm": 2.4164412021636963,
"learning_rate": 1.3742579533658729e-06,
"loss": 0.0822,
"step": 1870
},
{
"epoch": 12.669201520912548,
"grad_norm": 2.083436965942383,
"learning_rate": 1.3344725750852183e-06,
"loss": 0.1192,
"step": 1875
},
{
"epoch": 12.702999577524292,
"grad_norm": 1.8657339811325073,
"learning_rate": 1.2952303835336256e-06,
"loss": 0.1488,
"step": 1880
},
{
"epoch": 12.736797634136037,
"grad_norm": 2.0696699619293213,
"learning_rate": 1.2565338385541792e-06,
"loss": 0.0752,
"step": 1885
},
{
"epoch": 12.770595690747783,
"grad_norm": 1.6756703853607178,
"learning_rate": 1.2183853657868504e-06,
"loss": 0.107,
"step": 1890
},
{
"epoch": 12.804393747359526,
"grad_norm": 2.409106731414795,
"learning_rate": 1.1807873565164507e-06,
"loss": 0.0669,
"step": 1895
},
{
"epoch": 12.838191803971272,
"grad_norm": 1.7135124206542969,
"learning_rate": 1.1437421675227457e-06,
"loss": 0.1809,
"step": 1900
},
{
"epoch": 12.871989860583017,
"grad_norm": 1.9362844228744507,
"learning_rate": 1.107252120932717e-06,
"loss": 0.1153,
"step": 1905
},
{
"epoch": 12.905787917194761,
"grad_norm": 1.499002456665039,
"learning_rate": 1.0713195040750012e-06,
"loss": 0.1103,
"step": 1910
},
{
"epoch": 12.939585973806507,
"grad_norm": 2.998647689819336,
"learning_rate": 1.035946569336519e-06,
"loss": 0.089,
"step": 1915
},
{
"epoch": 12.97338403041825,
"grad_norm": 2.5154922008514404,
"learning_rate": 1.0011355340212802e-06,
"loss": 0.1253,
"step": 1920
},
{
"epoch": 13.00675961132235,
"grad_norm": 2.583174467086792,
"learning_rate": 9.668885802114002e-07,
"loss": 0.0991,
"step": 1925
},
{
"epoch": 13.040557667934094,
"grad_norm": 2.039463758468628,
"learning_rate": 9.33207854630317e-07,
"loss": 0.063,
"step": 1930
},
{
"epoch": 13.074355724545839,
"grad_norm": 2.5090322494506836,
"learning_rate": 9.000954685082286e-07,
"loss": 0.0839,
"step": 1935
},
{
"epoch": 13.108153781157583,
"grad_norm": 1.2274693250656128,
"learning_rate": 8.675534974497435e-07,
"loss": 0.0393,
"step": 1940
},
{
"epoch": 13.141951837769328,
"grad_norm": 1.7376823425292969,
"learning_rate": 8.355839813037936e-07,
"loss": 0.0899,
"step": 1945
},
{
"epoch": 13.175749894381074,
"grad_norm": 1.657383918762207,
"learning_rate": 8.041889240357493e-07,
"loss": 0.0883,
"step": 1950
},
{
"epoch": 13.209547950992818,
"grad_norm": 2.177140235900879,
"learning_rate": 7.733702936018162e-07,
"loss": 0.0703,
"step": 1955
},
{
"epoch": 13.243346007604563,
"grad_norm": 1.9505295753479004,
"learning_rate": 7.431300218256754e-07,
"loss": 0.0734,
"step": 1960
},
{
"epoch": 13.277144064216307,
"grad_norm": 1.5833914279937744,
"learning_rate": 7.13470004277379e-07,
"loss": 0.0464,
"step": 1965
},
{
"epoch": 13.310942120828052,
"grad_norm": 2.343639850616455,
"learning_rate": 6.843921001545429e-07,
"loss": 0.0652,
"step": 1970
},
{
"epoch": 13.344740177439798,
"grad_norm": 2.3044393062591553,
"learning_rate": 6.558981321658009e-07,
"loss": 0.1476,
"step": 1975
},
{
"epoch": 13.378538234051542,
"grad_norm": 1.9348516464233398,
"learning_rate": 6.279898864165423e-07,
"loss": 0.0909,
"step": 1980
},
{
"epoch": 13.412336290663287,
"grad_norm": 2.0757150650024414,
"learning_rate": 6.006691122969644e-07,
"loss": 0.1158,
"step": 1985
},
{
"epoch": 13.446134347275033,
"grad_norm": 1.7569150924682617,
"learning_rate": 5.739375223724108e-07,
"loss": 0.0857,
"step": 1990
},
{
"epoch": 13.479932403886776,
"grad_norm": 2.729357957839966,
"learning_rate": 5.477967922760141e-07,
"loss": 0.1197,
"step": 1995
},
{
"epoch": 13.513730460498522,
"grad_norm": 1.9830466508865356,
"learning_rate": 5.222485606036709e-07,
"loss": 0.0667,
"step": 2000
},
{
"epoch": 13.547528517110266,
"grad_norm": 2.4005234241485596,
"learning_rate": 4.972944288113268e-07,
"loss": 0.1217,
"step": 2005
},
{
"epoch": 13.581326573722011,
"grad_norm": 2.361483335494995,
"learning_rate": 4.729359611145845e-07,
"loss": 0.11,
"step": 2010
},
{
"epoch": 13.615124630333757,
"grad_norm": 1.8319944143295288,
"learning_rate": 4.49174684390663e-07,
"loss": 0.0716,
"step": 2015
},
{
"epoch": 13.6489226869455,
"grad_norm": 1.7428772449493408,
"learning_rate": 4.260120880826768e-07,
"loss": 0.1552,
"step": 2020
},
{
"epoch": 13.682720743557246,
"grad_norm": 2.0641117095947266,
"learning_rate": 4.034496241062824e-07,
"loss": 0.1185,
"step": 2025
},
{
"epoch": 13.71651880016899,
"grad_norm": 2.4367544651031494,
"learning_rate": 3.8148870675866145e-07,
"loss": 0.1445,
"step": 2030
},
{
"epoch": 13.750316856780735,
"grad_norm": 1.3618732690811157,
"learning_rate": 3.601307126298648e-07,
"loss": 0.0579,
"step": 2035
},
{
"epoch": 13.78411491339248,
"grad_norm": 1.515920877456665,
"learning_rate": 3.3937698051653034e-07,
"loss": 0.0543,
"step": 2040
},
{
"epoch": 13.817912970004224,
"grad_norm": 2.7224481105804443,
"learning_rate": 3.1922881133795827e-07,
"loss": 0.0955,
"step": 2045
},
{
"epoch": 13.85171102661597,
"grad_norm": 2.9664154052734375,
"learning_rate": 2.996874680545603e-07,
"loss": 0.1153,
"step": 2050
},
{
"epoch": 13.885509083227713,
"grad_norm": 2.224506139755249,
"learning_rate": 2.8075417558870333e-07,
"loss": 0.1311,
"step": 2055
},
{
"epoch": 13.919307139839459,
"grad_norm": 1.8828747272491455,
"learning_rate": 2.624301207479185e-07,
"loss": 0.1198,
"step": 2060
},
{
"epoch": 13.953105196451205,
"grad_norm": 2.1472909450531006,
"learning_rate": 2.447164521505074e-07,
"loss": 0.0764,
"step": 2065
},
{
"epoch": 13.986903253062948,
"grad_norm": 2.1299331188201904,
"learning_rate": 2.276142801535486e-07,
"loss": 0.1228,
"step": 2070
},
{
"epoch": 14.020278833967048,
"grad_norm": 2.9883980751037598,
"learning_rate": 2.1112467678329197e-07,
"loss": 0.1373,
"step": 2075
},
{
"epoch": 14.054076890578791,
"grad_norm": 2.33616304397583,
"learning_rate": 1.9524867566795945e-07,
"loss": 0.0636,
"step": 2080
},
{
"epoch": 14.087874947190537,
"grad_norm": 1.3360828161239624,
"learning_rate": 1.7998727197295785e-07,
"loss": 0.0624,
"step": 2085
},
{
"epoch": 14.12167300380228,
"grad_norm": 1.6282477378845215,
"learning_rate": 1.6534142233849527e-07,
"loss": 0.0585,
"step": 2090
},
{
"epoch": 14.155471060414026,
"grad_norm": 1.8694970607757568,
"learning_rate": 1.5131204481961592e-07,
"loss": 0.0747,
"step": 2095
},
{
"epoch": 14.189269117025772,
"grad_norm": 1.441935420036316,
"learning_rate": 1.3790001882865056e-07,
"loss": 0.0718,
"step": 2100
},
{
"epoch": 14.223067173637515,
"grad_norm": 2.2334372997283936,
"learning_rate": 1.251061850800961e-07,
"loss": 0.0947,
"step": 2105
},
{
"epoch": 14.256865230249261,
"grad_norm": 2.6375110149383545,
"learning_rate": 1.1293134553791551e-07,
"loss": 0.1348,
"step": 2110
},
{
"epoch": 14.290663286861005,
"grad_norm": 2.5394539833068848,
"learning_rate": 1.0137626336526596e-07,
"loss": 0.1289,
"step": 2115
},
{
"epoch": 14.32446134347275,
"grad_norm": 1.4307893514633179,
"learning_rate": 9.044166287666134e-08,
"loss": 0.0499,
"step": 2120
},
{
"epoch": 14.358259400084496,
"grad_norm": 1.6300568580627441,
"learning_rate": 8.012822949256981e-08,
"loss": 0.1074,
"step": 2125
},
{
"epoch": 14.39205745669624,
"grad_norm": 1.95559823513031,
"learning_rate": 7.043660969645261e-08,
"loss": 0.0932,
"step": 2130
},
{
"epoch": 14.425855513307985,
"grad_norm": 2.011991024017334,
"learning_rate": 6.136741099423416e-08,
"loss": 0.1045,
"step": 2135
},
{
"epoch": 14.45965356991973,
"grad_norm": 2.210416078567505,
"learning_rate": 5.2921201876223737e-08,
"loss": 0.1052,
"step": 2140
},
{
"epoch": 14.493451626531474,
"grad_norm": 2.3784687519073486,
"learning_rate": 4.5098511781485056e-08,
"loss": 0.0797,
"step": 2145
},
{
"epoch": 14.52724968314322,
"grad_norm": 2.040710687637329,
"learning_rate": 3.789983106464057e-08,
"loss": 0.121,
"step": 2150
},
{
"epoch": 14.561047739754963,
"grad_norm": 1.7879664897918701,
"learning_rate": 3.132561096514164e-08,
"loss": 0.0773,
"step": 2155
},
{
"epoch": 14.594845796366709,
"grad_norm": 1.8894425630569458,
"learning_rate": 2.5376263578977823e-08,
"loss": 0.0965,
"step": 2160
},
{
"epoch": 14.628643852978454,
"grad_norm": 1.9897544384002686,
"learning_rate": 2.0052161832850858e-08,
"loss": 0.078,
"step": 2165
},
{
"epoch": 14.662441909590198,
"grad_norm": 1.689963698387146,
"learning_rate": 1.5353639460793378e-08,
"loss": 0.1024,
"step": 2170
},
{
"epoch": 14.696239966201944,
"grad_norm": 1.8147411346435547,
"learning_rate": 1.1280990983248975e-08,
"loss": 0.1124,
"step": 2175
},
{
"epoch": 14.730038022813687,
"grad_norm": 2.499868392944336,
"learning_rate": 7.834471688616952e-09,
"loss": 0.1116,
"step": 2180
},
{
"epoch": 14.763836079425433,
"grad_norm": 1.9060348272323608,
"learning_rate": 5.014297617242925e-09,
"loss": 0.0975,
"step": 2185
},
{
"epoch": 14.797634136037178,
"grad_norm": 2.312739849090576,
"learning_rate": 2.8206455478774206e-09,
"loss": 0.101,
"step": 2190
},
{
"epoch": 14.831432192648922,
"grad_norm": 1.106972098350525,
"learning_rate": 1.2536529866014058e-09,
"loss": 0.0569,
"step": 2195
},
{
"epoch": 14.865230249260668,
"grad_norm": 1.9655669927597046,
"learning_rate": 3.1341815819763146e-10,
"loss": 0.1144,
"step": 2200
},
{
"epoch": 14.899028305872413,
"grad_norm": 2.5896451473236084,
"learning_rate": 0.0,
"loss": 0.0896,
"step": 2205
},
{
"epoch": 14.899028305872413,
"step": 2205,
"total_flos": 3.8395126284519014e+17,
"train_loss": 0.4484830284334905,
"train_runtime": 10885.404,
"train_samples_per_second": 3.262,
"train_steps_per_second": 0.203
}
],
"logging_steps": 5,
"max_steps": 2205,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.8395126284519014e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}